// ceph/src/mds/Server.cc (quincy beta 17.1.0)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
17
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
21
22 #include "MDSRank.h"
23 #include "Server.h"
24 #include "Locker.h"
25 #include "MDCache.h"
26 #include "MDLog.h"
27 #include "Migrator.h"
28 #include "MDBalancer.h"
29 #include "InoTable.h"
30 #include "SnapClient.h"
31 #include "Mutation.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
34
35 #include "msg/Messenger.h"
36
37 #include "osdc/Objecter.h"
38
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
45
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
53
54 #include <errno.h>
55
56 #include <list>
57 #include <regex>
58 #include <string_view>
59 #include <functional>
60
61 #include "common/config.h"
62
63 #define dout_context g_ceph_context
64 #define dout_subsys ceph_subsys_mds
65 #undef dout_prefix
66 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
67
68 using namespace std;
69
70 class ServerContext : public MDSContext {
71 protected:
72 Server *server;
73 MDSRank *get_mds() override
74 {
75 return server->mds;
76 }
77
78 public:
79 explicit ServerContext(Server *s) : server(s) {
80 ceph_assert(server != NULL);
81 }
82 };
83
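// Batches concurrent lookup/getattr requests that target the same dentry or
// inode: the head request does the work while queued followers are later
// replied to (or forwarded) together with it.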
84 class Batch_Getattr_Lookup : public BatchOp {
85 protected:
86 Server* server;
87 ceph::ref_t<MDRequestImpl> mdr;
88 std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
89 int res = 0;
90 public:
91 Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
92 : server(s), mdr(r) {
93 if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP)
94 mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
95 else
96 mdr->batch_op_map = &mdr->in[0]->batch_ops;
97 }
98 void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
99 batch_reqs.push_back(r);
100 }
101 ceph::ref_t<MDRequestImpl> find_new_head() override {
102 while (!batch_reqs.empty()) {
103 auto r = std::move(batch_reqs.back());
104 batch_reqs.pop_back();
105 if (r->killed)
106 continue;
107
108 r->batch_op_map = mdr->batch_op_map;
109 mdr->batch_op_map = nullptr;
110 mdr = r;
111 return mdr;
112 }
113 return nullptr;
114 }
115 void _forward(mds_rank_t t) override {
116 MDCache* mdcache = server->mdcache;
117 mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
118 mdr->set_mds_stamp(ceph_clock_now());
119 for (auto& m : batch_reqs) {
120 if (!m->killed)
121 mdcache->request_forward(m, t);
122 }
123 batch_reqs.clear();
124 }
125 void _respond(int r) override {
126 mdr->set_mds_stamp(ceph_clock_now());
127 for (auto& m : batch_reqs) {
128 if (!m->killed) {
129 m->tracei = mdr->tracei;
130 m->tracedn = mdr->tracedn;
131 server->respond_to_request(m, r);
132 }
133 }
134 batch_reqs.clear();
135 server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
136 }
137 void print(std::ostream& o) {
138 o << "[batch front=" << *mdr << "]";
139 }
140 };
141
142 class ServerLogContext : public MDSLogContextBase {
143 protected:
144 Server *server;
145 MDSRank *get_mds() override
146 {
147 return server->mds;
148 }
149
150 MDRequestRef mdr;
151 void pre_finish(int r) override {
152 if (mdr)
153 mdr->mark_event("journal_committed: ");
154 }
155 public:
156 explicit ServerLogContext(Server *s) : server(s) {
157 ceph_assert(server != NULL);
158 }
159 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
160 ceph_assert(server != NULL);
161 }
162 };
163
164 void Server::create_logger()
165 {
166 PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
167
168 plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
169 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
170 plb.add_u64_counter(l_mdss_handle_peer_request, "handle_peer_request",
171 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
172 plb.add_u64_counter(l_mdss_handle_client_session,
173 "handle_client_session", "Client session messages", "hcs",
174 PerfCountersBuilder::PRIO_INTERESTING);
175 plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
176 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
177 plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
178 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
179 PerfCountersBuilder::PRIO_INTERESTING);
180
181 // fop latencies are useful
182 plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
183 plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
184 "Request type lookup hash of inode latency");
185 plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
186 "Request type lookup inode latency");
187 plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
188 "Request type lookup parent latency");
189 plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
190 "Request type lookup name latency");
191 plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
192 "Request type lookup latency");
193 plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
194 "Request type lookup snapshot latency");
195 plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
196 "Request type get attribute latency");
197 plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
198 "Request type set attribute latency");
199 plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
200 "Request type set file layout latency");
201 plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
202 "Request type set directory layout latency");
203 plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
204 "Request type set extended attribute latency");
205 plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
206 "Request type remove extended attribute latency");
207 plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
208 "Request type read directory latency");
209 plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
210 "Request type set file lock latency");
211 plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
212 "Request type get file lock latency");
213 plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
214 "Request type create latency");
215 plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
216 "Request type open latency");
217 plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
218 "Request type make node latency");
219 plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
220 "Request type link latency");
221 plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
222 "Request type unlink latency");
223 plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
224 "Request type remove directory latency");
225 plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
226 "Request type rename latency");
227 plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
228 "Request type make directory latency");
229 plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
230 "Request type symbolic link latency");
231 plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
232 "Request type list snapshot latency");
233 plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
234 "Request type make snapshot latency");
235 plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
236 "Request type remove snapshot latency");
237 plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
238 "Request type rename snapshot latency");
239
240 plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
241 plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
242 "Client requests dispatched");
243 plb.add_u64_counter(l_mdss_dispatch_peer_request, "dispatch_server_request",
244 "Server requests dispatched");
245
246 logger = plb.create_perf_counters();
247 g_ceph_context->get_perfcounters_collection()->add(logger);
248 }
249
250 Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
251 mds(m),
252 mdcache(mds->mdcache), mdlog(mds->mdlog),
253 recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
254 metrics_handler(metrics_handler)
255 {
256 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
257 replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
258 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
259 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
260 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
261 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
262 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
263 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
264 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
265 dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
266 bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
267 supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
268 }
269
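// Entry point for client and peer messages: reconnects are handled first;
// while the MDS is not active, replayed or already-completed requests are
// queued for clientreplay and everything else waits for the active state.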
270 void Server::dispatch(const cref_t<Message> &m)
271 {
272 switch (m->get_type()) {
273 case CEPH_MSG_CLIENT_RECONNECT:
274 handle_client_reconnect(ref_cast<MClientReconnect>(m));
275 return;
276 }
277
278 /*
279 * In the reconnect phase, clients may have sent unsafe requests to the mds before the reconnect msg. Setting sessionclosed_isok handles scenarios like this:
280 
281 1. In the reconnect phase, a client sent unsafe requests to the mds.
282 2. The reconnect timeout was reached. All sessions that did not send a reconnect msg in time, some of which may have sent unsafe requests, are marked as closed.
283 (Another situation is #31668, which denies all client reconnect msgs to speed up reboot.)
284 3. So these unsafe requests, from sessions that did not send a reconnect msg in time or were denied, can still be handled in the clientreplay phase.
285 
286 */
287 bool sessionclosed_isok = replay_unsafe_with_closed_session;
288 // active?
289 // handle_peer_request()/handle_client_session() will wait if necessary
290 if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
291 const auto &req = ref_cast<MClientRequest>(m);
292 if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
293 Session *session = mds->get_session(req);
294 if (!session || (!session->is_open() && !sessionclosed_isok)) {
295 dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
296 return;
297 }
298 bool queue_replay = false;
299 if (req->is_replay() || req->is_async()) {
300 dout(3) << "queuing replayed op" << dendl;
301 queue_replay = true;
302 if (req->head.ino &&
303 !session->have_completed_request(req->get_reqid().tid, nullptr)) {
304 inodeno_t ino(req->head.ino);
305 mdcache->add_replay_ino_alloc(ino);
306 if (replay_unsafe_with_closed_session &&
307 session->free_prealloc_inos.contains(ino)) {
308 // don't purge inodes that will be created by later replay
309 session->free_prealloc_inos.erase(ino);
310 session->delegated_inos.insert(ino);
311 }
312 }
313 } else if (req->get_retry_attempt()) {
314 // process completed requests in the clientreplay stage. The completed request
315 // might have created a new file/directory. This guarantees the MDS sends a reply
316 // to the client before another request modifies the new file/directory.
317 if (session->have_completed_request(req->get_reqid().tid, NULL)) {
318 dout(3) << "queuing completed op" << dendl;
319 queue_replay = true;
320 }
321 // this request was created before the cap reconnect message, drop any embedded
322 // cap releases.
323 req->releases.clear();
324 }
325 if (queue_replay) {
326 req->mark_queued_for_replay();
327 mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
328 return;
329 }
330 }
331
332 bool wait_for_active = true;
333 if (mds->is_stopping()) {
334 wait_for_active = false;
335 } else if (mds->is_clientreplay()) {
336 if (req->is_queued_for_replay()) {
337 wait_for_active = false;
338 }
339 }
340 if (wait_for_active) {
341 dout(3) << "not active yet, waiting" << dendl;
342 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
343 return;
344 }
345 }
346
347 switch (m->get_type()) {
348 case CEPH_MSG_CLIENT_SESSION:
349 handle_client_session(ref_cast<MClientSession>(m));
350 return;
351 case CEPH_MSG_CLIENT_REQUEST:
352 handle_client_request(ref_cast<MClientRequest>(m));
353 return;
354 case CEPH_MSG_CLIENT_RECLAIM:
355 handle_client_reclaim(ref_cast<MClientReclaim>(m));
356 return;
357 case MSG_MDS_PEER_REQUEST:
358 handle_peer_request(ref_cast<MMDSPeerRequest>(m));
359 return;
360 default:
361 derr << "server unknown message " << m->get_type() << dendl;
362 ceph_abort_msg("server unknown message");
363 }
364 }
365
366
367
368 // ----------------------------------------------------------
369 // SESSION management
370
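// Journal completion for session state changes: once the ESession entry is
// safe, _session_logged() applies the open/close plus any ino release/purge,
// and the optional fin context is completed.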
371 class C_MDS_session_finish : public ServerLogContext {
372 Session *session;
373 uint64_t state_seq;
374 bool open;
375 version_t cmapv;
376 interval_set<inodeno_t> inos_to_free;
377 version_t inotablev;
378 interval_set<inodeno_t> inos_to_purge;
379 LogSegment *ls = nullptr;
380 Context *fin;
381 public:
382 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = nullptr) :
383 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
384 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
385 const interval_set<inodeno_t>& to_free, version_t iv,
386 const interval_set<inodeno_t>& to_purge, LogSegment *_ls, Context *fin_ = nullptr) :
387 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv),
388 inos_to_free(to_free), inotablev(iv), inos_to_purge(to_purge), ls(_ls), fin(fin_) {}
389 void finish(int r) override {
390 ceph_assert(r == 0);
391 server->_session_logged(session, state_seq, open, cmapv, inos_to_free, inotablev, inos_to_purge, ls);
392 if (fin) {
393 fin->complete(r);
394 }
395 }
396 };
397
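// Look up a session by its client-supplied "uuid" metadata; used by session
// reclaim and to reject duplicate uuids at open time.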
398 Session* Server::find_session_by_uuid(std::string_view uuid)
399 {
400 Session* session = nullptr;
401 for (auto& it : mds->sessionmap.get_sessions()) {
402 auto& metadata = it.second->info.client_metadata;
403
404 auto p = metadata.find("uuid");
405 if (p == metadata.end() || p->second != uuid)
406 continue;
407
408 if (!session) {
409 session = it.second;
410 } else if (!session->reclaiming_from) {
411 ceph_assert(it.second->reclaiming_from == session);
412 session = it.second;
413 } else {
414 ceph_assert(session->reclaiming_from == it.second);
415 }
416 }
417 return session;
418 }
419
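// Handle MClientReclaim from a client: only CEPH_RECLAIM_RESET is supported;
// the target session is located by uuid, checked against the caller's
// auth_name, and then evicted/killed before the reply is sent.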
420 void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
421 {
422 if (!session->is_open() && !session->is_stale()) {
423 dout(10) << "session not open, dropping this req" << dendl;
424 return;
425 }
426
427 auto reply = make_message<MClientReclaimReply>(0);
428 if (m->get_uuid().empty()) {
429 dout(10) << __func__ << " invalid message (no uuid)" << dendl;
430 reply->set_result(-CEPHFS_EINVAL);
431 mds->send_message_client(reply, session);
432 return;
433 }
434
435 unsigned flags = m->get_flags();
436 if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
437 dout(10) << __func__ << " unsupported flags" << dendl;
438 reply->set_result(-CEPHFS_EOPNOTSUPP);
439 mds->send_message_client(reply, session);
440 return;
441 }
442
443 Session* target = find_session_by_uuid(m->get_uuid());
444 if (target) {
445 if (session->info.auth_name != target->info.auth_name) {
446 dout(10) << __func__ << " session auth_name " << session->info.auth_name
447 << " != target auth_name " << target->info.auth_name << dendl;
448 reply->set_result(-CEPHFS_EPERM);
449 mds->send_message_client(reply, session);
450 }
451
452 ceph_assert(!target->reclaiming_from);
453 ceph_assert(!session->reclaiming_from);
454 session->reclaiming_from = target;
455 reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
456 }
457
458 if (flags & CEPH_RECLAIM_RESET) {
459 finish_reclaim_session(session, reply);
460 return;
461 }
462
463 ceph_abort();
464 }
465
466 void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
467 {
468 Session *target = session->reclaiming_from;
469 if (target) {
470 session->reclaiming_from = nullptr;
471
472 Context *send_reply;
473 if (reply) {
474 int64_t session_id = session->get_client().v;
475 send_reply = new LambdaContext([this, session_id, reply](int r) {
476 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
477 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
478 if (!session) {
479 return;
480 }
481 auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
482 reply->set_epoch(epoch);
483 mds->send_message_client(reply, session);
484 });
485 } else {
486 send_reply = nullptr;
487 }
488
489 bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
490 return map.is_blocklisted(target->info.inst.addr);
491 });
492
493 if (blocklisted || !g_conf()->mds_session_blocklist_on_evict) {
494 kill_session(target, send_reply);
495 } else {
496 CachedStackStringStream css;
497 mds->evict_client(target->get_client().v, false, true, *css, send_reply);
498 }
499 } else if (reply) {
500 mds->send_message_client(reply, session);
501 }
502 }
503
504 void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
505 {
506 Session *session = mds->get_session(m);
507 dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
508 ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
509
510 if (!session) {
511 dout(0) << " ignoring sessionless msg " << *m << dendl;
512 return;
513 }
514
515 std::string_view fs_name = mds->mdsmap->get_fs_name();
516 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
517 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
518 return;
519 }
520
521 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
522 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
523 return;
524 }
525
526 if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
527 finish_reclaim_session(session);
528 } else {
529 reclaim_session(session, m);
530 }
531 }
532
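// Handle client session control messages (open, renewcaps, close, flushmsg
// ack, flush mdlog). Opens are validated against the blocklist, required
// features, claimed root and uuid before an ESession entry is journaled.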
533 void Server::handle_client_session(const cref_t<MClientSession> &m)
534 {
535 version_t pv;
536 Session *session = mds->get_session(m);
537
538 dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
539 ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
540
541 if (!session) {
542 dout(0) << " ignoring sessionless msg " << *m << dendl;
543 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
544 reply->metadata["error_string"] = "sessionless";
545 mds->send_message(reply, m->get_connection());
546 return;
547 }
548
549 std::string_view fs_name = mds->mdsmap->get_fs_name();
550 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
551 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
552 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
553 reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
554 std::string(fs_name) + "\"";
555 mds->send_message(std::move(reply), m->get_connection());
556 return;
557 }
558
559 if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
560 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
561 } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
562 // close requests need to be handled when mds is active
563 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
564 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
565 return;
566 }
567 } else {
568 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
569 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
570 return;
571 }
572 }
573
574 if (logger)
575 logger->inc(l_mdss_handle_client_session);
576
577 uint64_t sseq = 0;
578 switch (m->get_op()) {
579 case CEPH_SESSION_REQUEST_OPEN:
580 if (session->is_opening() ||
581 session->is_open() ||
582 session->is_stale() ||
583 session->is_killing() ||
584 terminating_sessions) {
585 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
586 return;
587 }
588 ceph_assert(session->is_closed() || session->is_closing());
589
590 if (mds->is_stopping()) {
591 dout(10) << "mds is stopping, dropping open req" << dendl;
592 return;
593 }
594
595 {
596 auto& addr = session->info.inst.addr;
597 session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
598 auto& client_metadata = session->info.client_metadata;
599
600 auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
601 auto now = ceph_clock_now();
602 auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
603 auto elapsed = now - m->get_recv_stamp();
604 CachedStackStringStream css;
605 *css << "New client session:"
606 << " addr=\"" << session->info.inst.addr << "\""
607 << ",elapsed=" << elapsed
608 << ",throttled=" << throttle_elapsed
609 << ",status=\"" << status << "\"";
610 if (!err.empty()) {
611 *css << ",error=\"" << err << "\"";
612 }
613 const auto& metadata = session->info.client_metadata;
614 if (auto it = metadata.find("root"); it != metadata.end()) {
615 *css << ",root=\"" << it->second << "\"";
616 }
617 dout(2) << css->strv() << dendl;
618 };
619
620 auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str, unsigned flags=0) {
621 auto m = make_message<MClientSession>(CEPH_SESSION_REJECT, 0, flags);
622 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
623 m->metadata["error_string"] = err_str;
624 mds->send_message_client(m, session);
625 log_session_status("REJECTED", err_str);
626 };
627
628 bool blocklisted = mds->objecter->with_osdmap(
629 [&addr](const OSDMap &osd_map) -> bool {
630 return osd_map.is_blocklisted(addr);
631 });
632
633 if (blocklisted) {
634 dout(10) << "rejecting blocklisted client " << addr << dendl;
635 // This goes on the wire and the "blacklisted" substring is
636 // depended upon by the kernel client for detecting whether it
637 // has been blocklisted. If mounted with recover_session=clean
638 // (since 5.4), it tries to automatically recover itself from
639 // blocklisting.
640 unsigned flags = 0;
641 flags |= MClientSession::SESSION_BLOCKLISTED;
642 send_reject_message("blocklisted (blacklisted)", flags);
643 session->clear();
644 break;
645 }
646
647 if (client_metadata.features.empty())
648 infer_supported_features(session, client_metadata);
649
650 dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
651 dout(20) << " features: '" << client_metadata.features << "'" << dendl;
652 dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
653 for (const auto& p : client_metadata) {
654 dout(20) << " " << p.first << ": " << p.second << dendl;
655 }
656
657 feature_bitset_t missing_features = required_client_features;
658 missing_features -= client_metadata.features;
659 if (!missing_features.empty()) {
660 CachedStackStringStream css;
661 *css << "missing required features '" << missing_features << "'";
662 send_reject_message(css->strv());
663 mds->clog->warn() << "client session (" << session->info.inst
664 << ") lacks required features " << missing_features
665 << "; client supports " << client_metadata.features;
666 session->clear();
667 break;
668 }
669
670 // Special case for the 'root' metadata path; validate that the claimed
671 // root is actually within the caps of the session
672 if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
673 auto claimed_root = it->second;
674 CachedStackStringStream css;
675 bool denied = false;
676 // claimed_root has a leading "/" which we strip before passing
677 // into caps check
678 if (claimed_root.empty() || claimed_root[0] != '/') {
679 denied = true;
680           *css << "invalid root '" << claimed_root << "'";
681 } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
682 denied = true;
683 *css << "non-allowable root '" << claimed_root << "'";
684 }
685
686 if (denied) {
687 // Tell the client we're rejecting their open
688 send_reject_message(css->strv());
689 mds->clog->warn() << "client session with " << css->strv()
690 << " denied (" << session->info.inst << ")";
691 session->clear();
692 break;
693 }
694 }
695
696 if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
697 if (find_session_by_uuid(it->second)) {
698 send_reject_message("duplicated session uuid");
699 mds->clog->warn() << "client session with duplicated session uuid '"
700 << it->second << "' denied (" << session->info.inst << ")";
701 session->clear();
702 break;
703 }
704 }
705
706 if (session->is_closed()) {
707 mds->sessionmap.add_session(session);
708 }
709
710 pv = mds->sessionmap.mark_projected(session);
711 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
712 mds->sessionmap.touch_session(session);
713 auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
714 ceph_assert(r == 0);
715 log_session_status("ACCEPTED", "");
716 });
717 mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
718 new C_MDS_session_finish(this, session, sseq, true, pv, fin));
719 mdlog->flush();
720 }
721 break;
722
723 case CEPH_SESSION_REQUEST_RENEWCAPS:
724 if (session->is_open() || session->is_stale()) {
725 mds->sessionmap.touch_session(session);
726 if (session->is_stale()) {
727 mds->sessionmap.set_state(session, Session::STATE_OPEN);
728 mds->locker->resume_stale_caps(session);
729 mds->sessionmap.touch_session(session);
730 }
731 auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
732 mds->send_message_client(reply, session);
733 } else {
734 dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
735 }
736 break;
737
738 case CEPH_SESSION_REQUEST_CLOSE:
739 {
740 if (session->is_closed() ||
741 session->is_closing() ||
742 session->is_killing()) {
743 dout(10) << "already closed|closing|killing, dropping this req" << dendl;
744 return;
745 }
746 if (session->is_importing()) {
747 dout(10) << "ignoring close req on importing session" << dendl;
748 return;
749 }
750 ceph_assert(session->is_open() ||
751 session->is_stale() ||
752 session->is_opening());
753 if (m->get_seq() < session->get_push_seq()) {
754 dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
755 << ", dropping" << dendl;
756 return;
757 }
758     // We are getting a seq that is higher than expected.
759     // Handle it the same as any other seq error.
760     //
761 if (m->get_seq() != session->get_push_seq()) {
762 dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
763 << ", BUGGY!" << dendl;
764 mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
765 << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
766 return;
767 }
768 journal_close_session(session, Session::STATE_CLOSING, NULL);
769 }
770 break;
771
772 case CEPH_SESSION_FLUSHMSG_ACK:
773 finish_flush_session(session, m->get_seq());
774 break;
775
776 case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
777 if (mds->is_active())
778 mdlog->flush();
779 break;
780
781 default:
782 ceph_abort();
783 }
784 }
785
786 void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
787 if (!session->is_open() ||
788 !session->get_connection() ||
789 !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
790 return;
791 }
792
793 version_t seq = session->wait_for_flush(gather.new_sub());
794 mds->send_message_client(
795 make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
796 }
797
798 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
799 {
800 for (const auto& client : client_set) {
801 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
802 ceph_assert(session);
803 flush_session(session, gather);
804 }
805 }
806
807 void Server::finish_flush_session(Session *session, version_t seq)
808 {
809 MDSContext::vec finished;
810 session->finish_flush(seq, finished);
811 mds->queue_waiters(finished);
812 }
813
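// Runs after the ESession entry is safely journaled: applies the session open
// or close, releasing/purging preallocated inos and tearing down caps, leases
// and the connection for closing/killing sessions.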
814 void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
815 const interval_set<inodeno_t>& inos_to_free, version_t piv,
816 const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
817 {
818 dout(10) << "_session_logged " << session->info.inst
819 << " state_seq " << state_seq
820 << " " << (open ? "open":"close") << " " << pv
821 << " inos_to_free " << inos_to_free << " inotablev " << piv
822 << " inos_to_purge " << inos_to_purge << dendl;
823
824 if (!open) {
825 if (inos_to_purge.size()){
826 ceph_assert(ls);
827 session->info.prealloc_inos.subtract(inos_to_purge);
828 ls->purging_inodes.insert(inos_to_purge);
829 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())
830 mdcache->purge_inodes(inos_to_purge, ls);
831 }
832
833 if (inos_to_free.size()) {
834 ceph_assert(piv);
835 ceph_assert(session->is_closing() || session->is_killing() ||
836 session->is_opening()); // re-open closing session
837 session->info.prealloc_inos.subtract(inos_to_free);
838 mds->inotable->apply_release_ids(inos_to_free);
839 ceph_assert(mds->inotable->get_version() == piv);
840 }
841 session->free_prealloc_inos = session->info.prealloc_inos;
842 session->delegated_inos.clear();
843 }
844
845 mds->sessionmap.mark_dirty(session);
846
847 // apply
848 if (session->get_state_seq() != state_seq) {
849 dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
850 << ", noop" << dendl;
851 // close must have been canceled (by an import?), or any number of other things..
852 } else if (open) {
853 ceph_assert(session->is_opening());
854 mds->sessionmap.set_state(session, Session::STATE_OPEN);
855 mds->sessionmap.touch_session(session);
856 metrics_handler->add_session(session);
857 ceph_assert(session->get_connection());
858 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
859 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
860 reply->supported_features = supported_features;
861 mds->send_message_client(reply, session);
862 if (mdcache->is_readonly()) {
863 auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
864 mds->send_message_client(m, session);
865 }
866 } else if (session->is_closing() ||
867 session->is_killing()) {
868 // kill any lingering capabilities, leases, requests
869 bool killing = session->is_killing();
870 while (!session->caps.empty()) {
871 Capability *cap = session->caps.front();
872 CInode *in = cap->get_inode();
873 dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
874 mds->locker->remove_client_cap(in, cap, killing);
875 }
876 while (!session->leases.empty()) {
877 ClientLease *r = session->leases.front();
878 CDentry *dn = static_cast<CDentry*>(r->parent);
879 dout(20) << " killing client lease of " << *dn << dendl;
880 dn->remove_client_lease(r, mds->locker);
881 }
882 if (client_reconnect_gather.erase(session->info.get_client())) {
883 dout(20) << " removing client from reconnect set" << dendl;
884 if (client_reconnect_gather.empty()) {
885 dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
886 reconnect_gather_finish();
887 }
888 }
889 if (client_reclaim_gather.erase(session->info.get_client())) {
890 dout(20) << " removing client from reclaim set" << dendl;
891 if (client_reclaim_gather.empty()) {
892 dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
893 mds->maybe_clientreplay_done();
894 }
895 }
896
897 if (session->is_closing()) {
898 // mark con disposable. if there is a fault, we will get a
899 // reset and clean it up. if the client hasn't received the
900 // CLOSE message yet, they will reconnect and get an
901 // ms_handle_remote_reset() and realize they had in fact closed.
902 // do this *before* sending the message to avoid a possible
903 // race.
904 if (session->get_connection()) {
905       // Conditional because terminate_sessions will indiscriminately
906 // put sessions in CLOSING whether they ever had a conn or not.
907 session->get_connection()->mark_disposable();
908 }
909
910 // reset session
911 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
912 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
913 session->clear();
914 metrics_handler->remove_session(session);
915 mds->sessionmap.remove_session(session);
916 } else if (session->is_killing()) {
917 // destroy session, close connection
918 if (session->get_connection()) {
919 session->get_connection()->mark_down();
920 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
921 session->set_connection(nullptr);
922 }
923 metrics_handler->remove_session(session);
924 mds->sessionmap.remove_session(session);
925 } else {
926 ceph_abort();
927 }
928 } else {
929 ceph_abort();
930 }
931 }
932
933 /**
934 * Inject sessions from some source other than actual connections.
935 *
936 * For example:
937 * - sessions inferred from journal replay
938 * - sessions learned from other MDSs during rejoin
939 * - sessions learned from other MDSs during dir/caps migration
940 * - sessions learned from other MDSs during a cross-MDS rename
941 */
942 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
943 map<client_t,client_metadata_t>& cmm,
944 map<client_t, pair<Session*,uint64_t> >& smap)
945 {
946 version_t pv = mds->sessionmap.get_projected();
947
948 dout(10) << "prepare_force_open_sessions " << pv
949 << " on " << cm.size() << " clients"
950 << dendl;
951
952 mds->objecter->with_osdmap(
953 [this, &cm, &cmm](const OSDMap &osd_map) {
954 for (auto p = cm.begin(); p != cm.end(); ) {
955 if (osd_map.is_blocklisted(p->second.addr)) {
956 dout(10) << " ignoring blocklisted client." << p->first
957 << " (" << p->second.addr << ")" << dendl;
958 cmm.erase(p->first);
959 cm.erase(p++);
960 } else {
961 ++p;
962 }
963 }
964 });
965
966 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
967 Session *session = mds->sessionmap.get_or_add_session(p->second);
968 pv = mds->sessionmap.mark_projected(session);
969 uint64_t sseq;
970 if (session->is_closed() ||
971 session->is_closing() ||
972 session->is_killing()) {
973 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
974 auto q = cmm.find(p->first);
975 if (q != cmm.end())
976 session->info.client_metadata.merge(q->second);
977 } else {
978 ceph_assert(session->is_open() ||
979 session->is_opening() ||
980 session->is_stale());
981 sseq = 0;
982 }
983 smap[p->first] = make_pair(session, sseq);
984 session->inc_importing();
985 }
986 return pv;
987 }
988
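// Second half of force-opening sessions: each session prepared above that is
// still in OPENING (matching sseq) is moved to OPEN and the client notified;
// optionally drops the importing refcount taken in prepare_force_open_sessions().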
989 void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
990 bool dec_import)
991 {
992 /*
993 * FIXME: need to carefully consider the race conditions between a
994 * client trying to close a session and an MDS doing an import
995 * trying to force open a session...
996 */
997 dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
998 << " initial v " << mds->sessionmap.get_version() << dendl;
999
1000 for (auto &it : smap) {
1001 Session *session = it.second.first;
1002 uint64_t sseq = it.second.second;
1003 if (sseq > 0) {
1004 if (session->get_state_seq() != sseq) {
1005 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
1006 } else {
1007 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
1008 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1009 mds->sessionmap.touch_session(session);
1010 metrics_handler->add_session(session);
1011
1012 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1013 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
1014 reply->supported_features = supported_features;
1015 mds->send_message_client(reply, session);
1016
1017 if (mdcache->is_readonly())
1018 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1019 }
1020 } else {
1021 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
1022 ceph_assert(session->is_open() || session->is_stale());
1023 }
1024
1025 if (dec_import) {
1026 session->dec_importing();
1027 }
1028
1029 mds->sessionmap.mark_dirty(session);
1030 }
1031
1032 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
1033 }
1034
1035 class C_MDS_TerminatedSessions : public ServerContext {
1036 void finish(int r) override {
1037 server->terminating_sessions = false;
1038 }
1039 public:
1040 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
1041 };
1042
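// Journal a close for every client session that is not already closing,
// killing, or closed; C_MDS_TerminatedSessions clears terminating_sessions
// once the log is safe.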
1043 void Server::terminate_sessions()
1044 {
1045 dout(5) << "terminating all sessions..." << dendl;
1046
1047 terminating_sessions = true;
1048
1049 // kill them off. clients will retry etc.
1050 set<Session*> sessions;
1051 mds->sessionmap.get_client_session_set(sessions);
1052 for (set<Session*>::const_iterator p = sessions.begin();
1053 p != sessions.end();
1054 ++p) {
1055 Session *session = *p;
1056 if (session->is_closing() ||
1057 session->is_killing() ||
1058 session->is_closed())
1059 continue;
1060 journal_close_session(session, Session::STATE_CLOSING, NULL);
1061 }
1062
1063 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
1064 }
1065
1066
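// Periodic scan for unresponsive clients: open sessions that have not renewed
// caps within the session timeout are marked stale (or queued for eviction once
// the autoclose interval passes), honoring any per-session "timeout" metadata;
// stale sessions past autoclose are then evicted.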
1067 void Server::find_idle_sessions()
1068 {
1069 auto now = clock::now();
1070 auto last_cleared_laggy = mds->last_cleared_laggy();
1071
1072 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
1073
1074 // timeout/stale
1075 // (caps go stale, lease die)
1076 double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
1077 double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
1078
1079 // don't kick clients if we've been laggy
1080 if (last_cleared_laggy < cutoff) {
1081 dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
1082 << "), not marking any client stale" << dendl;
1083 return;
1084 }
1085
1086 std::vector<Session*> to_evict;
1087
1088 bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
1089 const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
1090 if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
1091 std::vector<Session*> new_stale;
1092
1093 for (auto session : *(sessions_p1->second)) {
1094 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1095 if (last_cap_renew_span < cutoff) {
1096 dout(20) << "laggiest active session is " << session->info.inst
1097 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1098 break;
1099 }
1100
1101 if (session->last_seen > session->last_cap_renew) {
1102 last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
1103 if (last_cap_renew_span < cutoff) {
1104 dout(20) << "laggiest active session is " << session->info.inst
1105 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1106 continue;
1107 }
1108 }
1109
1110 if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
1111 dout(20) << "evicting session " << session->info.inst << " since autoclose "
1112 "has arrived" << dendl;
1113 // evict session without marking it stale
1114 to_evict.push_back(session);
1115 continue;
1116 }
1117
1118 if (defer_session_stale &&
1119 !session->is_any_flush_waiter() &&
1120 !mds->locker->is_revoking_any_caps_from(session->get_client())) {
1121 dout(20) << "deferring marking session " << session->info.inst << " stale "
1122 "since it holds no caps" << dendl;
1123 continue;
1124 }
1125
1126 auto it = session->info.client_metadata.find("timeout");
1127 if (it != session->info.client_metadata.end()) {
1128 unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
1129 if (timeout == 0) {
1130 dout(10) << "skipping session " << session->info.inst
1131 << ", infinite timeout specified" << dendl;
1132 continue;
1133 }
1134 double cutoff = queue_max_age + timeout;
1135 if (last_cap_renew_span < cutoff) {
1136 dout(10) << "skipping session " << session->info.inst
1137 << ", timeout (" << timeout << ") specified"
1138 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1139 continue;
1140 }
1141
1142 // do not go through stale, evict it directly.
1143 to_evict.push_back(session);
1144 } else {
1145 dout(10) << "new stale session " << session->info.inst
1146 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1147 new_stale.push_back(session);
1148 }
1149 }
1150
1151 for (auto session : new_stale) {
1152 mds->sessionmap.set_state(session, Session::STATE_STALE);
1153 if (mds->locker->revoke_stale_caps(session)) {
1154 mds->locker->remove_stale_leases(session);
1155 finish_flush_session(session, session->get_push_seq());
1156 auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
1157 mds->send_message_client(m, session);
1158 } else {
1159 to_evict.push_back(session);
1160 }
1161 }
1162 }
1163
1164 // autoclose
1165 cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
1166
1167 // Collect a list of sessions exceeding the autoclose threshold
1168 const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
1169 if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
1170 for (auto session : *(sessions_p2->second)) {
1171 ceph_assert(session->is_stale());
1172 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1173 if (last_cap_renew_span < cutoff) {
1174 dout(20) << "oldest stale session is " << session->info.inst
1175 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
1176 break;
1177 }
1178 to_evict.push_back(session);
1179 }
1180 }
1181
1182 for (auto session: to_evict) {
1183 if (session->is_importing()) {
1184 dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
1185 continue;
1186 }
1187
1188 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1189 mds->clog->warn() << "evicting unresponsive client " << *session
1190 << ", after " << last_cap_renew_span << " seconds";
1191 dout(10) << "autoclosing stale session " << session->info.inst
1192 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1193
1194 if (g_conf()->mds_session_blocklist_on_timeout) {
1195 CachedStackStringStream css;
1196 mds->evict_client(session->get_client().v, false, true, *css, nullptr);
1197 } else {
1198 kill_session(session, NULL);
1199 }
1200 }
1201 }
1202
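// Evict clients that have not acknowledged cap revokes within
// mds_cap_revoke_eviction_timeout (a value of 0 disables this check).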
1203 void Server::evict_cap_revoke_non_responders() {
1204 if (!cap_revoke_eviction_timeout) {
1205 return;
1206 }
1207
1208 auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
1209
1210 for (auto const &client: to_evict) {
1211 mds->clog->warn() << "client id " << client << " has not responded to"
1212 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1213 << " seconds, evicting";
1214 dout(1) << __func__ << ": evicting cap revoke non-responder client id "
1215 << client << dendl;
1216
1217 CachedStackStringStream css;
1218 bool evicted = mds->evict_client(client.v, false,
1219 g_conf()->mds_session_blocklist_on_evict,
1220 *css, nullptr);
1221 if (evicted && logger) {
1222 logger->inc(l_mdss_cap_revoke_eviction);
1223 }
1224 }
1225 }
1226
1227 void Server::handle_conf_change(const std::set<std::string>& changed) {
1228 if (changed.count("mds_forward_all_requests_to_auth")){
1229 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
1230 }
1231 if (changed.count("mds_cap_revoke_eviction_timeout")) {
1232 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
1233 dout(20) << __func__ << " cap revoke eviction timeout changed to "
1234 << cap_revoke_eviction_timeout << dendl;
1235 }
1236 if (changed.count("mds_recall_max_decay_rate")) {
1237 recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
1238 }
1239 if (changed.count("mds_max_snaps_per_dir")) {
1240 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
1241 dout(20) << __func__ << " max snapshots per directory changed to "
1242 << max_snaps_per_dir << dendl;
1243 }
1244 if (changed.count("mds_client_delegate_inos_pct")) {
1245 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
1246 }
1247 if (changed.count("mds_max_caps_per_client")) {
1248 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1249 }
1250 if (changed.count("mds_session_cap_acquisition_throttle")) {
1251 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
1252 }
1253 if (changed.count("mds_session_max_caps_throttle_ratio")) {
1254 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
1255 }
1256 if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1257 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1258 }
1259 if (changed.count("mds_alternate_name_max")) {
1260 alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
1261 }
1262 if (changed.count("mds_dir_max_entries")) {
1263 dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
1264 dout(20) << __func__ << " max entries per directory changed to "
1265 << dir_max_entries << dendl;
1266 }
1267 if (changed.count("mds_bal_fragment_size_max")) {
1268 bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
1269 dout(20) << __func__ << " max fragment size changed to "
1270 << bal_fragment_size_max << dendl;
1271 }
1272 }
1273
1274 /*
1275 * XXX bump in the interface here, not using an MDSContext here
1276 * because all the callers right now happen to use a SaferCond
1277 */
1278 void Server::kill_session(Session *session, Context *on_safe)
1279 {
1280 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
1281
1282 if ((session->is_opening() ||
1283 session->is_open() ||
1284 session->is_stale()) &&
1285 !session->is_importing()) {
1286 dout(10) << "kill_session " << session << dendl;
1287 journal_close_session(session, Session::STATE_KILLING, on_safe);
1288 } else {
1289 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
1290 if (session->is_closing() ||
1291 session->is_killing()) {
1292 if (on_safe)
1293 mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
1294 } else {
1295 ceph_assert(session->is_closed() ||
1296 session->is_importing());
1297 if (on_safe)
1298 on_safe->complete(0);
1299 }
1300 }
1301 }
1302
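// Kill the sessions of clients whose address appears in the given OSDMap
// blocklist; pre-nautilus maps are also checked with the legacy address type.
// Returns the number of sessions killed.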
1303 size_t Server::apply_blocklist(const std::set<entity_addr_t> &blocklist)
1304 {
1305 bool prenautilus = mds->objecter->with_osdmap(
1306 [&](const OSDMap& o) {
1307 return o.require_osd_release < ceph_release_t::nautilus;
1308 });
1309
1310 std::vector<Session*> victims;
1311 const auto& sessions = mds->sessionmap.get_sessions();
1312 for (const auto& p : sessions) {
1313 if (!p.first.is_client()) {
1314 // Do not apply OSDMap blocklist to MDS daemons, we find out
1315 // about their death via MDSMap.
1316 continue;
1317 }
1318
1319 Session *s = p.second;
1320 auto inst_addr = s->info.inst.addr;
1321 // blocklist entries are always TYPE_ANY for nautilus+
1322 inst_addr.set_type(entity_addr_t::TYPE_ANY);
1323 if (blocklist.count(inst_addr)) {
1324 victims.push_back(s);
1325 continue;
1326 }
1327 if (prenautilus) {
1328 // ...except pre-nautilus, they were TYPE_LEGACY
1329 inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
1330 if (blocklist.count(inst_addr)) {
1331 victims.push_back(s);
1332 }
1333 }
1334 }
1335
1336 for (const auto& s : victims) {
1337 kill_session(s, nullptr);
1338 }
1339
1340 dout(10) << "apply_blocklist: killed " << victims.size() << dendl;
1341
1342 return victims.size();
1343 }
1344
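// Move the session to the given closing/killing state, journal an ESession
// close that releases its preallocated inos, and kill any requests still
// attached to the session.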
1345 void Server::journal_close_session(Session *session, int state, Context *on_safe)
1346 {
1347 dout(10) << __func__ << " : "
1348 << session->info.inst
1349 << " pending_prealloc_inos " << session->pending_prealloc_inos
1350 << " free_prealloc_inos " << session->free_prealloc_inos
1351 << " delegated_inos " << session->delegated_inos << dendl;
1352
1353 uint64_t sseq = mds->sessionmap.set_state(session, state);
1354 version_t pv = mds->sessionmap.mark_projected(session);
1355 version_t piv = 0;
1356
1357 // release alloc and pending-alloc inos for this session
1358 // and wipe out session state, in case the session close aborts for some reason
1359 interval_set<inodeno_t> inos_to_free;
1360 inos_to_free.insert(session->pending_prealloc_inos);
1361 inos_to_free.insert(session->free_prealloc_inos);
1362 if (inos_to_free.size()) {
1363 mds->inotable->project_release_ids(inos_to_free);
1364 piv = mds->inotable->get_projected_version();
1365 } else
1366 piv = 0;
1367
1368 auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
1369 auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
1370 session->delegated_inos, mdlog->get_current_segment(), on_safe);
1371 mdlog->start_submit_entry(le, fin);
1372 mdlog->flush();
1373
1374 // clean up requests, too
1375 while(!session->requests.empty()) {
1376 auto mdr = MDRequestRef(*session->requests.begin());
1377 mdcache->request_kill(mdr);
1378 }
1379
1380 finish_flush_session(session, session->get_push_seq());
1381 }
1382
1383 void Server::reconnect_clients(MDSContext *reconnect_done_)
1384 {
1385 reconnect_done = reconnect_done_;
1386
1387 auto now = clock::now();
1388 set<Session*> sessions;
1389 mds->sessionmap.get_client_session_set(sessions);
1390 for (auto session : sessions) {
1391 if (session->is_open()) {
1392 client_reconnect_gather.insert(session->get_client());
1393 session->set_reconnecting(true);
1394 session->last_cap_renew = now;
1395 }
1396 }
1397
1398 if (client_reconnect_gather.empty()) {
1399 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
1400 reconnect_gather_finish();
1401 return;
1402 }
1403
1404 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1405
1406 reconnect_start = now;
1407 dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
1408 mds->sessionmap.dump();
1409 }
1410
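// Handle a client's reconnect: deny it if we are no longer in the reconnect
// phase (or mds_deny_all_reconnect is set), the MDS is read-only, or required
// features are missing; otherwise re-register the client's snaprealms and caps
// and finish the reconnect gather once every client has reported in.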
1411 void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
1412 {
1413 dout(7) << "handle_client_reconnect " << m->get_source()
1414 << (m->has_more() ? " (more)" : "") << dendl;
1415 client_t from = m->get_source().num();
1416 Session *session = mds->get_session(m);
1417 if (!session) {
1418 dout(0) << " ignoring sessionless msg " << *m << dendl;
1419 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
1420 reply->metadata["error_string"] = "sessionless";
1421 mds->send_message(reply, m->get_connection());
1422 return;
1423 }
1424
1425 if (!session->is_open()) {
1426 dout(0) << " ignoring msg from not-open session" << *m << dendl;
1427 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1428 mds->send_message(reply, m->get_connection());
1429 return;
1430 }
1431
1432 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1433
1434 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1435 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1436 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1437 return;
1438 }
1439
1440 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
1441 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1442
1443 bool deny = false;
1444 if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
1445 // XXX maybe in the future we can do better than this?
1446 if (reconnect_all_deny) {
1447 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
1448 } else {
1449 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1450 }
1451 mds->clog->info() << "denied reconnect attempt (mds is "
1452 << ceph_mds_state_name(mds->get_state())
1453 << ") from " << m->get_source_inst()
1454 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
1455 deny = true;
1456 } else {
1457 std::string error_str;
1458 if (!session->is_open()) {
1459 error_str = "session is closed";
1460 } else if (mdcache->is_readonly()) {
1461 error_str = "mds is readonly";
1462 } else {
1463 if (session->info.client_metadata.features.empty())
1464 infer_supported_features(session, session->info.client_metadata);
1465
1466 feature_bitset_t missing_features = required_client_features;
1467 missing_features -= session->info.client_metadata.features;
1468 if (!missing_features.empty()) {
1469 CachedStackStringStream css;
1470 *css << "missing required features '" << missing_features << "'";
1471 error_str = css->strv();
1472 }
1473 }
1474
1475 if (!error_str.empty()) {
1476 deny = true;
1477 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1478 mds->clog->info() << "denied reconnect attempt from "
1479 << m->get_source_inst() << " (" << error_str << ")";
1480 }
1481 }
1482
1483 if (deny) {
1484 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1485 mds->send_message_client(r, session);
1486 if (session->is_open()) {
1487 client_reconnect_denied.insert(session->get_client());
1488 }
1489 return;
1490 }
1491
1492 if (!m->has_more()) {
1493 metrics_handler->add_session(session);
1494 // notify client of success with an OPEN
1495 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1496 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
1497 reply->supported_features = supported_features;
1498 mds->send_message_client(reply, session);
1499 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1500 }
1501
1502 session->last_cap_renew = clock::now();
1503
1504 // snaprealms
1505 for (const auto &r : m->realms) {
1506 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
1507 if (in && in->state_test(CInode::STATE_PURGING))
1508 continue;
1509 if (in) {
1510 if (in->snaprealm) {
1511 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
1512 } else {
1513 // this can happen if we are non-auth or we rollback snaprealm
1514 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
1515 }
1516 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1517 } else {
1518 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1519 << " seq " << r.realm.seq << dendl;
1520 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1521 }
1522 }
1523
1524 // caps
1525 for (const auto &p : m->caps) {
1526 // make sure our last_cap_id is MAX over all issued caps
1527 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1528 mdcache->last_cap_id = p.second.capinfo.cap_id;
1529
1530 CInode *in = mdcache->get_inode(p.first);
1531 if (in && in->state_test(CInode::STATE_PURGING))
1532 continue;
1533 if (in && in->is_auth()) {
1534 // we recovered it, and it's ours. take note.
1535 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
1536 << " on " << *in << dendl;
1537 in->reconnect_cap(from, p.second, session);
1538 mdcache->add_reconnected_cap(from, p.first, p.second);
1539 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
1540 continue;
1541 }
1542
1543 if (in && !in->is_auth()) {
1544 // not mine.
1545 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1546 // add to cap export list.
1547 mdcache->rejoin_export_caps(p.first, from, p.second,
1548 in->authority().first, true);
1549 } else {
1550 // don't know if the inode is mine
1551 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1552 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
1553 }
1554 }
1555
1556 reconnect_last_seen = clock::now();
1557
1558 if (!m->has_more()) {
1559 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1560
1561 // remove from gather set
1562 client_reconnect_gather.erase(from);
1563 session->set_reconnecting(false);
1564 if (client_reconnect_gather.empty())
1565 reconnect_gather_finish();
1566 }
1567 }
1568
1569 void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1570 {
1571 int supported = -1;
1572 auto it = client_metadata.find("ceph_version");
1573 if (it != client_metadata.end()) {
1574 // user space client
1575 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1576 supported = CEPHFS_FEATURE_LUMINOUS;
1577 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1578 supported = CEPHFS_FEATURE_KRAKEN;
1579 } else {
1580 it = client_metadata.find("kernel_version");
1581 if (it != client_metadata.end()) {
1582 // kernel client
1583 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1584 supported = CEPHFS_FEATURE_LUMINOUS;
1585 }
1586 }
1587 if (supported == -1 &&
1588 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1589 supported = CEPHFS_FEATURE_JEWEL;
1590
1591 if (supported >= 0) {
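// set every feature bit from 0 up to and including 'supported'; e.g. if
// supported == 3 the resulting bitmask is 0b1111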
1592 unsigned long value = (1UL << (supported + 1)) - 1;
1593 client_metadata.features = feature_bitset_t(value);
1594 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1595 }
1596 }
1597
1598 void Server::update_required_client_features()
1599 {
1600 required_client_features = mds->mdsmap->get_required_client_features();
1601 dout(7) << "required_client_features: " << required_client_features << dendl;
1602
1603 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1604 set<Session*> sessions;
1605 mds->sessionmap.get_client_session_set(sessions);
1606 for (auto session : sessions) {
1607 feature_bitset_t missing_features = required_client_features;
1608 missing_features -= session->info.client_metadata.features;
1609 if (!missing_features.empty()) {
1610 bool blocklisted = mds->objecter->with_osdmap(
1611 [session](const OSDMap &osd_map) -> bool {
1612 return osd_map.is_blocklisted(session->info.inst.addr);
1613 });
1614 if (blocklisted)
1615 continue;
1616
1617 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1618 << missing_features << "'";
1619 CachedStackStringStream css;
1620 mds->evict_client(session->get_client().v, false,
1621 g_conf()->mds_session_blocklist_on_evict, *css);
1622 }
1623 }
1624 }
1625 }
1626
1627 void Server::reconnect_gather_finish()
1628 {
1629 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
1630 ceph_assert(reconnect_done);
1631
1632 if (!mds->snapclient->is_synced()) {
1633 // make sure snaptable cache is populated. snaprealms will be
1634 // extensively used in rejoin stage.
1635 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1636 mds->snapclient->wait_for_sync(reconnect_done);
1637 } else {
1638 reconnect_done->complete(0);
1639 }
1640 reconnect_done = NULL;
1641 }
1642
1643 void Server::reconnect_tick()
1644 {
1645 bool reject_all_reconnect = false;
1646 if (reconnect_evicting) {
1647 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
1648 return;
1649 }
1650
1651 /*
1652 * Set mds_deny_all_reconnect to reject all reconnect requests and
1653 * load less metadata in the rejoin phase. This shortens reboot time.
1654 * Moreover, loading less metadata increases the chance that a standby with less memory can fail over.
1655 *
1656 * Why not shorten the reconnect period instead?
1657 * Clients may send unsafe or retried requests, which weren't
1658 * completed before the old mds stopped, to the new mds. These requests may
1659 * need to be processed during the new mds's clientreplay phase,
1660 * see: https://github.com/ceph/ceph/pull/29059.
1661 */
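// For example, an operator can enable this via the standard config interface
// (shown here only as an illustration):
//   ceph config set mds mds_deny_all_reconnect true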
1662 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1663 if (client_reconnect_gather.empty())
1664 return;
1665
1666 if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
1667 reject_all_reconnect = true;
1668
1669 auto now = clock::now();
1670 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
1671 if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
1672 return;
1673
1674 vector<Session*> remaining_sessions;
1675 remaining_sessions.reserve(client_reconnect_gather.size());
1676 for (auto c : client_reconnect_gather) {
1677 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1678 ceph_assert(session);
1679 remaining_sessions.push_back(session);
1680 // client re-sends cap flush messages before the reconnect message
1681 if (session->last_seen > reconnect_last_seen)
1682 reconnect_last_seen = session->last_seen;
1683 }
1684
1685 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
1686 if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
1687 dout(7) << "reconnect_tick: last seen " << elapse2
1688 << " seconds ago, extending reconnect interval" << dendl;
1689 return;
1690 }
1691
1692 dout(7) << "reconnect timed out, " << remaining_sessions.size()
1693 << " clients have not reconnected in time" << dendl;
1694
1695 // If we're doing blocklist evictions, use this to wait for them before
1696 // proceeding to reconnect_gather_finish
1697 MDSGatherBuilder gather(g_ceph_context);
1698
1699 for (auto session : remaining_sessions) {
1700 // Keep sessions that specified a timeout. These sessions prevent the
1701 // mds from going active; it goes active only after they have all been
1702 // killed or reclaimed.
1703 if (session->info.client_metadata.find("timeout") !=
1704 session->info.client_metadata.end()) {
1705 dout(1) << "reconnect keeps " << session->info.inst
1706 << ", need to be reclaimed" << dendl;
1707 client_reclaim_gather.insert(session->get_client());
1708 continue;
1709 }
1710
1711 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
1712
1713 mds->clog->warn() << "evicting unresponsive client " << *session
1714 << ", after waiting " << elapse1
1715 << " seconds during MDS startup";
1716
1717 // make _session_logged() purge orphan objects of lost async/unsafe requests
1718 session->delegated_inos.swap(session->free_prealloc_inos);
1719
1720 if (g_conf()->mds_session_blocklist_on_timeout) {
1721 CachedStackStringStream css;
1722 mds->evict_client(session->get_client().v, false, true, *css,
1723 gather.new_sub());
1724 } else {
1725 kill_session(session, NULL);
1726 }
1727
1728 failed_reconnects++;
1729 }
1730 client_reconnect_gather.clear();
1731 client_reconnect_denied.clear();
1732
1733 if (gather.has_subs()) {
1734 dout(1) << "reconnect will complete once clients are evicted" << dendl;
1735 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
1736 [this](int r){reconnect_gather_finish();})));
1737 gather.activate();
1738 reconnect_evicting = true;
1739 } else {
1740 reconnect_gather_finish();
1741 }
1742 }
1743
1744 void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1745 {
1746 if (!locks.length()) return;
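// the locks bufferlist contains two sections, fcntl locks then flock locks,
// each encoded as a count followed by that many ceph_filelock records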
1747 int numlocks;
1748 ceph_filelock lock;
1749 auto p = locks.cbegin();
1750 decode(numlocks, p);
1751 for (int i = 0; i < numlocks; ++i) {
1752 decode(lock, p);
1753 lock.client = client;
1754 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1755 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1756 }
1757 decode(numlocks, p);
1758 for (int i = 0; i < numlocks; ++i) {
1759 decode(lock, p);
1760 lock.client = client;
1761 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1762 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1763 }
1764 }
1765
1766 /**
1767 * Call this when the MDCache is oversized, to send requests to the clients
1768 * to trim some caps, and consequently unpin some inodes in the MDCache so
1769 * that it can trim too.
1770 */
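// Returns a pair {throttled, caps_recalled}: whether any recall was throttled
// and how many caps the sessions were asked to release.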
1771 std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1772 {
1773 const auto now = clock::now();
1774 const bool steady = !!(flags&RecallFlags::STEADY);
1775 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1776 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1777 const bool trim = !!(flags&RecallFlags::TRIM);
1778
1779 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1780 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1781 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1782 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1783 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
1784 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
1785
1786 dout(7) << __func__ << ":"
1787 << " min=" << min_caps_per_client
1788 << " max=" << max_caps_per_client
1789 << " total=" << Capability::count()
1790 << " flags=" << flags
1791 << dendl;
1792
1793 /* trim caps of sessions with the most caps first */
1794 std::multimap<uint64_t, Session*> caps_session;
1795 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
1796 auto num_caps = s->caps.size();
1797 auto cache_liveness = s->get_session_cache_liveness();
1798 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
1799 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1800 }
1801 };
1802 mds->sessionmap.get_client_sessions(std::move(f));
1803
1804 std::pair<bool, uint64_t> result = {false, 0};
1805 auto& [throttled, caps_recalled] = result;
1806 last_recall_state = now;
1807 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
1808 if (!session->is_open() ||
1809 !session->get_connection() ||
1810 !session->info.inst.name.is_client())
1811 continue;
1812
1813 dout(10) << __func__ << ":"
1814 << " session " << session->info.inst
1815 << " caps " << num_caps
1816 << ", leases " << session->leases.size()
1817 << dendl;
1818
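/* Choose a new cap limit for the session: drop it by up to recall_max_caps,
 * but never below min_caps_per_client. For example (illustrative numbers
 * only): with num_caps=100000, recall_max_caps=30000 and
 * min_caps_per_client=100, newlim becomes 70000. */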
1819 uint64_t newlim;
1820 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1821 newlim = min_caps_per_client;
1822 } else {
1823 newlim = num_caps-recall_max_caps;
1824 }
1825 if (num_caps > newlim) {
1826 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1827 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1828 newlim = num_caps-recall;
1829 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1830 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1831 const uint64_t global_recall_throttle = recall_throttle.get();
1832 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1833 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1834 throttled = true;
1835 continue;
1836 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1837 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1838 throttled = true;
1839 continue;
1840 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1841 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1842 throttled = true;
1843 break;
1844 }
1845
1846 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1847 if (steady) {
1848 const auto session_recall = session->get_recall_caps();
1849 const auto session_release = session->get_release_caps();
1850 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1851 /* The session has been unable to keep up with the number of caps
1852 * recalled (by half); additionally, to prevent marking sessions
1853 * we've just begun to recall from, the session_recall counter
1854 * (decayed count of caps recently recalled) is **greater** than the
1855 * session threshold for the session's cap recall throttle.
1856 */
1857 dout(15) << " 2*session_release < session_recall"
1858 " (2*" << session_release << " < " << session_recall << ") &&"
1859 " 2*session_recall < recall_max_decay_threshold"
1860 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
1861 " Skipping because we are unlikely to get more released." << dendl;
1862 continue;
1863 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1864 /* The number of caps we would recall is less than the number we *could*
1865 * recall (so there isn't much left to recall?) and twice that number is
1866 * still less than the session's recall_caps counter (decayed count of
1867 * caps recently recalled).
1868 */
1869 dout(15) << " 2*recall < session_recall "
1870 " (2*" << recall << " < " << session_recall << ") &&"
1871 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1872 " Skipping because we are unlikely to get more released." << dendl;
1873 continue;
1874 }
1875 }
1876
1877 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1878
1879 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
1880 m->head.max_caps = newlim;
1881 mds->send_message_client(m, session);
1882 if (gather) {
1883 flush_session(session, *gather);
1884 }
1885 caps_recalled += session->notify_recall_sent(newlim);
1886 recall_throttle.hit(recall);
1887 }
1888 }
1889
1890 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1891
1892 return result;
1893 }
1894
1895 void Server::force_clients_readonly()
1896 {
1897 dout(10) << "force_clients_readonly" << dendl;
1898 set<Session*> sessions;
1899 mds->sessionmap.get_client_session_set(sessions);
1900 for (set<Session*>::const_iterator p = sessions.begin();
1901 p != sessions.end();
1902 ++p) {
1903 Session *session = *p;
1904 if (!session->info.inst.name.is_client() ||
1905 !(session->is_open() || session->is_stale()))
1906 continue;
1907 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1908 }
1909 }
1910
1911 /*******
1912 * some generic stuff for finishing off requests
1913 */
1914 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1915 {
1916 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1917 ceph_assert(!mdr->has_completed);
1918
1919 // note trace items for eventual reply.
1920 mdr->tracei = in;
1921 if (in)
1922 mdr->pin(in);
1923
1924 mdr->tracedn = dn;
1925 if (dn)
1926 mdr->pin(dn);
1927
1928 early_reply(mdr, in, dn);
1929
1930 mdr->committing = true;
1931 submit_mdlog_entry(le, fin, mdr, __func__);
1932
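// replay ops are throttled one at a time; otherwise, if no early reply was
// sent, flush the log now so the client is not left waiting on the journal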
1933 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1934 if (mds->queue_one_replay()) {
1935 dout(10) << " queued next replay op" << dendl;
1936 } else {
1937 dout(10) << " journaled last replay op" << dendl;
1938 }
1939 } else if (mdr->did_early_reply)
1940 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1941 else
1942 mdlog->flush();
1943 }
1944
1945 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1946 std::string_view event)
1947 {
1948 if (mdr) {
1949 string event_str("submit entry: ");
1950 event_str += event;
1951 mdr->mark_event(event_str);
1952 }
1953 mdlog->submit_entry(le, fin);
1954 }
1955
1956 /*
1957 * send response built from mdr contents and error code; clean up mdr
1958 */
1959 void Server::respond_to_request(MDRequestRef& mdr, int r)
1960 {
1961 if (mdr->client_request) {
1962 if (mdr->is_batch_head()) {
1963 dout(20) << __func__ << " batch head " << *mdr << dendl;
1964 mdr->release_batch_op()->respond(r);
1965 } else {
1966 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
1967 }
1968 } else if (mdr->internal_op > -1) {
1969 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1970 if (!mdr->internal_op_finish)
1971 ceph_abort_msg("trying to respond to internal op without finisher");
1972 mdr->internal_op_finish->complete(r);
1973 mdcache->request_finish(mdr);
1974 }
1975 }
1976
1977 // record mds request latency statistics per op type
1978 void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
1979 {
1980 int code = l_mdss_first;
1981 switch(req->get_op()) {
1982 case CEPH_MDS_OP_LOOKUPHASH:
1983 code = l_mdss_req_lookuphash_latency;
1984 break;
1985 case CEPH_MDS_OP_LOOKUPINO:
1986 code = l_mdss_req_lookupino_latency;
1987 break;
1988 case CEPH_MDS_OP_LOOKUPPARENT:
1989 code = l_mdss_req_lookupparent_latency;
1990 break;
1991 case CEPH_MDS_OP_LOOKUPNAME:
1992 code = l_mdss_req_lookupname_latency;
1993 break;
1994 case CEPH_MDS_OP_LOOKUP:
1995 code = l_mdss_req_lookup_latency;
1996 break;
1997 case CEPH_MDS_OP_LOOKUPSNAP:
1998 code = l_mdss_req_lookupsnap_latency;
1999 break;
2000 case CEPH_MDS_OP_GETATTR:
2001 code = l_mdss_req_getattr_latency;
2002 break;
2003 case CEPH_MDS_OP_SETATTR:
2004 code = l_mdss_req_setattr_latency;
2005 break;
2006 case CEPH_MDS_OP_SETLAYOUT:
2007 code = l_mdss_req_setlayout_latency;
2008 break;
2009 case CEPH_MDS_OP_SETDIRLAYOUT:
2010 code = l_mdss_req_setdirlayout_latency;
2011 break;
2012 case CEPH_MDS_OP_SETXATTR:
2013 code = l_mdss_req_setxattr_latency;
2014 break;
2015 case CEPH_MDS_OP_RMXATTR:
2016 code = l_mdss_req_rmxattr_latency;
2017 break;
2018 case CEPH_MDS_OP_READDIR:
2019 code = l_mdss_req_readdir_latency;
2020 break;
2021 case CEPH_MDS_OP_SETFILELOCK:
2022 code = l_mdss_req_setfilelock_latency;
2023 break;
2024 case CEPH_MDS_OP_GETFILELOCK:
2025 code = l_mdss_req_getfilelock_latency;
2026 break;
2027 case CEPH_MDS_OP_CREATE:
2028 code = l_mdss_req_create_latency;
2029 break;
2030 case CEPH_MDS_OP_OPEN:
2031 code = l_mdss_req_open_latency;
2032 break;
2033 case CEPH_MDS_OP_MKNOD:
2034 code = l_mdss_req_mknod_latency;
2035 break;
2036 case CEPH_MDS_OP_LINK:
2037 code = l_mdss_req_link_latency;
2038 break;
2039 case CEPH_MDS_OP_UNLINK:
2040 code = l_mdss_req_unlink_latency;
2041 break;
2042 case CEPH_MDS_OP_RMDIR:
2043 code = l_mdss_req_rmdir_latency;
2044 break;
2045 case CEPH_MDS_OP_RENAME:
2046 code = l_mdss_req_rename_latency;
2047 break;
2048 case CEPH_MDS_OP_MKDIR:
2049 code = l_mdss_req_mkdir_latency;
2050 break;
2051 case CEPH_MDS_OP_SYMLINK:
2052 code = l_mdss_req_symlink_latency;
2053 break;
2054 case CEPH_MDS_OP_LSSNAP:
2055 code = l_mdss_req_lssnap_latency;
2056 break;
2057 case CEPH_MDS_OP_MKSNAP:
2058 code = l_mdss_req_mksnap_latency;
2059 break;
2060 case CEPH_MDS_OP_RMSNAP:
2061 code = l_mdss_req_rmsnap_latency;
2062 break;
2063 case CEPH_MDS_OP_RENAMESNAP:
2064 code = l_mdss_req_renamesnap_latency;
2065 break;
2066 default:
2067 dout(1) << ": unknown client op" << dendl;
2068 return;
2069 }
2070 logger->tinc(code, lat);
2071 }
2072
2073 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
2074 {
2075 if (!g_conf()->mds_early_reply)
2076 return;
2077
2078 if (mdr->no_early_reply) {
2079 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
2080 return;
2081 }
2082
2083 if (mdr->has_more() && mdr->more()->has_journaled_peers) {
2084 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
2085 return;
2086 }
2087
2088 if (mdr->alloc_ino) {
2089 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2090 return;
2091 }
2092
2093 const cref_t<MClientRequest> &req = mdr->client_request;
2094 entity_inst_t client_inst = req->get_source_inst();
2095 if (client_inst.name.is_mds())
2096 return;
2097
2098 if (req->is_replay()) {
2099 dout(10) << " no early reply on replay op" << dendl;
2100 return;
2101 }
2102
2103
2104 auto reply = make_message<MClientReply>(*req, 0);
2105 reply->set_unsafe();
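// the reply is flagged unsafe because the change has not been journaled yet;
// the client receives the final (safe) reply from reply_client_request()
// once the log event commits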
2106
2107 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2108 //
2109 // _rename_finish() does not send dentry link/unlink messages to replicas,
2110 // so do not mark xlocks on dentries "done"; the xlocks prevent dentries
2111 // that have projected linkages from getting new replicas.
2112 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2113
2114 dout(10) << "early_reply " << reply->get_result()
2115 << " (" << cpp_strerror(reply->get_result())
2116 << ") " << *req << dendl;
2117
2118 if (tracei || tracedn) {
2119 if (tracei)
2120 mdr->cap_releases.erase(tracei->vino());
2121 if (tracedn)
2122 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2123
2124 set_trace_dist(reply, tracei, tracedn, mdr);
2125 }
2126
2127 reply->set_extra_bl(mdr->reply_extra_bl);
2128 mds->send_message_client(reply, mdr->session);
2129
2130 mdr->did_early_reply = true;
2131
2132 mds->logger->inc(l_mds_reply);
2133 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2134 mds->logger->tinc(l_mds_reply_latency, lat);
2135 if (client_inst.name.is_client()) {
2136 mds->sessionmap.hit_session(mdr->session);
2137 }
2138 perf_gather_op_latency(req, lat);
2139 dout(20) << "lat " << lat << dendl;
2140
2141 mdr->mark_event("early_replied");
2142 }
2143
2144 /*
2145 * send the given reply
2146 * include a trace to tracei
2147 * clean up mdr
2148 */
2149 void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
2150 {
2151 ceph_assert(mdr.get());
2152 const cref_t<MClientRequest> &req = mdr->client_request;
2153
2154 dout(7) << "reply_client_request " << reply->get_result()
2155 << " (" << cpp_strerror(reply->get_result())
2156 << ") " << *req << dendl;
2157
2158 mdr->mark_event("replying");
2159
2160 Session *session = mdr->session;
2161
2162 // note successful request in session map?
2163 //
2164 // setfilelock requests are special: they only modify state in MDS memory.
2165 // That state is lost when the MDS fails. If a client re-sends a completed
2166 // setfilelock request, it means the client did not receive the corresponding
2167 // setfilelock reply, so the MDS should re-execute the request.
2168 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2169 reply->get_result() == 0 && session) {
2170 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2171 session->add_completed_request(mdr->reqid.tid, created);
2172 if (mdr->ls) {
2173 mdr->ls->touched_sessions.insert(session->info.inst.name);
2174 }
2175 }
2176
2177 // give any preallocated inos to the session
2178 apply_allocated_inos(mdr, session);
2179
2180 // get tracei/tracedn from mdr?
2181 CInode *tracei = mdr->tracei;
2182 CDentry *tracedn = mdr->tracedn;
2183
2184 bool is_replay = mdr->client_request->is_replay();
2185 bool did_early_reply = mdr->did_early_reply;
2186 entity_inst_t client_inst = req->get_source_inst();
2187
2188 if (!did_early_reply && !is_replay) {
2189
2190 mds->logger->inc(l_mds_reply);
2191 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2192 mds->logger->tinc(l_mds_reply_latency, lat);
2193 if (session && client_inst.name.is_client()) {
2194 mds->sessionmap.hit_session(session);
2195 }
2196 perf_gather_op_latency(req, lat);
2197 dout(20) << "lat " << lat << dendl;
2198
2199 if (tracei)
2200 mdr->cap_releases.erase(tracei->vino());
2201 if (tracedn)
2202 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2203 }
2204
2205 // drop non-rdlocks before replying, so that we can issue leases
2206 mdcache->request_drop_non_rdlocks(mdr);
2207
2208 // reply at all?
2209 if (session && !client_inst.name.is_mds()) {
2210 // send reply.
2211 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2212 (tracei || tracedn)) {
2213 if (is_replay) {
2214 if (tracei)
2215 mdcache->try_reconnect_cap(tracei, session);
2216 } else {
2217 // include metadata in reply
2218 set_trace_dist(reply, tracei, tracedn, mdr);
2219 }
2220 }
2221
2222 // We can set the extra bl unconditionally: if it's already been sent in the
2223 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2224 reply->set_extra_bl(mdr->reply_extra_bl);
2225
2226 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
2227 mds->send_message_client(reply, session);
2228 }
2229
2230 if (req->is_queued_for_replay() &&
2231 (mdr->has_completed || reply->get_result() < 0)) {
2232 if (reply->get_result() < 0) {
2233 int r = reply->get_result();
2234 derr << "reply_client_request: failed to replay " << *req
2235 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2236 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2237 }
2238 mds->queue_one_replay();
2239 }
2240
2241 // clean up request
2242 mdcache->request_finish(mdr);
2243
2244 // take a closer look at tracei, if it happens to be a remote link
2245 if (tracei &&
2246 tracedn &&
2247 tracedn->get_projected_linkage()->is_remote()) {
2248 mdcache->eval_remote(tracedn);
2249 }
2250 }
2251
2252 /*
2253 * pass inode OR dentry (not both, or we may get confused)
2254 *
2255 * trace is in reverse order (i.e. root inode comes last)
2256 */
2257 void Server::set_trace_dist(const ref_t<MClientReply> &reply,
2258 CInode *in, CDentry *dn,
2259 MDRequestRef& mdr)
2260 {
2261 // skip doing this for debugging purposes?
2262 if (g_conf()->mds_inject_traceless_reply_probability &&
2263 mdr->ls && !mdr->o_trunc &&
2264 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2265 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2266 return;
2267 }
2268
2269 // inode, dentry, dir, ..., inode
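// (for a dentry trace the encode order below is: parent dir inode, dirstat,
// dentry name + lease, then the target inode, if any)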
2270 bufferlist bl;
2271 mds_rank_t whoami = mds->get_nodeid();
2272 Session *session = mdr->session;
2273 snapid_t snapid = mdr->snapid;
2274 utime_t now = ceph_clock_now();
2275
2276 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2277
2278 // realm
2279 if (snapid == CEPH_NOSNAP) {
2280 SnapRealm *realm;
2281 if (in)
2282 realm = in->find_snaprealm();
2283 else
2284 realm = dn->get_dir()->get_inode()->find_snaprealm();
2285 reply->snapbl = realm->get_snap_trace();
2286 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2287 }
2288
2289 // dir + dentry?
2290 if (dn) {
2291 reply->head.is_dentry = 1;
2292 CDir *dir = dn->get_dir();
2293 CInode *diri = dir->get_inode();
2294
2295 diri->encode_inodestat(bl, session, NULL, snapid);
2296 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2297
2298 #ifdef MDS_VERIFY_FRAGSTAT
2299 if (dir->is_complete())
2300 dir->verify_fragstat();
2301 #endif
2302 DirStat ds;
2303 ds.frag = dir->get_frag();
2304 ds.auth = dir->get_dir_auth().first;
2305 if (dir->is_auth() && !forward_all_requests_to_auth)
2306 dir->get_dist_spec(ds.dist, whoami);
2307
2308 dir->encode_dirstat(bl, session->info, ds);
2309 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2310
2311 encode(dn->get_name(), bl);
2312
2313 int lease_mask = 0;
2314 CDentry::linkage_t *dnl = dn->get_linkage(mdr->get_client(), mdr);
2315 if (dnl->is_primary()) {
2316 ceph_assert(dnl->get_inode() == in);
2317 lease_mask = CEPH_LEASE_PRIMARY_LINK;
2318 } else {
2319 if (dnl->is_remote())
2320 ceph_assert(dnl->get_remote_ino() == in->ino());
2321 else
2322 ceph_assert(!in);
2323 }
2324 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, bl);
2325 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
2326 } else
2327 reply->head.is_dentry = 0;
2328
2329 // inode
2330 if (in) {
2331 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2332 dout(20) << "set_trace_dist added in " << *in << dendl;
2333 reply->head.is_target = 1;
2334 } else
2335 reply->head.is_target = 0;
2336
2337 reply->set_trace(bl);
2338 }
2339
2340 void Server::handle_client_request(const cref_t<MClientRequest> &req)
2341 {
2342 dout(4) << "handle_client_request " << *req << dendl;
2343
2344 if (mds->logger)
2345 mds->logger->inc(l_mds_request);
2346 if (logger)
2347 logger->inc(l_mdss_handle_client_request);
2348
2349 if (!mdcache->is_open()) {
2350 dout(5) << "waiting for root" << dendl;
2351 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2352 return;
2353 }
2354
2355 bool sessionclosed_isok = replay_unsafe_with_closed_session;
2356 // active session?
2357 Session *session = 0;
2358 if (req->get_source().is_client()) {
2359 session = mds->get_session(req);
2360 if (!session) {
2361 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
2362 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
2363 session->is_closing() ||
2364 session->is_killing()) {
2365 dout(5) << "session closed|closing|killing, dropping" << dendl;
2366 session = NULL;
2367 }
2368 if (!session) {
2369 if (req->is_queued_for_replay())
2370 mds->queue_one_replay();
2371 return;
2372 }
2373 }
2374
2375 // old mdsmap?
2376 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2377 // send it? hrm, this isn't ideal; they may get a lot of copies if
2378 // they have a high request rate.
2379 }
2380
2381 // completed request?
2382 bool has_completed = false;
2383 if (req->is_replay() || req->get_retry_attempt()) {
2384 ceph_assert(session);
2385 inodeno_t created;
2386 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2387 has_completed = true;
2388 if (!session->is_open())
2389 return;
2390 // Don't send a traceless reply if the completed request created a
2391 // new inode; treat the request as a lookup request instead.
2392 if (req->is_replay() ||
2393 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2394 req->get_op() != CEPH_MDS_OP_OPEN &&
2395 req->get_op() != CEPH_MDS_OP_CREATE)) {
2396 dout(5) << "already completed " << req->get_reqid() << dendl;
2397 auto reply = make_message<MClientReply>(*req, 0);
2398 if (created != inodeno_t()) {
2399 bufferlist extra;
2400 encode(created, extra);
2401 reply->set_extra_bl(extra);
2402 }
2403 mds->send_message_client(reply, session);
2404
2405 if (req->is_queued_for_replay())
2406 mds->queue_one_replay();
2407
2408 return;
2409 }
2410 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2411 req->get_op() != CEPH_MDS_OP_CREATE) {
2412 dout(10) << " completed request which created new inode " << created
2413 << ", convert it to lookup request" << dendl;
2414 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2415 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2416 }
2417 }
2418 }
2419
2420 // trim completed_request list
2421 if (req->get_oldest_client_tid() > 0) {
2422 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
2423 ceph_assert(session);
2424 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2425 // Session's 'completed_requests' was dirtied; mark it to be
2426 // potentially flushed at segment expiry.
2427 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2428
2429 if (session->get_num_trim_requests_warnings() > 0 &&
2430 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2431 session->reset_num_trim_requests_warnings();
2432 } else {
2433 if (session->get_num_completed_requests() >=
2434 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
2435 session->inc_num_trim_requests_warnings();
2436 CachedStackStringStream css;
2437 *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2438 << req->get_oldest_client_tid() << "), "
2439 << session->get_num_completed_requests()
2440 << " completed requests recorded in session\n";
2441 mds->clog->warn() << css->strv();
2442 dout(20) << __func__ << " " << css->strv() << dendl;
2443 }
2444 }
2445 }
2446
2447 // register + dispatch
2448 MDRequestRef mdr = mdcache->request_start(req);
2449 if (!mdr.get())
2450 return;
2451
2452 if (session) {
2453 mdr->session = session;
2454 session->requests.push_back(&mdr->item_session_request);
2455 }
2456
2457 if (has_completed)
2458 mdr->has_completed = true;
2459
2460 // process embedded cap releases?
2461 // (only if NOT replay!)
2462 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2463 client_t client = req->get_source().num();
2464 for (const auto &r : req->releases) {
2465 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2466 }
2467 req->releases.clear();
2468 }
2469
2470 dispatch_client_request(mdr);
2471 return;
2472 }
2473
2474 void Server::handle_osd_map()
2475 {
2476 /* Note that we check the OSDMAP_FULL flag directly rather than
2477 * using osdmap_full_flag(), because we want to know "is the flag set"
2478 * rather than "does the flag apply to us?" */
2479 mds->objecter->with_osdmap([this](const OSDMap& o) {
2480 auto pi = o.get_pg_pool(mds->get_metadata_pool());
2481 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2482 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2483 << o.get_epoch() << dendl;
2484 });
2485 }
2486
2487 void Server::dispatch_client_request(MDRequestRef& mdr)
2488 {
2489 // we shouldn't be waiting on anyone.
2490 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());
2491
2492 if (mdr->killed) {
2493 dout(10) << "request " << *mdr << " was killed" << dendl;
2494 // if the mdr is a "batch_op" and it has followers, pick a follower as
2495 // the new "head of the batch ops" and go on processing the new one.
2496 if (mdr->is_batch_head()) {
2497 int mask = mdr->client_request->head.args.getattr.mask;
2498 auto it = mdr->batch_op_map->find(mask);
2499 auto new_batch_head = it->second->find_new_head();
2500 if (!new_batch_head) {
2501 mdr->batch_op_map->erase(it);
2502 return;
2503 }
2504 mdr = std::move(new_batch_head);
2505 } else {
2506 return;
2507 }
2508 } else if (mdr->aborted) {
2509 mdr->aborted = false;
2510 mdcache->request_kill(mdr);
2511 return;
2512 }
2513
2514 const cref_t<MClientRequest> &req = mdr->client_request;
2515
2516 if (logger) logger->inc(l_mdss_dispatch_client_request);
2517
2518 dout(7) << "dispatch_client_request " << *req << dendl;
2519
2520 if (req->may_write() && mdcache->is_readonly()) {
2521 dout(10) << " read-only FS" << dendl;
2522 respond_to_request(mdr, -CEPHFS_EROFS);
2523 return;
2524 }
2525 if (mdr->has_more() && mdr->more()->peer_error) {
2526 dout(10) << " got error from peers" << dendl;
2527 respond_to_request(mdr, mdr->more()->peer_error);
2528 return;
2529 }
2530
2531 if (is_full) {
2532 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
2533 if (!cur) {
2534 // the request is already responded to
2535 return;
2536 }
2537 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2538 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2540 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2541 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2542 req->get_op() == CEPH_MDS_OP_CREATE ||
2543 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2544 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2545 ((req->get_op() == CEPH_MDS_OP_LINK ||
2546 req->get_op() == CEPH_MDS_OP_RENAME) &&
2547 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
2548 ) {
2549
2550 if (check_access(mdr, cur, MAY_FULL)) {
2551 dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2552 } else {
2553 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2554 respond_to_request(mdr, -CEPHFS_ENOSPC);
2555 return;
2556 }
2557 } else {
2558 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2559 }
2560 }
2561
2562 switch (req->get_op()) {
2563 case CEPH_MDS_OP_LOOKUPHASH:
2564 case CEPH_MDS_OP_LOOKUPINO:
2565 handle_client_lookup_ino(mdr, false, false);
2566 break;
2567 case CEPH_MDS_OP_LOOKUPPARENT:
2568 handle_client_lookup_ino(mdr, true, false);
2569 break;
2570 case CEPH_MDS_OP_LOOKUPNAME:
2571 handle_client_lookup_ino(mdr, false, true);
2572 break;
2573
2574 // inodes ops.
2575 case CEPH_MDS_OP_LOOKUP:
2576 handle_client_getattr(mdr, true);
2577 break;
2578
2579 case CEPH_MDS_OP_LOOKUPSNAP:
2580 // lookupsnap does not reference a CDentry; treat it as a getattr
2581 case CEPH_MDS_OP_GETATTR:
2582 handle_client_getattr(mdr, false);
2583 break;
2584
2585 case CEPH_MDS_OP_SETATTR:
2586 handle_client_setattr(mdr);
2587 break;
2588 case CEPH_MDS_OP_SETLAYOUT:
2589 handle_client_setlayout(mdr);
2590 break;
2591 case CEPH_MDS_OP_SETDIRLAYOUT:
2592 handle_client_setdirlayout(mdr);
2593 break;
2594 case CEPH_MDS_OP_SETXATTR:
2595 handle_client_setxattr(mdr);
2596 break;
2597 case CEPH_MDS_OP_RMXATTR:
2598 handle_client_removexattr(mdr);
2599 break;
2600
2601 case CEPH_MDS_OP_READDIR:
2602 handle_client_readdir(mdr);
2603 break;
2604
2605 case CEPH_MDS_OP_SETFILELOCK:
2606 handle_client_file_setlock(mdr);
2607 break;
2608
2609 case CEPH_MDS_OP_GETFILELOCK:
2610 handle_client_file_readlock(mdr);
2611 break;
2612
2613 // funky.
2614 case CEPH_MDS_OP_CREATE:
2615 if (mdr->has_completed)
2616 handle_client_open(mdr); // already created.. just open
2617 else
2618 handle_client_openc(mdr);
2619 break;
2620
2621 case CEPH_MDS_OP_OPEN:
2622 handle_client_open(mdr);
2623 break;
2624
2625 // namespace.
2626 // no prior locks.
2627 case CEPH_MDS_OP_MKNOD:
2628 handle_client_mknod(mdr);
2629 break;
2630 case CEPH_MDS_OP_LINK:
2631 handle_client_link(mdr);
2632 break;
2633 case CEPH_MDS_OP_UNLINK:
2634 case CEPH_MDS_OP_RMDIR:
2635 handle_client_unlink(mdr);
2636 break;
2637 case CEPH_MDS_OP_RENAME:
2638 handle_client_rename(mdr);
2639 break;
2640 case CEPH_MDS_OP_MKDIR:
2641 handle_client_mkdir(mdr);
2642 break;
2643 case CEPH_MDS_OP_SYMLINK:
2644 handle_client_symlink(mdr);
2645 break;
2646
2647
2648 // snaps
2649 case CEPH_MDS_OP_LSSNAP:
2650 handle_client_lssnap(mdr);
2651 break;
2652 case CEPH_MDS_OP_MKSNAP:
2653 handle_client_mksnap(mdr);
2654 break;
2655 case CEPH_MDS_OP_RMSNAP:
2656 handle_client_rmsnap(mdr);
2657 break;
2658 case CEPH_MDS_OP_RENAMESNAP:
2659 handle_client_renamesnap(mdr);
2660 break;
2661
2662 default:
2663 dout(1) << " unknown client op " << req->get_op() << dendl;
2664 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
2665 }
2666 }
2667
2668
2669 // ---------------------------------------
2670 // PEER REQUESTS
2671
2672 void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
2673 {
2674 dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2675 mds_rank_t from = mds_rank_t(m->get_source().num());
2676
2677 if (logger) logger->inc(l_mdss_handle_peer_request);
2678
2679 // reply?
2680 if (m->is_reply())
2681 return handle_peer_request_reply(m);
2682
2683 // the purpose of rename notify is enforcing causal message ordering, i.e. making sure
2684 // bystanders have received all messages from the rename srcdn's auth MDS.
2685 if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
2686 auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
2687 mds->send_message(reply, m->get_connection());
2688 return;
2689 }
2690
2691 CDentry *straydn = NULL;
2692 if (m->straybl.length() > 0) {
2693 mdcache->decode_replica_stray(straydn, m->straybl, from);
2694 ceph_assert(straydn);
2695 m->straybl.clear();
2696 }
2697
2698 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2699 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2700 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2701 return;
2702 }
2703
2704 // am i a new peer?
2705 MDRequestRef mdr;
2706 if (mdcache->have_request(m->get_reqid())) {
2707 // existing?
2708 mdr = mdcache->request_get(m->get_reqid());
2709
2710 // is my request newer?
2711 if (mdr->attempt > m->get_attempt()) {
2712 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2713 << ", dropping " << *m << dendl;
2714 return;
2715 }
2716
2717 if (mdr->attempt < m->get_attempt()) {
2718 // mine is old, close it out
2719 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2720 << ", closing out" << dendl;
2721 mdcache->request_finish(mdr);
2722 mdr.reset();
2723 } else if (mdr->peer_to_mds != from) {
2724 dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
2725 return;
2726 }
2727
2728 // may get these while mdr->peer_request is non-null
2729 if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
2730 mds->locker->drop_locks(mdr.get());
2731 return;
2732 }
2733 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2734 if (m->is_abort()) {
2735 mdr->aborted = true;
2736 if (mdr->peer_request) {
2737 // only abort on-going xlock, wrlock and auth pin
2738 ceph_assert(!mdr->peer_did_prepare());
2739 } else {
2740 mdcache->request_finish(mdr);
2741 }
2742 } else {
2743 if (m->inode_export.length() > 0)
2744 mdr->more()->inode_import = m->inode_export;
2745 // finish off request.
2746 mdcache->request_finish(mdr);
2747 }
2748 return;
2749 }
2750 }
2751 if (!mdr.get()) {
2752 // new?
2753 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2754 dout(10) << "missing peer request for " << m->get_reqid()
2755 << " OP_FINISH, must have lost race with a forward" << dendl;
2756 return;
2757 }
2758 mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
2759 mdr->set_op_stamp(m->op_stamp);
2760 }
2761 ceph_assert(mdr->peer_request == 0); // only one at a time, please!
2762
2763 if (straydn) {
2764 mdr->pin(straydn);
2765 mdr->straydn = straydn;
2766 }
2767
2768 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2769 mdr->locks.empty()) {
2770 dout(3) << "not active yet, waiting" << dendl;
2771 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2772 return;
2773 }
2774
2775 mdr->reset_peer_request(m);
2776
2777 dispatch_peer_request(mdr);
2778 }
2779
2780 void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
2781 {
2782 mds_rank_t from = mds_rank_t(m->get_source().num());
2783
2784 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2785 metareqid_t r = m->get_reqid();
2786 if (!mdcache->have_uncommitted_leader(r, from)) {
2787 dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
2788 << from << " reqid " << r << dendl;
2789 return;
2790 }
2791 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2792 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2793 return;
2794 }
2795
2796 if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
2797 metareqid_t r = m->get_reqid();
2798 mdcache->committed_leader_peer(r, from);
2799 return;
2800 }
2801
2802 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2803 if (m->get_attempt() != mdr->attempt) {
2804 dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
2805 << m->get_attempt() << dendl;
2806 return;
2807 }
2808
2809 switch (m->get_op()) {
2810 case MMDSPeerRequest::OP_XLOCKACK:
2811 {
2812 // identify lock, leader request
2813 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2814 m->get_object_info());
2815 mdr->more()->peers.insert(from);
2816 lock->decode_locked_state(m->get_lock_data());
2817 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2818 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
2819 mdr->finish_locking(lock);
2820 lock->get_xlock(mdr, mdr->get_client());
2821
2822 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2823 mdr->more()->waiting_on_peer.erase(from);
2824 ceph_assert(mdr->more()->waiting_on_peer.empty());
2825 mdcache->dispatch_request(mdr);
2826 }
2827 break;
2828
2829 case MMDSPeerRequest::OP_WRLOCKACK:
2830 {
2831 // identify lock, leader request
2832 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2833 m->get_object_info());
2834 mdr->more()->peers.insert(from);
2835 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2836 auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2837 ceph_assert(it->is_remote_wrlock());
2838 ceph_assert(it->wrlock_target == from);
2839
2840 mdr->finish_locking(lock);
2841
2842 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2843 mdr->more()->waiting_on_peer.erase(from);
2844 ceph_assert(mdr->more()->waiting_on_peer.empty());
2845 mdcache->dispatch_request(mdr);
2846 }
2847 break;
2848
2849 case MMDSPeerRequest::OP_AUTHPINACK:
2850 handle_peer_auth_pin_ack(mdr, m);
2851 break;
2852
2853 case MMDSPeerRequest::OP_LINKPREPACK:
2854 handle_peer_link_prep_ack(mdr, m);
2855 break;
2856
2857 case MMDSPeerRequest::OP_RMDIRPREPACK:
2858 handle_peer_rmdir_prep_ack(mdr, m);
2859 break;
2860
2861 case MMDSPeerRequest::OP_RENAMEPREPACK:
2862 handle_peer_rename_prep_ack(mdr, m);
2863 break;
2864
2865 case MMDSPeerRequest::OP_RENAMENOTIFYACK:
2866 handle_peer_rename_notify_ack(mdr, m);
2867 break;
2868
2869 default:
2870 ceph_abort();
2871 }
2872 }
2873
2874 void Server::dispatch_peer_request(MDRequestRef& mdr)
2875 {
2876 dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;
2877
2878 if (mdr->aborted) {
2879 dout(7) << " abort flag set, finishing" << dendl;
2880 mdcache->request_finish(mdr);
2881 return;
2882 }
2883
2884 if (logger) logger->inc(l_mdss_dispatch_peer_request);
2885
2886 int op = mdr->peer_request->get_op();
2887 switch (op) {
2888 case MMDSPeerRequest::OP_XLOCK:
2889 case MMDSPeerRequest::OP_WRLOCK:
2890 {
2891 // identify object
2892 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2893 mdr->peer_request->get_object_info());
2894
2895 if (!lock) {
2896 dout(10) << "don't have object, dropping" << dendl;
2897 ceph_abort(); // can this happen if we auth pinned properly?
2898 }
2899 if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2900 dout(10) << "not auth for remote xlock attempt, dropping on "
2901 << *lock << " on " << *lock->get_parent() << dendl;
2902 } else {
2903 // use acquire_locks so that we get auth_pinning.
2904 MutationImpl::LockOpVec lov;
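// carry over the locks this request already holds so that acquire_locks()
// keeps them while taking the new lock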
2905 for (const auto& p : mdr->locks) {
2906 if (p.is_xlock())
2907 lov.add_xlock(p.lock);
2908 else if (p.is_wrlock())
2909 lov.add_wrlock(p.lock);
2910 }
2911
2912 int replycode = 0;
2913 switch (op) {
2914 case MMDSPeerRequest::OP_XLOCK:
2915 lov.add_xlock(lock);
2916 replycode = MMDSPeerRequest::OP_XLOCKACK;
2917 break;
2918 case MMDSPeerRequest::OP_WRLOCK:
2919 lov.add_wrlock(lock);
2920 replycode = MMDSPeerRequest::OP_WRLOCKACK;
2921 break;
2922 }
2923
2924 if (!mds->locker->acquire_locks(mdr, lov))
2925 return;
2926
2927 // ack
2928 auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
2929 r->set_lock_type(lock->get_type());
2930 lock->get_parent()->set_object_info(r->get_object_info());
2931 if (replycode == MMDSPeerRequest::OP_XLOCKACK)
2932 lock->encode_locked_state(r->get_lock_data());
2933 mds->send_message(r, mdr->peer_request->get_connection());
2934 }
2935
2936 // done.
2937 mdr->reset_peer_request();
2938 }
2939 break;
2940
2941 case MMDSPeerRequest::OP_UNXLOCK:
2942 case MMDSPeerRequest::OP_UNWRLOCK:
2943 {
2944 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2945 mdr->peer_request->get_object_info());
2946 ceph_assert(lock);
2947 auto it = mdr->locks.find(lock);
2948 ceph_assert(it != mdr->locks.end());
2949 bool need_issue = false;
2950 switch (op) {
2951 case MMDSPeerRequest::OP_UNXLOCK:
2952 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
2953 break;
2954 case MMDSPeerRequest::OP_UNWRLOCK:
2955 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
2956 break;
2957 }
2958 if (need_issue)
2959 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2960
2961 // done. no ack necessary.
2962 mdr->reset_peer_request();
2963 }
2964 break;
2965
2966 case MMDSPeerRequest::OP_AUTHPIN:
2967 handle_peer_auth_pin(mdr);
2968 break;
2969
2970 case MMDSPeerRequest::OP_LINKPREP:
2971 case MMDSPeerRequest::OP_UNLINKPREP:
2972 handle_peer_link_prep(mdr);
2973 break;
2974
2975 case MMDSPeerRequest::OP_RMDIRPREP:
2976 handle_peer_rmdir_prep(mdr);
2977 break;
2978
2979 case MMDSPeerRequest::OP_RENAMEPREP:
2980 handle_peer_rename_prep(mdr);
2981 break;
2982
2983 default:
2984 ceph_abort();
2985 }
2986 }
2987
2988 void Server::handle_peer_auth_pin(MDRequestRef& mdr)
2989 {
2990 dout(10) << "handle_peer_auth_pin " << *mdr << dendl;
2991
2992 // build list of objects
2993 list<MDSCacheObject*> objects;
2994 CInode *auth_pin_freeze = NULL;
2995 bool nonblocking = mdr->peer_request->is_nonblocking();
2996 bool fail = false, wouldblock = false, readonly = false;
2997 ref_t<MMDSPeerRequest> reply;
2998
2999 if (mdcache->is_readonly()) {
3000 dout(10) << " read-only FS" << dendl;
3001 readonly = true;
3002 fail = true;
3003 }
3004
3005 if (!fail) {
3006 for (const auto &oi : mdr->peer_request->get_authpins()) {
3007 MDSCacheObject *object = mdcache->get_object(oi);
3008 if (!object) {
3009 dout(10) << " don't have " << oi << dendl;
3010 fail = true;
3011 break;
3012 }
3013
3014 objects.push_back(object);
3015 if (oi == mdr->peer_request->get_authpin_freeze())
3016 auth_pin_freeze = static_cast<CInode*>(object);
3017 }
3018 }
3019
3020 // can we auth pin them?
3021 if (!fail) {
3022 for (const auto& obj : objects) {
3023 if (!obj->is_auth()) {
3024 dout(10) << " not auth for " << *obj << dendl;
3025 fail = true;
3026 break;
3027 }
3028 if (mdr->is_auth_pinned(obj))
3029 continue;
3030 if (!mdr->can_auth_pin(obj)) {
3031 if (nonblocking) {
3032 dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
3033 fail = true;
3034 wouldblock = true;
3035 break;
3036 }
3037 // wait
3038 dout(10) << " waiting for authpinnable on " << *obj << dendl;
3039 obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3040 mdr->drop_local_auth_pins();
3041
3042 mds->locker->notify_freeze_waiter(obj);
3043 goto blocked;
3044 }
3045 }
3046 }
3047
3048 if (!fail) {
3049 /* if we previously froze an auth pin on the wrong inode, unfreeze it */
3050 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
3051 mdr->more()->rename_inode != auth_pin_freeze)
3052 mdr->unfreeze_auth_pin(true);
3053
3054 /* handle_peer_rename_prep() calls freeze_inode() to wait for all other operations
3055 * on the source inode to complete. This happens after all locks for the rename
3056 * operation are acquired. But to acquire locks, we need to auth pin the locks'
3057 * parent objects first. So there is an ABBA deadlock if someone auth pins the
3058 * source inode after locks are acquired and before Server::handle_peer_rename_prep()
3059 * is called. The solution is to freeze the inode and prevent other MDRequests
3060 * from getting new auth pins.
3061 */
3062 if (auth_pin_freeze) {
3063 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
3064 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
3065 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
3066 mds->mdlog->flush();
3067 goto blocked;
3068 }
3069 }
3070 }
3071
3072 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3073
3074 if (fail) {
3075 mdr->drop_local_auth_pins(); // just in case
3076 if (readonly)
3077 reply->mark_error_rofs();
3078 if (wouldblock)
3079 reply->mark_error_wouldblock();
3080 } else {
3081 // auth pin!
3082 for (const auto& obj : objects) {
3083 dout(10) << "auth_pinning " << *obj << dendl;
3084 mdr->auth_pin(obj);
3085 }
3086 // return list of my auth_pins (if any)
3087 for (const auto &p : mdr->object_states) {
3088 if (!p.second.auth_pinned)
3089 continue;
3090 MDSCacheObjectInfo info;
3091 p.first->set_object_info(info);
3092 reply->get_authpins().push_back(info);
3093 if (p.first == (MDSCacheObject*)auth_pin_freeze)
3094 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
3095 }
3096 }
3097
3098 mds->send_message_mds(reply, mdr->peer_to_mds);
3099
3100 // clean up this request
3101 mdr->reset_peer_request();
3102 return;
3103
3104 blocked:
3105 if (mdr->peer_request->should_notify_blocking()) {
3106 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3107 reply->mark_req_blocked();
3108 mds->send_message_mds(reply, mdr->peer_to_mds);
3109 mdr->peer_request->clear_notify_blocking();
3110 }
3111 return;
3112 }
3113
3114 void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
3115 {
3116 dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
3117 mds_rank_t from = mds_rank_t(ack->get_source().num());
3118
3119 if (ack->is_req_blocked()) {
3120 mdr->disable_lock_cache();
3121 // peer auth pin is blocked, drop locks to avoid deadlock
3122 mds->locker->drop_locks(mdr.get(), nullptr);
3123 return;
3124 }
3125
3126 // added auth pins?
3127 set<MDSCacheObject*> pinned;
3128 for (const auto &oi : ack->get_authpins()) {
3129 MDSCacheObject *object = mdcache->get_object(oi);
3130 ceph_assert(object); // we pinned it
3131 dout(10) << " remote has pinned " << *object << dendl;
3132 mdr->set_remote_auth_pinned(object, from);
3133 if (oi == ack->get_authpin_freeze())
3134 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
3135 pinned.insert(object);
3136 }
3137
3138 // removed frozen auth pin?
3139 if (mdr->more()->is_remote_frozen_authpin &&
3140 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
3141 auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
3142 ceph_assert(stat_p);
3143 if (stat_p->remote_auth_pinned == from) {
3144 mdr->more()->is_remote_frozen_authpin = false;
3145 }
3146 }
3147
3148 // removed auth pins?
3149 for (auto& p : mdr->object_states) {
3150 if (p.second.remote_auth_pinned == MDS_RANK_NONE)
3151 continue;
3152 MDSCacheObject* object = p.first;
3153 if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
3154 dout(10) << " remote has unpinned " << *object << dendl;
3155 mdr->_clear_remote_auth_pinned(p.second);
3156 }
3157 }
3158
3159 // note peer
3160 mdr->more()->peers.insert(from);
3161
3162 // clear from waiting list
3163 auto ret = mdr->more()->waiting_on_peer.erase(from);
3164 ceph_assert(ret);
3165
3166 if (ack->is_error_rofs()) {
3167 mdr->more()->peer_error = -CEPHFS_EROFS;
3168 } else if (ack->is_error_wouldblock()) {
3169 mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
3170 }
3171
3172 // go again?
3173 if (mdr->more()->waiting_on_peer.empty())
3174 mdcache->dispatch_request(mdr);
3175 else
3176 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
3177 }
3178
3179
3180 // ---------------------------------------
3181 // HELPERS
3182
3183
3184 /**
3185 * check whether we are permitted to complete a request
3186 *
3187 * Check whether we have permission to perform the operation specified
3188 * by mask on the given inode, based on the capability in the mdr's
3189 * session.
3190 */
3191 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
3192 {
3193 if (mdr->session) {
3194 int r = mdr->session->check_access(
3195 in, mask,
3196 mdr->client_request->get_caller_uid(),
3197 mdr->client_request->get_caller_gid(),
3198 &mdr->client_request->get_caller_gid_list(),
3199 mdr->client_request->head.args.setattr.uid,
3200 mdr->client_request->head.args.setattr.gid);
3201 if (r < 0) {
3202 respond_to_request(mdr, r);
3203 return false;
3204 }
3205 }
3206 return true;
3207 }
3208
3209 /**
3210  * check whether a dir fragment has reached its maximum size (bal_fragment_size_max)
3211 *
3212 */
3213 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
3214 {
3215 const auto size = dir->get_frag_size();
3216 const auto max = bal_fragment_size_max;
3217 if (size >= max) {
3218 dout(10) << "fragment " << *dir << " size exceeds " << max << " (CEPHFS_ENOSPC)" << dendl;
3219 respond_to_request(mdr, -CEPHFS_ENOSPC);
3220 return false;
3221 } else {
3222 dout(20) << "fragment " << *dir << " size " << size << " < " << max << dendl;
3223 }
3224
3225 return true;
3226 }
3227
3228 /**
3229  * check whether the number of entries in a dir has reached the configured maximum (dir_max_entries)
3230 *
3231 */
3232 bool Server::check_dir_max_entries(MDRequestRef &mdr, CDir *in)
3233 {
3234 const uint64_t size = in->inode->get_projected_inode()->dirstat.nfiles +
3235 in->inode->get_projected_inode()->dirstat.nsubdirs;
3236 if (dir_max_entries && size >= dir_max_entries) {
3237 dout(10) << "entries per dir " << *in << " size exceeds " << dir_max_entries << " (ENOSPC)" << dendl;
3238 respond_to_request(mdr, -ENOSPC);
3239 return false;
3240 }
3241 return true;
3242 }
3243
3244
3245 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3246 {
3247 string straydname;
3248 in->name_stray_dentry(straydname);
3249
3250 CDentry *straydn = mdr->straydn;
3251 if (straydn) {
3252 ceph_assert(straydn->get_name() == straydname);
3253 return straydn;
3254 }
3255 CDir *straydir = mdcache->get_stray_dir(in);
3256
3257 if (!mdr->client_request->is_replay() &&
3258 !check_fragment_space(mdr, straydir))
3259 return nullptr;
3260
3261 straydn = straydir->lookup(straydname);
3262 if (!straydn) {
3263 if (straydir->is_frozen_dir()) {
3264 dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
3265 mds->locker->drop_locks(mdr.get());
3266 mdr->drop_local_auth_pins();
3267 straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3268 return nullptr;
3269 }
3270 straydn = straydir->add_null_dentry(straydname);
3271 straydn->mark_new();
3272 } else {
3273 ceph_assert(straydn->get_projected_linkage()->is_null());
3274 }
3275
3276 straydn->state_set(CDentry::STATE_STRAY);
3277 mdr->straydn = straydn;
3278 mdr->pin(straydn);
3279
3280 return straydn;
3281 }
3282
3283 /** prepare_new_inode
3284 *
3285 * create a new inode. set c/m/atime. hit dir pop.
3286 */
3287 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3288 const file_layout_t *layout)
3289 {
3290 CInode *in = new CInode(mdcache);
3291 auto _inode = in->_get_inode();
3292
3293 // Server::prepare_force_open_sessions() can re-open session in closing
3294 // state. In that corner case, session's prealloc_inos are being freed.
3295 // To simplify the code, we disallow using/refilling session's prealloc_ino
3296 // while session is opening.
3297 bool allow_prealloc_inos = mdr->session->is_open();
3298
3299 // assign ino
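  // Two possible sources for the new ino: one preallocated to this client's
  // session (fast path), or a fresh projection from the global inotable.
  // Either way the choice is journalled via journal_allocated_inos() and made
  // durable in apply_allocated_inos().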
3300 if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(useino))) {
3301 mds->sessionmap.mark_projected(mdr->session);
3302 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3303 << " (" << mdr->session->info.prealloc_inos.size() << " left)"
3304 << dendl;
3305 } else {
3306 mdr->alloc_ino =
3307 _inode->ino = mds->inotable->project_alloc_id(useino);
3308 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3309 }
3310
3311 if (useino && useino != _inode->ino) {
3312 dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
3313 mds->clog->error() << mdr->client_request->get_source()
3314 << " specified ino " << useino
3315 << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
3316 //ceph_abort(); // just for now.
3317 }
3318
3319 if (allow_prealloc_inos &&
3320 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3321 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3322 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3323 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3324 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3325 mds->sessionmap.mark_projected(mdr->session);
3326 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3327 }
3328
3329 _inode->version = 1;
3330 _inode->xattr_version = 1;
3331 _inode->nlink = 1; // FIXME
3332
3333 _inode->mode = mode;
3334
3335 // FIPS zeroization audit 20191117: this memset is not security related.
3336 memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
3337 if (_inode->is_dir()) {
3338 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3339 } else if (layout) {
3340 _inode->layout = *layout;
3341 } else {
3342 _inode->layout = mdcache->default_file_layout;
3343 }
3344
3345 _inode->truncate_size = -1ull; // not truncated, yet!
3346 _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3347
3348 CInode *diri = dir->get_inode();
3349
3350 dout(10) << oct << " dir mode 0" << diri->get_inode()->mode << " new mode 0" << mode << dec << dendl;
3351
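  // setgid directory: new children inherit the directory's gid, and new
  // subdirectories inherit the setgid bit as well (BSD-style group semantics).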
3352 if (diri->get_inode()->mode & S_ISGID) {
3353 dout(10) << " dir is setgid" << dendl;
3354 _inode->gid = diri->get_inode()->gid;
3355 if (S_ISDIR(mode)) {
3356 dout(10) << " new dir also setgid" << dendl;
3357 _inode->mode |= S_ISGID;
3358 }
3359 } else
3360 _inode->gid = mdr->client_request->get_caller_gid();
3361
3362 _inode->uid = mdr->client_request->get_caller_uid();
3363
3364 _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
3365 mdr->get_op_stamp();
3366
3367 _inode->change_attr = 0;
3368
3369 const cref_t<MClientRequest> &req = mdr->client_request;
3370 if (req->get_data().length()) {
3371 auto p = req->get_data().cbegin();
3372
3373 // xattrs on new inode?
3374 auto _xattrs = CInode::allocate_xattr_map();
3375 decode_noshare(*_xattrs, p);
3376 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
3377 if (_xattrs->count("encryption.ctx")) {
3378 _inode->fscrypt = true;
3379 }
3380 in->reset_xattrs(std::move(_xattrs));
3381 }
3382
3383 if (!mds->mdsmap->get_inline_data_enabled() ||
3384 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3385 _inode->inline_data.version = CEPH_INLINE_NONE;
3386
3387 mdcache->add_inode(in); // add
3388 dout(10) << "prepare_new_inode " << *in << dendl;
3389 return in;
3390 }
3391
3392 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3393 {
3394 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3395 << " inotablev " << mds->inotable->get_projected_version()
3396 << dendl;
3397 blob->set_ino_alloc(mdr->alloc_ino,
3398 mdr->used_prealloc_ino,
3399 mdr->prealloc_inos,
3400 mdr->client_request->get_source(),
3401 mds->sessionmap.get_projected(),
3402 mds->inotable->get_projected_version());
3403 }
3404
3405 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3406 {
3407 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3408 << " / " << mdr->prealloc_inos
3409 << " / " << mdr->used_prealloc_ino << dendl;
3410
3411 if (mdr->alloc_ino) {
3412 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3413 }
3414 if (mdr->prealloc_inos.size()) {
3415 ceph_assert(session);
3416 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3417 session->free_prealloc_inos.insert(mdr->prealloc_inos);
3418 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3419 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
3420 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3421 }
3422 if (mdr->used_prealloc_ino) {
3423 ceph_assert(session);
3424 session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
3425 mds->sessionmap.mark_dirty(session);
3426 }
3427 }
3428
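// Retry context for path traversals that hit CEPHFS_ESTALE: find_ino_peers()
// asks the other ranks about the ino, after which we either re-dispatch the
// request or give up and return the error to the client.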
3429 class C_MDS_TryFindInode : public ServerContext {
3430 MDRequestRef mdr;
3431 public:
3432 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3433 void finish(int r) override {
3434 if (r == -CEPHFS_ESTALE) // :( find_ino_peers failed
3435 server->respond_to_request(mdr, r);
3436 else
3437 server->dispatch_client_request(mdr);
3438 }
3439 };
3440
3441 /* If this returns null, the request has been handled
3442 * as appropriate: forwarded on, or the client's been replied to */
3443 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
3444 bool want_auth,
3445 bool no_want_auth)
3446 {
3447 const filepath& refpath = mdr->get_filepath();
3448 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3449
3450 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3451 return mdr->in[0];
3452
3453 // traverse
3454 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3455 int flags = 0;
3456 if (refpath.is_last_snap()) {
3457 if (!no_want_auth)
3458 want_auth = true;
3459 } else {
3460 if (!no_want_auth && forward_all_requests_to_auth)
3461 want_auth = true;
3462 flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
3463 }
3464 if (want_auth)
3465 flags |= MDS_TRAVERSE_WANT_AUTH;
3466 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
3467 if (r > 0)
3468 return nullptr; // delayed
3469 if (r < 0) { // error
3470 if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
3471 if (mdr->client_request &&
3472 mdr->client_request->get_dentry_wanted())
3473 mdr->tracedn = mdr->dn[0].back();
3474 respond_to_request(mdr, r);
3475 } else if (r == -CEPHFS_ESTALE) {
3476 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3477 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
3478 mdcache->find_ino_peers(refpath.get_ino(), c);
3479 } else {
3480 dout(10) << "FAIL on error " << r << dendl;
3481 respond_to_request(mdr, r);
3482 }
3483 return nullptr;
3484 }
3485 CInode *ref = mdr->in[0];
3486 dout(10) << "ref is " << *ref << dendl;
3487
3488 if (want_auth) {
3489 // auth_pin?
3490 // do NOT proceed if freezing, as cap release may defer in that case, and
3491 // we could deadlock when we try to lock @ref.
3492 // if we're already auth_pinned, continue; the release has already been processed.
3493 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3494 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3495 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3496 ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
3497 if (mdr->is_any_remote_auth_pin())
3498 mds->locker->notify_freeze_waiter(ref);
3499 return nullptr;
3500 }
3501 mdr->auth_pin(ref);
3502 }
3503
3504 // set and pin ref
3505 mdr->pin(ref);
3506 return ref;
3507 }
3508
3509
3510 /** rdlock_path_xlock_dentry
3511 * traverse path to the directory that could/would contain dentry.
3512 * make sure i am auth for that dentry, forward as necessary.
3513 * create null dentry in place (or use existing if okexist).
3514 * get rdlocks on traversed dentries, xlock on new dentry.
3515 */
3516 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
3517 bool create, bool okexist, bool want_layout)
3518 {
3519 const filepath& refpath = mdr->get_filepath();
3520 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3521
3522 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3523 return mdr->dn[0].back();
3524
3525 // figure parent dir vs dname
3526 if (refpath.depth() == 0) {
3527 dout(7) << "invalid path (zero length)" << dendl;
3528 respond_to_request(mdr, -CEPHFS_EINVAL);
3529 return nullptr;
3530 }
3531
3532 if (refpath.is_last_snap()) {
3533 respond_to_request(mdr, -CEPHFS_EROFS);
3534 return nullptr;
3535 }
3536
3537 if (refpath.is_last_dot_or_dotdot()) {
3538 dout(7) << "invalid path (last dot or dot_dot)" << dendl;
3539 if (create)
3540 respond_to_request(mdr, -CEPHFS_EEXIST);
3541 else
3542 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
3543 return nullptr;
3544 }
3545
3546 // traverse to parent dir
3547 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3548 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
3549 MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
3550 MDS_TRAVERSE_WANT_AUTH;
3551 if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
3552 flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
3553 if (create)
3554 flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
3555 if (want_layout)
3556 flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
3557 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3558 if (r > 0)
3559 return nullptr; // delayed
3560 if (r < 0) {
3561 if (r == -CEPHFS_ESTALE) {
3562 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3563 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3564 return nullptr;
3565 }
3566 respond_to_request(mdr, r);
3567 return nullptr;
3568 }
3569
3570 CDentry *dn = mdr->dn[0].back();
3571 CDir *dir = dn->get_dir();
3572 CInode *diri = dir->get_inode();
3573
3574 if (!mdr->reqid.name.is_mds()) {
3575 if (diri->is_system() && !diri->is_root()) {
3576 respond_to_request(mdr, -CEPHFS_EROFS);
3577 return nullptr;
3578 }
3579 }
3580
3581 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3582 respond_to_request(mdr, -CEPHFS_ENOENT);
3583 return nullptr;
3584 }
3585
3586 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3587 if (dnl->is_null()) {
3588 if (!create && okexist) {
3589 respond_to_request(mdr, -CEPHFS_ENOENT);
3590 return nullptr;
3591 }
3592
3593 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3594 dn->first = std::max(dn->first, next_snap);
3595 } else {
3596 if (!okexist) {
3597 respond_to_request(mdr, -CEPHFS_EEXIST);
3598 return nullptr;
3599 }
3600 mdr->in[0] = dnl->get_inode();
3601 }
3602
3603 return dn;
3604 }
3605
3606 /** rdlock_two_paths_xlock_destdn
3607 * traverse two paths and lock the two paths in proper order.
3608 * The order of taking locks is:
3609 * 1. Lock directory inodes or dentries according to which trees they
3610 * are under. Lock objects under fs root before objects under mdsdir.
3611 * 2. Lock directory inodes or dentries according to their depth, in
3612 * ascending order.
3613 * 3. Lock directory inodes or dentries according to inode numbers or
3614 * dentries' parent inode numbers, in ascending order.
3615 * 4. Lock dentries in the same directory in order of their keys.
3616 * 5. Lock non-directory inodes according to inode numbers, in ascending
3617 * order.
3618 */
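// For example: when both parent directories sit under the fs root at the same
// depth, rule 3 wrlocks the one with the smaller inode number first (the
// lock_destdir_first computation below); and when source and destination
// dentries share a directory, rule 4 xlocks the dentry with the smaller key
// first.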
3619 std::pair<CDentry*, CDentry*>
3620 Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
3621 {
3622
3623 const filepath& refpath = mdr->get_filepath();
3624 const filepath& refpath2 = mdr->get_filepath2();
3625
3626 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
3627
3628 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3629 return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
3630
3631 if (refpath.depth() != 1 || refpath2.depth() != 1) {
3632 respond_to_request(mdr, -CEPHFS_EINVAL);
3633 return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
3634 }
3635
3636 if (refpath.is_last_snap() || refpath2.is_last_snap()) {
3637 respond_to_request(mdr, -CEPHFS_EROFS);
3638 return std::make_pair(nullptr, nullptr);
3639 }
3640
3641 // traverse to parent dir
3642 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3643 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
3644 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3645 if (r != 0) {
3646 if (r == -CEPHFS_ESTALE) {
3647 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
3648 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3649 } else if (r < 0) {
3650 respond_to_request(mdr, r);
3651 }
3652 return std::make_pair(nullptr, nullptr);
3653 }
3654
3655 flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
3656 r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
3657 if (r != 0) {
3658 if (r == -CEPHFS_ESTALE) {
3659 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
3660 mdcache->find_ino_peers(refpath2.get_ino(), new C_MDS_TryFindInode(this, mdr));
3661 } else if (r < 0) {
3662 respond_to_request(mdr, r);
3663 }
3664 return std::make_pair(nullptr, nullptr);
3665 }
3666
3667 CDentry *srcdn = mdr->dn[1].back();
3668 CDir *srcdir = srcdn->get_dir();
3669 CDentry *destdn = mdr->dn[0].back();
3670 CDir *destdir = destdn->get_dir();
3671
3672 if (!mdr->reqid.name.is_mds()) {
3673 if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
3674 (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
3675 respond_to_request(mdr, -CEPHFS_EROFS);
3676 return std::make_pair(nullptr, nullptr);
3677 }
3678 }
3679
3680 if (!destdir->get_inode()->is_base() &&
3681 destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
3682 respond_to_request(mdr, -CEPHFS_ENOENT);
3683 return std::make_pair(nullptr, nullptr);
3684 }
3685
3686 MutationImpl::LockOpVec lov;
3687 if (srcdir->get_inode() == destdir->get_inode()) {
3688 lov.add_wrlock(&destdir->inode->filelock);
3689 lov.add_wrlock(&destdir->inode->nestlock);
3690 if (xlock_srcdn && srcdir != destdir) {
3691 mds_rank_t srcdir_auth = srcdir->authority().first;
3692 if (srcdir_auth != mds->get_nodeid()) {
3693 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3694 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3695 }
3696 }
3697
3698 if (srcdn->get_name() > destdn->get_name())
3699 lov.add_xlock(&destdn->lock);
3700
3701 if (xlock_srcdn)
3702 lov.add_xlock(&srcdn->lock);
3703 else
3704 lov.add_rdlock(&srcdn->lock);
3705
3706 if (srcdn->get_name() < destdn->get_name())
3707 lov.add_xlock(&destdn->lock);
3708 } else {
3709 int cmp = mdr->compare_paths();
3710 bool lock_destdir_first =
3711 (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
3712
3713 if (lock_destdir_first) {
3714 lov.add_wrlock(&destdir->inode->filelock);
3715 lov.add_wrlock(&destdir->inode->nestlock);
3716 lov.add_xlock(&destdn->lock);
3717 }
3718
3719 if (xlock_srcdn) {
3720 mds_rank_t srcdir_auth = srcdir->authority().first;
3721 if (srcdir_auth == mds->get_nodeid()) {
3722 lov.add_wrlock(&srcdir->inode->filelock);
3723 lov.add_wrlock(&srcdir->inode->nestlock);
3724 } else {
3725 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3726 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3727 }
3728 lov.add_xlock(&srcdn->lock);
3729 } else {
3730 lov.add_rdlock(&srcdn->lock);
3731 }
3732
3733 if (!lock_destdir_first) {
3734 lov.add_wrlock(&destdir->inode->filelock);
3735 lov.add_wrlock(&destdir->inode->nestlock);
3736 lov.add_xlock(&destdn->lock);
3737 }
3738 }
3739
3740 CInode *auth_pin_freeze = nullptr;
3741 // XXX any better way to do this?
3742 if (xlock_srcdn && !srcdn->is_auth()) {
3743 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
3744 auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
3745 }
3746 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
3747 return std::make_pair(nullptr, nullptr);
3748
3749 if (srcdn->get_projected_linkage()->is_null()) {
3750 respond_to_request(mdr, -CEPHFS_ENOENT);
3751 return std::make_pair(nullptr, nullptr);
3752 }
3753
3754 if (destdn->get_projected_linkage()->is_null()) {
3755 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3756 destdn->first = std::max(destdn->first, next_snap);
3757 }
3758
3759 mdr->locking_state |= MutationImpl::PATH_LOCKED;
3760
3761 return std::make_pair(destdn, srcdn);
3762 }
3763
3764 /**
3765 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3766 *
3767 * @param diri base inode
3768 * @param fg the exact frag we want
3769 * @param mdr request
3770 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3771 */
3772 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3773 {
3774 CDir *dir = diri->get_dirfrag(fg);
3775
3776 if (dir) {
3777 // am i auth for the dirfrag?
3778 if (!dir->is_auth()) {
3779 mds_rank_t auth = dir->authority().first;
3780 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3781 << ", fw to mds." << auth << dendl;
3782 mdcache->request_forward(mdr, auth);
3783 return nullptr;
3784 }
3785 } else {
3786 // not open and inode not mine?
3787 if (!diri->is_auth()) {
3788 mds_rank_t inauth = diri->authority().first;
3789 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3790 mdcache->request_forward(mdr, inauth);
3791 return nullptr;
3792 }
3793
3794 // not open and inode frozen?
3795 if (diri->is_frozen()) {
3796 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3797 ceph_assert(diri->get_parent_dir());
3798 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3799 return nullptr;
3800 }
3801
3802 // invent?
3803 dir = diri->get_or_open_dirfrag(mdcache, fg);
3804 }
3805
3806 return dir;
3807 }
3808
3809
3810 // ===============================================================================
3811 // STAT
3812
3813 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3814 {
3815 const cref_t<MClientRequest> &req = mdr->client_request;
3816
3817 if (req->get_filepath().depth() == 0 && is_lookup) {
3818 // refpath can't be empty for lookup but it can for
3819 // getattr (we do getattr with empty refpath for mount of '/')
3820 respond_to_request(mdr, -CEPHFS_EINVAL);
3821 return;
3822 }
3823
3824 bool want_auth = false;
3825 int mask = req->head.args.getattr.mask;
3826 if (mask & CEPH_STAT_RSTAT)
3827 want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
3828
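  // Batch identical getattr/lookup requests: the first request with a given
  // mask becomes the batch head (a Batch_Getattr_Lookup) on the dentry or
  // inode, and later requests with the same mask are queued on it and
  // answered when the head replies.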
3829 if (!mdr->is_batch_head() && mdr->can_batch()) {
3830 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
3831 int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
3832 (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
3833 &mdr->dn[0], &mdr->in[0]);
3834 if (r > 0)
3835 return; // delayed
3836
3837 if (r < 0) {
3838 // fall-thru. let rdlock_path_pin_ref() check again.
3839 } else if (is_lookup) {
3840 CDentry* dn = mdr->dn[0].back();
3841 mdr->pin(dn);
3842 auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3843 if (em.second) {
3844 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3845 } else {
3846 dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3847 em.first->second->add_request(mdr);
3848 return;
3849 }
3850 } else {
3851 CInode *in = mdr->in[0];
3852 mdr->pin(in);
3853 auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3854 if (em.second) {
3855 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3856 } else {
3857 dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3858 em.first->second->add_request(mdr);
3859 return;
3860 }
3861 }
3862 }
3863
3864 CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
3865 if (!ref)
3866 return;
3867
3868 mdr->getattr_caps = mask;
3869
3870 /*
3871 * if client currently holds the EXCL cap on a field, do not rdlock
3872 * it; client's stat() will result in valid info if _either_ EXCL
3873 * cap is held or MDS rdlocks and reads the value here.
3874 *
3875 * handling this case here is easier than weakening rdlock
3876 * semantics... that would cause problems elsewhere.
3877 */
3878 client_t client = mdr->get_client();
3879 int issued = 0;
3880 Capability *cap = ref->get_client_cap(client);
3881 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3882 mdr->snapid <= cap->client_follows))
3883 issued = cap->issued();
3884
3885 // FIXME
3886 MutationImpl::LockOpVec lov;
3887 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3888 lov.add_rdlock(&ref->linklock);
3889 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3890 lov.add_rdlock(&ref->authlock);
3891 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3892 lov.add_rdlock(&ref->xattrlock);
3893 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3894 // Don't wait on unstable filelock if client is allowed to read file size.
3895 // This can reduce the response time of getattr in the case that multiple
3896 // clients do stat(2) and there are writers.
3897 // The downside of this optimization is that mds may not issue Fs caps along
3898 // with getattr reply. Client may need to send more getattr requests.
3899 if (mdr->is_rdlocked(&ref->filelock)) {
3900 lov.add_rdlock(&ref->filelock);
3901 } else if (ref->filelock.is_stable() ||
3902 ref->filelock.get_num_wrlocks() > 0 ||
3903 !ref->filelock.can_read(mdr->get_client())) {
3904 lov.add_rdlock(&ref->filelock);
3905 mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
3906 }
3907 }
3908
3909 if (!mds->locker->acquire_locks(mdr, lov))
3910 return;
3911
3912 if (!check_access(mdr, ref, MAY_READ))
3913 return;
3914
3915 utime_t now = ceph_clock_now();
3916 mdr->set_mds_stamp(now);
3917
3918 // note which caps are requested, so we return at least a snapshot
3919 // value for them. (currently this matters for xattrs and inline data)
3920 mdr->getattr_caps = mask;
3921
3922 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
3923
3924 // reply
3925 dout(10) << "reply to stat on " << *req << dendl;
3926 mdr->tracei = ref;
3927 if (is_lookup)
3928 mdr->tracedn = mdr->dn[0].back();
3929 respond_to_request(mdr, 0);
3930 }
3931
3932 struct C_MDS_LookupIno2 : public ServerContext {
3933 MDRequestRef mdr;
3934 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3935 void finish(int r) override {
3936 server->_lookup_ino_2(mdr, r);
3937 }
3938 };
3939
3940 /*
3941 * filepath: ino
3942 */
3943 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3944 bool want_parent, bool want_dentry)
3945 {
3946 const cref_t<MClientRequest> &req = mdr->client_request;
3947
3948 if ((uint64_t)req->head.args.lookupino.snapid > 0)
3949 return _lookup_snap_ino(mdr);
3950
3951 inodeno_t ino = req->get_filepath().get_ino();
3952 auto _ino = ino.val;
3953
3954 /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
3955 * I do not have an explanation for how that happened organically but this
3956 * check will ensure that the client can no longer do that.
3957 *
3958 * [1] https://tracker.ceph.com/issues/49922
3959 */
3960 if (MDS_IS_PRIVATE_INO(_ino)) {
3961 respond_to_request(mdr, -CEPHFS_ESTALE);
3962 return;
3963 }
3964
3965 CInode *in = mdcache->get_inode(ino);
3966 if (in && in->state_test(CInode::STATE_PURGING)) {
3967 respond_to_request(mdr, -CEPHFS_ESTALE);
3968 return;
3969 }
3970 if (!in) {
3971 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3972 return;
3973 }
3974
3975 // check for nothing (not read or write); this still applies the
3976 // path check.
3977 if (!check_access(mdr, in, 0))
3978 return;
3979
3980 CDentry *dn = in->get_projected_parent_dn();
3981 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3982
3983 MutationImpl::LockOpVec lov;
3984 if (dn && (want_parent || want_dentry)) {
3985 mdr->pin(dn);
3986 lov.add_rdlock(&dn->lock);
3987 }
3988
3989 unsigned mask = req->head.args.lookupino.mask;
3990 if (mask) {
3991 Capability *cap = in->get_client_cap(mdr->get_client());
3992 int issued = 0;
3993 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3994 issued = cap->issued();
3995 // FIXME
3996 // permission bits, ACL/security xattrs
3997 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3998 lov.add_rdlock(&in->authlock);
3999 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4000 lov.add_rdlock(&in->xattrlock);
4001
4002 mdr->getattr_caps = mask;
4003 }
4004
4005 if (!lov.empty()) {
4006 if (!mds->locker->acquire_locks(mdr, lov))
4007 return;
4008
4009 if (diri != NULL) {
4010 // need read access to directory inode
4011 if (!check_access(mdr, diri, MAY_READ))
4012 return;
4013 }
4014 }
4015
4016 if (want_parent) {
4017 if (in->is_base()) {
4018 respond_to_request(mdr, -CEPHFS_EINVAL);
4019 return;
4020 }
4021 if (!diri || diri->is_stray()) {
4022 respond_to_request(mdr, -CEPHFS_ESTALE);
4023 return;
4024 }
4025 dout(10) << "reply to lookup_parent " << *in << dendl;
4026 mdr->tracei = diri;
4027 respond_to_request(mdr, 0);
4028 } else {
4029 if (want_dentry) {
4030 inodeno_t dirino = req->get_filepath2().get_ino();
4031 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
4032 respond_to_request(mdr, -CEPHFS_ENOENT);
4033 return;
4034 }
4035 dout(10) << "reply to lookup_name " << *in << dendl;
4036 } else
4037 dout(10) << "reply to lookup_ino " << *in << dendl;
4038
4039 mdr->tracei = in;
4040 if (want_dentry)
4041 mdr->tracedn = dn;
4042 respond_to_request(mdr, 0);
4043 }
4044 }
4045
4046 void Server::_lookup_snap_ino(MDRequestRef& mdr)
4047 {
4048 const cref_t<MClientRequest> &req = mdr->client_request;
4049
4050 vinodeno_t vino;
4051 vino.ino = req->get_filepath().get_ino();
4052 vino.snapid = (__u64)req->head.args.lookupino.snapid;
4053 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
4054 __u32 hash = req->head.args.lookupino.hash;
4055
4056 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
4057
4058 CInode *in = mdcache->lookup_snap_inode(vino);
4059 if (!in) {
4060 in = mdcache->get_inode(vino.ino);
4061 if (in) {
4062 if (in->state_test(CInode::STATE_PURGING) ||
4063 !in->has_snap_data(vino.snapid)) {
4064 if (in->is_dir() || !parent_ino) {
4065 respond_to_request(mdr, -CEPHFS_ESTALE);
4066 return;
4067 }
4068 in = NULL;
4069 }
4070 }
4071 }
4072
4073 if (in) {
4074 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
4075 mdr->snapid = vino.snapid;
4076 mdr->tracei = in;
4077 respond_to_request(mdr, 0);
4078 return;
4079 }
4080
4081 CInode *diri = NULL;
4082 if (parent_ino) {
4083 diri = mdcache->get_inode(parent_ino);
4084 if (!diri) {
4085 mdcache->open_ino(parent_ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
4086 return;
4087 }
4088
4089 if (!diri->is_dir()) {
4090 respond_to_request(mdr, -CEPHFS_EINVAL);
4091 return;
4092 }
4093
4094 MutationImpl::LockOpVec lov;
4095 lov.add_rdlock(&diri->dirfragtreelock);
4096 if (!mds->locker->acquire_locks(mdr, lov))
4097 return;
4098
4099 frag_t frag = diri->dirfragtree[hash];
4100 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
4101 if (!dir)
4102 return;
4103
4104 if (!dir->is_complete()) {
4105 if (dir->is_frozen()) {
4106 mds->locker->drop_locks(mdr.get());
4107 mdr->drop_local_auth_pins();
4108 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4109 return;
4110 }
4111 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4112 return;
4113 }
4114
4115 respond_to_request(mdr, -CEPHFS_ESTALE);
4116 } else {
4117 mdcache->open_ino(vino.ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
4118 }
4119 }
4120
4121 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4122 {
4123 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4124 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4125
4126 // `r` is a rank if >=0, else an error code
4127 if (r >= 0) {
4128 mds_rank_t dest_rank(r);
4129 if (dest_rank == mds->get_nodeid())
4130 dispatch_client_request(mdr);
4131 else
4132 mdcache->request_forward(mdr, dest_rank);
4133 return;
4134 }
4135
4136 // give up
4137 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
4138 r = -CEPHFS_ESTALE;
4139 respond_to_request(mdr, r);
4140 }
4141
4142
4143 /* This function takes responsibility for the passed mdr*/
4144 void Server::handle_client_open(MDRequestRef& mdr)
4145 {
4146 const cref_t<MClientRequest> &req = mdr->client_request;
4147 dout(7) << "open on " << req->get_filepath() << dendl;
4148
4149 int flags = req->head.args.open.flags;
4150 int cmode = ceph_flags_to_mode(flags);
4151 if (cmode < 0) {
4152 respond_to_request(mdr, -CEPHFS_EINVAL);
4153 return;
4154 }
4155
4156 bool need_auth = !file_mode_is_readonly(cmode) ||
4157 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
4158
4159 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4160 dout(7) << "read-only FS" << dendl;
4161 respond_to_request(mdr, -CEPHFS_EROFS);
4162 return;
4163 }
4164
4165 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
4166 if (!cur)
4167 return;
4168
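  // The inode is frozen or exporting caps, so we cannot issue caps right now;
  // redo the traversal with want_auth so we wait for the unfreeze (or get
  // forwarded) before continuing.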
4169 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
4170 ceph_assert(!need_auth);
4171 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4172 CInode *cur = rdlock_path_pin_ref(mdr, true);
4173 if (!cur)
4174 return;
4175 }
4176
4177 if (!cur->is_file()) {
4178 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4179 cmode = CEPH_FILE_MODE_PIN;
4180 // if the inode is a symlink and the client wants to follow it, ignore the O_TRUNC flag.
4181 if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
4182 flags &= ~CEPH_O_TRUNC;
4183 }
4184
4185 dout(10) << "open flags = " << flags
4186 << ", filemode = " << cmode
4187 << ", need_auth = " << need_auth
4188 << dendl;
4189
4190 // regular file?
4191 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4192 dout(7) << "not a file or dir " << *cur << dendl;
4193 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
4194 return;
4195 }*/
4196 if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
4197 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
4198 respond_to_request(mdr, -CEPHFS_EINVAL);
4199 return;
4200 }
4201
4202 if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
4203 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
4204 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4205 respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
4206 return;
4207 }
4208
4209 if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
4210 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4211 dout(7) << "old client cannot open inline data file " << *cur << dendl;
4212 respond_to_request(mdr, -CEPHFS_EPERM);
4213 return;
4214 }
4215
4216 // snapped data is read only
4217 if (mdr->snapid != CEPH_NOSNAP &&
4218 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4219 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
4220 respond_to_request(mdr, -CEPHFS_EROFS);
4221 return;
4222 }
4223
4224 MutationImpl::LockOpVec lov;
4225
4226 unsigned mask = req->head.args.open.mask;
4227 if (mask) {
4228 Capability *cap = cur->get_client_cap(mdr->get_client());
4229 int issued = 0;
4230 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4231 issued = cap->issued();
4232 // permission bits, ACL/security xattrs
4233 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4234 lov.add_rdlock(&cur->authlock);
4235 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4236 lov.add_rdlock(&cur->xattrlock);
4237
4238 mdr->getattr_caps = mask;
4239 }
4240
4241 // O_TRUNC
4242 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
4243 ceph_assert(cur->is_auth());
4244
4245 lov.add_xlock(&cur->filelock);
4246 if (!mds->locker->acquire_locks(mdr, lov))
4247 return;
4248
4249 if (!check_access(mdr, cur, MAY_WRITE))
4250 return;
4251
4252 // wait for pending truncate?
4253 const auto& pi = cur->get_projected_inode();
4254 if (pi->is_truncating()) {
4255 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4256 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4257 mds->locker->drop_locks(mdr.get());
4258 mdr->drop_local_auth_pins();
4259 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4260 return;
4261 }
4262
4263 do_open_truncate(mdr, cmode);
4264 return;
4265 }
4266
4267 // sync filelock if snapped.
4268 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4269 // and that data itself is flushed so that we can read the snapped data off disk.
4270 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
4271 lov.add_rdlock(&cur->filelock);
4272 }
4273
4274 if (!mds->locker->acquire_locks(mdr, lov))
4275 return;
4276
4277 mask = MAY_READ;
4278 if (cmode & CEPH_FILE_MODE_WR)
4279 mask |= MAY_WRITE;
4280 if (!check_access(mdr, cur, mask))
4281 return;
4282
4283 utime_t now = ceph_clock_now();
4284 mdr->set_mds_stamp(now);
4285
4286 if (cur->is_file() || cur->is_dir()) {
4287 if (mdr->snapid == CEPH_NOSNAP) {
4288 // register new cap
4289 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
4290 if (cap)
4291 dout(12) << "open issued caps " << ccap_string(cap->pending())
4292 << " for " << req->get_source()
4293 << " on " << *cur << dendl;
4294 } else {
4295 int caps = ceph_caps_for_mode(cmode);
4296 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4297 << " for " << req->get_source()
4298 << " snapid " << mdr->snapid
4299 << " on " << *cur << dendl;
4300 mdr->snap_caps = caps;
4301 }
4302 }
4303
4304 // increase max_size?
4305 if (cmode & CEPH_FILE_MODE_WR)
4306 mds->locker->check_inode_max_size(cur);
4307
4308 // make sure this inode gets into the journal
4309 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
4310 mdcache->open_file_table.should_log_open(cur)) {
4311 EOpen *le = new EOpen(mds->mdlog);
4312 mdlog->start_entry(le);
4313 le->add_clean_inode(cur);
4314 mdlog->submit_entry(le);
4315 }
4316
4317 // hit pop
4318 if (cmode & CEPH_FILE_MODE_WR)
4319 mds->balancer->hit_inode(cur, META_POP_IWR);
4320 else
4321 mds->balancer->hit_inode(cur, META_POP_IRD,
4322 mdr->client_request->get_source().num());
4323
4324 CDentry *dn = 0;
4325 if (req->get_dentry_wanted()) {
4326 ceph_assert(mdr->dn[0].size());
4327 dn = mdr->dn[0].back();
4328 }
4329
4330 mdr->tracei = cur;
4331 mdr->tracedn = dn;
4332 respond_to_request(mdr, 0);
4333 }
4334
4335 class C_MDS_openc_finish : public ServerLogContext {
4336 CDentry *dn;
4337 CInode *newi;
4338 public:
4339 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4340 ServerLogContext(s, r), dn(d), newi(ni) {}
4341 void finish(int r) override {
4342 ceph_assert(r == 0);
4343
4344 dn->pop_projected_linkage();
4345
4346 // dirty inode, dn, dir
4347 newi->mark_dirty(mdr->ls);
4348 newi->mark_dirty_parent(mdr->ls, true);
4349
4350 mdr->apply();
4351
4352 get_mds()->locker->share_inode_max_size(newi);
4353
4354 MDRequestRef null_ref;
4355 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4356
4357 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4358
4359 server->respond_to_request(mdr, 0);
4360
4361 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4362 }
4363 };
4364
4365 /* This function takes responsibility for the passed mdr*/
4366 void Server::handle_client_openc(MDRequestRef& mdr)
4367 {
4368 const cref_t<MClientRequest> &req = mdr->client_request;
4369 client_t client = mdr->get_client();
4370
4371 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4372
4373 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4374 if (cmode < 0) {
4375 respond_to_request(mdr, -CEPHFS_EINVAL);
4376 return;
4377 }
4378
4379 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4380 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
4381 if (!dn)
4382 return;
4383
4384 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4385 if (!excl && !dnl->is_null()) {
4386 // it existed.
4387 mds->locker->xlock_downgrade(&dn->lock, mdr.get());
4388
4389 MutationImpl::LockOpVec lov;
4390 lov.add_rdlock(&dnl->get_inode()->snaplock);
4391 if (!mds->locker->acquire_locks(mdr, lov))
4392 return;
4393
4394 handle_client_open(mdr);
4395 return;
4396 }
4397
4398 ceph_assert(dnl->is_null());
4399
4400 if (req->get_alternate_name().size() > alternate_name_max) {
4401 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
4402 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
4403 return;
4404 }
4405 dn->set_alternate_name(req->get_alternate_name());
4406
4407 // set layout
4408 file_layout_t layout;
4409 if (mdr->dir_layout != file_layout_t())
4410 layout = mdr->dir_layout;
4411 else
4412 layout = mdcache->default_file_layout;
4413
4414 // What kind of access the client needs for this operation (checked below via check_access())
4415 uint64_t access = MAY_WRITE;
4416
4417 const auto default_layout = layout;
4418
4419 // fill in any special params from client
4420 if (req->head.args.open.stripe_unit)
4421 layout.stripe_unit = req->head.args.open.stripe_unit;
4422 if (req->head.args.open.stripe_count)
4423 layout.stripe_count = req->head.args.open.stripe_count;
4424 if (req->head.args.open.object_size)
4425 layout.object_size = req->head.args.open.object_size;
4426 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4427 (__s32)req->head.args.open.pool >= 0) {
4428 layout.pool_id = req->head.args.open.pool;
4429
4430 // make sure we have as new a map as the client
4431 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4432 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4433 return;
4434 }
4435 }
4436
4437 // If client doesn't have capability to modify layout pools, then
4438 // only permit this request if the requested pool matches what the
4439 // file would have inherited anyway from its parent.
4440 if (default_layout != layout) {
4441 access |= MAY_SET_VXATTR;
4442 }
4443
4444 if (!layout.is_valid()) {
4445 dout(10) << " invalid initial file layout" << dendl;
4446 respond_to_request(mdr, -CEPHFS_EINVAL);
4447 return;
4448 }
4449 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4450 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4451 respond_to_request(mdr, -CEPHFS_EINVAL);
4452 return;
4453 }
4454
4455 // created null dn.
4456 CDir *dir = dn->get_dir();
4457 CInode *diri = dir->get_inode();
4458 if (!check_access(mdr, diri, access))
4459 return;
4460 if (!check_fragment_space(mdr, dir))
4461 return;
4462 if (!check_dir_max_entries(mdr, dir))
4463 return;
4464
4465 if (mdr->dn[0].size() == 1)
4466 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
4467
4468 // create inode.
4469 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4470 req->head.args.open.mode | S_IFREG, &layout);
4471 ceph_assert(newi);
4472
4473 // it's a file.
4474 dn->push_projected_linkage(newi);
4475
4476 auto _inode = newi->_get_inode();
4477 _inode->version = dn->pre_dirty();
4478 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4479 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
4480 _inode->update_backtrace();
4481 _inode->rstat.rfiles = 1;
4482 _inode->accounted_rstat = _inode->rstat;
4483
4484 SnapRealm *realm = diri->find_snaprealm();
4485 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4486 ceph_assert(follows >= realm->get_newest_seq());
4487
4488 ceph_assert(dn->first == follows+1);
4489 newi->first = dn->first;
4490
4491 // do the open
4492 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
4493 newi->authlock.set_state(LOCK_EXCL);
4494 newi->xattrlock.set_state(LOCK_EXCL);
4495
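  // For a writable cap, seed the client's byte range (its initial max_size) at
  // one stripe unit; it is grown later as the client writes (see
  // share_inode_max_size() in the finish callback and check_inode_max_size()).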
4496 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4497 _inode->client_ranges[client].range.first = 0;
4498 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
4499 _inode->client_ranges[client].follows = follows;
4500 newi->mark_clientwriteable();
4501 cap->mark_clientwriteable();
4502 }
4503
4504 // prepare finisher
4505 mdr->ls = mdlog->get_current_segment();
4506 EUpdate *le = new EUpdate(mdlog, "openc");
4507 mdlog->start_entry(le);
4508 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4509 journal_allocated_inos(mdr, &le->metablob);
4510 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4511 le->metablob.add_primary_dentry(dn, newi, true, true, true);
4512
4513 // make sure this inode gets into the journal
4514 le->metablob.add_opened_ino(newi->ino());
4515
4516 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
4517
4518 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4519 openc_response_t ocresp;
4520
4521 dout(10) << "adding created_ino and delegated_inos" << dendl;
4522 ocresp.created_ino = _inode->ino;
4523
4524 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4525 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4526 unsigned frac = 100 / delegate_inos_pct;
4527 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4528 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4529 }
4530
4531 encode(ocresp, mdr->reply_extra_bl);
4532 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4533 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4534 // add the created ino onto the reply to indicate the inode was created, if the client supports the reply-create-inode feature
4535 encode(newi->ino(), mdr->reply_extra_bl);
4536 }
4537
4538 journal_and_reply(mdr, newi, dn, le, fin);
4539
4540 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4541 // have overshot the split size (multiple opencs in flight), so here is
4542 // an early chance to split the dir if this openc makes it oversized.
4543 mds->balancer->maybe_fragment(dir, false);
4544 }
4545
4546
4547
4548 void Server::handle_client_readdir(MDRequestRef& mdr)
4549 {
4550 const cref_t<MClientRequest> &req = mdr->client_request;
4551 Session *session = mds->get_session(req);
4552 client_t client = req->get_source().num();
4553 MutationImpl::LockOpVec lov;
4554 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
4555 if (!diri) return;
4556
4557 // it's a directory, right?
4558 if (!diri->is_dir()) {
4559 // not a dir
4560 dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
4561 respond_to_request(mdr, -CEPHFS_ENOTDIR);
4562 return;
4563 }
4564
4565 auto num_caps = session->get_num_caps();
4566 auto session_cap_acquisition = session->get_cap_acquisition();
4567
4568 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
4569 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
4570 << " session_cap_acquisition: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
4571 if (logger)
4572 logger->inc(l_mdss_cap_acquisition_throttle);
4573
4574 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
4575 return;
4576 }
4577
4578 lov.add_rdlock(&diri->filelock);
4579 lov.add_rdlock(&diri->dirfragtreelock);
4580
4581 if (!mds->locker->acquire_locks(mdr, lov))
4582 return;
4583
4584 if (!check_access(mdr, diri, MAY_READ))
4585 return;
4586
4587 // which frag?
4588 frag_t fg = (__u32)req->head.args.readdir.frag;
4589 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4590 string offset_str = req->get_path2();
4591
4592 __u32 offset_hash = 0;
4593 if (!offset_str.empty())
4594 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4595 else
4596 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4597
4598 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4599 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4600
4601 // does the frag exist?
4602 if (diri->dirfragtree[fg.value()] != fg) {
4603 frag_t newfg;
4604 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4605 if (fg.contains((unsigned)offset_hash)) {
4606 newfg = diri->dirfragtree[offset_hash];
4607 } else {
4608 // client actually wants next frag
4609 newfg = diri->dirfragtree[fg.value()];
4610 }
4611 } else {
4612 offset_str.clear();
4613 newfg = diri->dirfragtree[fg.value()];
4614 }
4615 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4616 fg = newfg;
4617 }
4618
4619 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4620 if (!dir) return;
4621
4622 // ok!
4623 dout(10) << "handle_client_readdir on " << *dir << dendl;
4624 ceph_assert(dir->is_auth());
4625
4626 if (!dir->is_complete()) {
4627 if (dir->is_frozen()) {
4628 dout(7) << "dir is frozen " << *dir << dendl;
4629 mds->locker->drop_locks(mdr.get());
4630 mdr->drop_local_auth_pins();
4631 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4632 return;
4633 }
4634 // fetch
4635 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4636 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4637 return;
4638 }
4639
4640 #ifdef MDS_VERIFY_FRAGSTAT
4641 dir->verify_fragstat();
4642 #endif
4643
4644 utime_t now = ceph_clock_now();
4645 mdr->set_mds_stamp(now);
4646
4647 snapid_t snapid = mdr->snapid;
4648 dout(10) << "snapid " << snapid << dendl;
4649
4650 SnapRealm *realm = diri->find_snaprealm();
4651
4652 unsigned max = req->head.args.readdir.max_entries;
4653 if (!max)
4654 max = dir->get_num_any(); // whatever, something big.
4655 unsigned max_bytes = req->head.args.readdir.max_bytes;
4656 if (!max_bytes)
4657 // make sure at least one item can be encoded
4658 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4659
4660 // start final blob
4661 bufferlist dirbl;
4662 DirStat ds;
4663 ds.frag = dir->get_frag();
4664 ds.auth = dir->get_dir_auth().first;
4665 if (dir->is_auth() && !forward_all_requests_to_auth)
4666 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4667
4668 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4669
4670 // count bytes available.
4671 // this isn't perfect, but we should capture the main variable/unbounded size items!
4672 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4673 int bytes_left = max_bytes - front_bytes;
4674 bytes_left -= realm->get_snap_trace().length();
4675
4676 // build dir contents
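  // Each entry appended to dnbl costs its dentry name, an encoded lease
  // (LeaseStat) and an encoded InodeStat; we stop once bytes_left or the max
  // entry count would be exceeded.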
4677 bufferlist dnbl;
4678 __u32 numfiles = 0;
4679 bool start = !offset_hash && offset_str.empty();
4680 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4681 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4682 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4683 bool end = (it == dir->end());
4684 for (; !end && numfiles < max; end = (it == dir->end())) {
4685 CDentry *dn = it->second;
4686 ++it;
4687
4688 if (dn->state_test(CDentry::STATE_PURGING))
4689 continue;
4690
4691 bool dnp = dn->use_projected(client, mdr);
4692 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4693
4694 if (dnl->is_null())
4695 continue;
4696
4697 if (dn->last < snapid || dn->first > snapid) {
4698 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4699 continue;
4700 }
4701
4702 if (!start) {
4703 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4704 if (!(offset_key < dn->key()))
4705 continue;
4706 }
4707
4708 CInode *in = dnl->get_inode();
4709
4710 if (in && in->ino() == CEPH_INO_CEPH)
4711 continue;
4712
4713 // remote link?
4714 // better for the MDS to do the work, if we think the client will stat any of these files.
4715 if (dnl->is_remote() && !in) {
4716 in = mdcache->get_inode(dnl->get_remote_ino());
4717 if (in) {
4718 dn->link_remote(dnl, in);
4719 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4720 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4721 continue;
4722 } else {
4723 // touch everything i _do_ have
4724 for (auto &p : *dir) {
4725 if (!p.second->get_linkage()->is_null())
4726 mdcache->lru.lru_touch(p.second);
4727 }
4728
4729 // already issued caps and leases, reply immediately.
4730 if (dnbl.length() > 0) {
4731 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4732 dout(10) << " open remote dentry after caps were issued, stopping at "
4733 << dnbl.length() << " < " << bytes_left << dendl;
4734 break;
4735 }
4736
4737 mds->locker->drop_locks(mdr.get());
4738 mdr->drop_local_auth_pins();
4739 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4740 return;
4741 }
4742 }
4743 ceph_assert(in);
4744
4745 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4746 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4747 break;
4748 }
4749
4750 unsigned start_len = dnbl.length();
4751
4752 // dentry
4753 dout(12) << "including dn " << *dn << dendl;
4754 encode(dn->get_name(), dnbl);
4755 int lease_mask = dnl->is_primary() ? CEPH_LEASE_PRIMARY_LINK : 0;
4756 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, dnbl);
4757
4758 // inode
4759 dout(12) << "including inode " << *in << dendl;
4760 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4761 if (r < 0) {
4762 // chop off dn->name, lease
4763 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4764 bufferlist keep;
4765 keep.substr_of(dnbl, 0, start_len);
4766 dnbl.swap(keep);
4767 break;
4768 }
4769 ceph_assert(r >= 0);
4770 numfiles++;
4771
4772 // touch dn
4773 mdcache->lru.lru_touch(dn);
4774 }
4775
4776 session->touch_readdir_cap(numfiles);
4777
4778 __u16 flags = 0;
4779 if (end) {
4780 flags = CEPH_READDIR_FRAG_END;
4781 if (start)
4782 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4783 }
4784 // clients only understand the END and COMPLETE flags?
4785 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4786 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4787 }
4788
4789 // finish final blob
4790 encode(numfiles, dirbl);
4791 encode(flags, dirbl);
4792 dirbl.claim_append(dnbl);
4793
4794 // yay, reply
4795 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4796 << " bytes=" << dirbl.length()
4797 << " start=" << (int)start
4798 << " end=" << (int)end
4799 << dendl;
4800 mdr->reply_extra_bl = dirbl;
4801
4802 // bump popularity. NOTE: this doesn't quite capture it.
4803 mds->balancer->hit_dir(dir, META_POP_READDIR, -1, numfiles);
4804
4805 // reply
4806 mdr->tracei = diri;
4807 respond_to_request(mdr, 0);
4808 }
4809
4810
4811
4812 // ===============================================================================
4813 // INODE UPDATES
4814
4815
4816 /*
4817 * finisher for basic inode updates
4818 */
4819 class C_MDS_inode_update_finish : public ServerLogContext {
4820 CInode *in;
4821 bool truncating_smaller, changed_ranges, adjust_realm;
4822 public:
4823 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
4824 bool sm=false, bool cr=false, bool ar=false) :
4825 ServerLogContext(s, r), in(i),
4826 truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
4827 void finish(int r) override {
4828 ceph_assert(r == 0);
4829
4830 int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);
4831
4832 // apply
4833 mdr->apply();
4834
4835 MDSRank *mds = get_mds();
4836
4837 // notify any clients
4838 if (truncating_smaller && in->get_inode()->is_truncating()) {
4839 mds->locker->issue_truncate(in);
4840 mds->mdcache->truncate_inode(in, mdr->ls);
4841 }
4842
4843 if (adjust_realm) {
4844 mds->mdcache->send_snap_update(in, 0, snap_op);
4845 mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
4846 }
4847
4848 get_mds()->balancer->hit_inode(in, META_POP_IWR);
4849
4850 server->respond_to_request(mdr, 0);
4851
4852 if (changed_ranges)
4853 get_mds()->locker->share_inode_max_size(in);
4854 }
4855 };
4856
4857 void Server::handle_client_file_setlock(MDRequestRef& mdr)
4858 {
4859 const cref_t<MClientRequest> &req = mdr->client_request;
4860 MutationImpl::LockOpVec lov;
4861
4862 // get the inode to operate on, and set up any locks needed for that
4863 CInode *cur = rdlock_path_pin_ref(mdr, true);
4864 if (!cur)
4865 return;
4866
4867 lov.add_xlock(&cur->flocklock);
4868 /* acquire_locks will return true if it gets the locks. If it fails,
4869 it will redeliver this request at a later date, so drop the request.
4870 */
4871 if (!mds->locker->acquire_locks(mdr, lov)) {
4872 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4873 return;
4874 }
4875
4876 // copy the lock change into a ceph_filelock so we can store/apply it
4877 ceph_filelock set_lock;
4878 set_lock.start = req->head.args.filelock_change.start;
4879 set_lock.length = req->head.args.filelock_change.length;
4880 set_lock.client = req->get_orig_source().num();
4881 set_lock.owner = req->head.args.filelock_change.owner;
4882 set_lock.pid = req->head.args.filelock_change.pid;
4883 set_lock.type = req->head.args.filelock_change.type;
4884 bool will_wait = req->head.args.filelock_change.wait;
4885
4886 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4887
4888 ceph_lock_state_t *lock_state = NULL;
4889 bool interrupt = false;
4890
4891 // get the appropriate lock state
4892 switch (req->head.args.filelock_change.rule) {
4893 case CEPH_LOCK_FLOCK_INTR:
4894 interrupt = true;
4895 // fall-thru
4896 case CEPH_LOCK_FLOCK:
4897 lock_state = cur->get_flock_lock_state();
4898 break;
4899
4900 case CEPH_LOCK_FCNTL_INTR:
4901 interrupt = true;
4902 // fall-thru
4903 case CEPH_LOCK_FCNTL:
4904 lock_state = cur->get_fcntl_lock_state();
4905 break;
4906
4907 default:
4908 dout(10) << "got unknown lock type " << set_lock.type
4909 << ", dropping request!" << dendl;
4910 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
4911 return;
4912 }
4913
4914 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4915 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4916 list<ceph_filelock> activated_locks;
4917 MDSContext::vec waiters;
4918 if (lock_state->is_waiting(set_lock)) {
4919 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4920 lock_state->remove_waiting(set_lock);
4921 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4922 } else if (!interrupt) {
4923 dout(10) << " unlock attempt on " << set_lock << dendl;
4924 lock_state->remove_lock(set_lock, activated_locks);
4925 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4926 }
4927 mds->queue_waiters(waiters);
4928
4929 respond_to_request(mdr, 0);
4930 } else {
4931 dout(10) << " lock attempt on " << set_lock << dendl;
4932 bool deadlock = false;
4933 if (mdr->more()->flock_was_waiting &&
4934 !lock_state->is_waiting(set_lock)) {
4935 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
4936 respond_to_request(mdr, -CEPHFS_EINTR);
4937 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4938 dout(10) << " it failed on this attempt" << dendl;
4939 // couldn't set lock right now
4940 if (deadlock) {
4941 respond_to_request(mdr, -CEPHFS_EDEADLK);
4942 } else if (!will_wait) {
4943 respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
4944 } else {
4945 dout(10) << " added to waiting list" << dendl;
4946 ceph_assert(lock_state->is_waiting(set_lock));
4947 mdr->more()->flock_was_waiting = true;
4948 mds->locker->drop_locks(mdr.get());
4949 mdr->drop_local_auth_pins();
4950 mdr->mark_event("failed to add lock, waiting");
4951 mdr->mark_nowarn();
4952 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4953 }
4954 } else
4955 respond_to_request(mdr, 0);
4956 }
4957 dout(10) << " state after lock change: " << *lock_state << dendl;
4958 }
4959
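/*
 * Handle CEPH_MDS_OP_GETFILELOCK: test whether the supplied lock could be
 * granted. look_for_lock() fills in any conflicting lock, which is encoded
 * into the reply's extra bufferlist for the client to inspect.
 */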
4960 void Server::handle_client_file_readlock(MDRequestRef& mdr)
4961 {
4962 const cref_t<MClientRequest> &req = mdr->client_request;
4963 MutationImpl::LockOpVec lov;
4964
4965 // get the inode to operate on, and set up any locks needed for that
4966 CInode *cur = rdlock_path_pin_ref(mdr, true);
4967 if (!cur)
4968 return;
4969
4970 /* acquire_locks will return true if it gets the locks. If it fails,
4971 it will redeliver this request at a later date, so drop the request.
4972 */
4973 lov.add_rdlock(&cur->flocklock);
4974 if (!mds->locker->acquire_locks(mdr, lov)) {
4975 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4976 return;
4977 }
4978
4979 // copy the lock change into a ceph_filelock so we can store/apply it
4980 ceph_filelock checking_lock;
4981 checking_lock.start = req->head.args.filelock_change.start;
4982 checking_lock.length = req->head.args.filelock_change.length;
4983 checking_lock.client = req->get_orig_source().num();
4984 checking_lock.owner = req->head.args.filelock_change.owner;
4985 checking_lock.pid = req->head.args.filelock_change.pid;
4986 checking_lock.type = req->head.args.filelock_change.type;
4987
4988 // get the appropriate lock state
4989 ceph_lock_state_t *lock_state = NULL;
4990 switch (req->head.args.filelock_change.rule) {
4991 case CEPH_LOCK_FLOCK:
4992 lock_state = cur->get_flock_lock_state();
4993 break;
4994
4995 case CEPH_LOCK_FCNTL:
4996 lock_state = cur->get_fcntl_lock_state();
4997 break;
4998
4999 default:
5000 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
5001 respond_to_request(mdr, -CEPHFS_EINVAL);
5002 return;
5003 }
5004 lock_state->look_for_lock(checking_lock);
5005
5006 bufferlist lock_bl;
5007 encode(checking_lock, lock_bl);
5008
5009 mdr->reply_extra_bl = lock_bl;
5010 respond_to_request(mdr, 0);
5011 }
5012
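/*
 * Handle CEPH_MDS_OP_SETATTR: project the requested attribute changes
 * (mode/uid/gid/times/size) into the inode, journal them as an EUpdate and
 * reply once the update is safe. Shrinking truncates are started here;
 * growing a file while the filesystem is full is rejected with CEPHFS_ENOSPC.
 */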
5013 void Server::handle_client_setattr(MDRequestRef& mdr)
5014 {
5015 const cref_t<MClientRequest> &req = mdr->client_request;
5016 MutationImpl::LockOpVec lov;
5017 CInode *cur = rdlock_path_pin_ref(mdr, true);
5018 if (!cur) return;
5019
5020 if (mdr->snapid != CEPH_NOSNAP) {
5021 respond_to_request(mdr, -CEPHFS_EROFS);
5022 return;
5023 }
5024 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
5025 respond_to_request(mdr, -CEPHFS_EPERM);
5026 return;
5027 }
5028
5029 __u32 mask = req->head.args.setattr.mask;
5030 __u32 access_mask = MAY_WRITE;
5031
5032 // xlock inode
5033 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
5034 lov.add_xlock(&cur->authlock);
5035 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
5036 lov.add_xlock(&cur->filelock);
5037 if (mask & CEPH_SETATTR_CTIME)
5038 lov.add_wrlock(&cur->versionlock);
5039
5040 if (!mds->locker->acquire_locks(mdr, lov))
5041 return;
5042
5043 if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
5044 access_mask |= MAY_CHOWN;
5045
5046 if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
5047 access_mask |= MAY_CHGRP;
5048
5049 if (!check_access(mdr, cur, access_mask))
5050 return;
5051
5052 // trunc from bigger -> smaller?
5053 const auto& pip = cur->get_projected_inode();
5054
5055 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
5056
5057 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
5058 if (is_full && req->head.args.setattr.size > old_size) {
5059 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
5060 respond_to_request(mdr, -CEPHFS_ENOSPC);
5061 return;
5062 }
5063
5064 bool truncating_smaller = false;
5065 if (mask & CEPH_SETATTR_SIZE) {
5066 truncating_smaller = req->head.args.setattr.size < old_size;
5067 if (truncating_smaller && pip->is_truncating()) {
5068 dout(10) << " waiting for pending truncate from " << pip->truncate_from
5069 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
5070 mds->locker->drop_locks(mdr.get());
5071 mdr->drop_local_auth_pins();
5072 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
5073 return;
5074 }
5075 }
5076
5077 bool changed_ranges = false;
5078
5079 // project update
5080 mdr->ls = mdlog->get_current_segment();
5081 EUpdate *le = new EUpdate(mdlog, "setattr");
5082 mdlog->start_entry(le);
5083
5084 auto pi = cur->project_inode(mdr);
5085
5086 if (mask & CEPH_SETATTR_UID)
5087 pi.inode->uid = req->head.args.setattr.uid;
5088 if (mask & CEPH_SETATTR_GID)
5089 pi.inode->gid = req->head.args.setattr.gid;
5090
5091 if (mask & CEPH_SETATTR_MODE)
5092 pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
5093 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
5094 S_ISREG(pi.inode->mode) &&
5095 (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
5096 pi.inode->mode &= ~(S_ISUID|S_ISGID);
5097 }
5098
5099 if (mask & CEPH_SETATTR_MTIME)
5100 pi.inode->mtime = req->head.args.setattr.mtime;
5101 if (mask & CEPH_SETATTR_ATIME)
5102 pi.inode->atime = req->head.args.setattr.atime;
5103 if (mask & CEPH_SETATTR_BTIME)
5104 pi.inode->btime = req->head.args.setattr.btime;
5105 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
5106 pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
5107 if (mask & CEPH_SETATTR_SIZE) {
5108 if (truncating_smaller) {
5109 pi.inode->truncate(old_size, req->head.args.setattr.size);
5110 le->metablob.add_truncate_start(cur->ino());
5111 } else {
5112 pi.inode->size = req->head.args.setattr.size;
5113 pi.inode->rstat.rbytes = pi.inode->size;
5114 }
5115 pi.inode->mtime = mdr->get_op_stamp();
5116
5117 // adjust client's max_size?
5118 if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
5119 dout(10) << " client_ranges " << cur->get_previous_projected_inode()->client_ranges
5120 << " -> " << pi.inode->client_ranges << dendl;
5121 changed_ranges = true;
5122 }
5123 }
5124
5125 pi.inode->version = cur->pre_dirty();
5126 pi.inode->ctime = mdr->get_op_stamp();
5127 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5128 pi.inode->rstat.rctime = mdr->get_op_stamp();
5129 pi.inode->change_attr++;
5130
5131 // log + wait
5132 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5133 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5134 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5135
5136 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5137 truncating_smaller, changed_ranges));
5138
5139 // flush immediately if there are readers/writers waiting
5140 if (mdr->is_xlocked(&cur->filelock) &&
5141 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5142 mds->mdlog->flush();
5143 }
5144
5145 /* Takes responsibility for mdr */
5146 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5147 {
5148 CInode *in = mdr->in[0];
5149 client_t client = mdr->get_client();
5150 ceph_assert(in);
5151
5152 dout(10) << "do_open_truncate " << *in << dendl;
5153
5154 SnapRealm *realm = in->find_snaprealm();
5155 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
5156
5157 mdr->ls = mdlog->get_current_segment();
5158 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5159 mdlog->start_entry(le);
5160
5161 // prepare
5162 auto pi = in->project_inode(mdr);
5163 pi.inode->version = in->pre_dirty();
5164 pi.inode->mtime = pi.inode->ctime = mdr->get_op_stamp();
5165 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5166 pi.inode->rstat.rctime = mdr->get_op_stamp();
5167 pi.inode->change_attr++;
5168
5169 uint64_t old_size = std::max<uint64_t>(pi.inode->size, mdr->client_request->head.args.open.old_size);
5170 if (old_size > 0) {
5171 pi.inode->truncate(old_size, 0);
5172 le->metablob.add_truncate_start(in->ino());
5173 }
5174
5175 bool changed_ranges = false;
5176 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
5177 pi.inode->client_ranges[client].range.first = 0;
5178 pi.inode->client_ranges[client].range.last = pi.inode->get_layout_size_increment();
5179 pi.inode->client_ranges[client].follows = realm->get_newest_seq();
5180 changed_ranges = true;
5181 in->mark_clientwriteable();
5182 cap->mark_clientwriteable();
5183 }
5184
5185 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5186
5187 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5188 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5189
5190 // make sure ino gets into the journal
5191 le->metablob.add_opened_ino(in->ino());
5192
5193 mdr->o_trunc = true;
5194
5195 CDentry *dn = 0;
5196 if (mdr->client_request->get_dentry_wanted()) {
5197 ceph_assert(mdr->dn[0].size());
5198 dn = mdr->dn[0].back();
5199 }
5200
5201 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5202 changed_ranges));
5203 // Although the `open` part can give an early reply, the truncation won't
5204 // happen until our EUpdate is persistent; to give the client a prompt
5205 // response we must also flush that event.
5206 mdlog->flush();
5207 }
5208
5209
5210 /* This function cleans up the passed mdr */
5211 void Server::handle_client_setlayout(MDRequestRef& mdr)
5212 {
5213 const cref_t<MClientRequest> &req = mdr->client_request;
5214 CInode *cur = rdlock_path_pin_ref(mdr, true);
5215 if (!cur) return;
5216
5217 if (mdr->snapid != CEPH_NOSNAP) {
5218 respond_to_request(mdr, -CEPHFS_EROFS);
5219 return;
5220 }
5221 if (!cur->is_file()) {
5222 respond_to_request(mdr, -CEPHFS_EINVAL);
5223 return;
5224 }
5225 if (cur->get_projected_inode()->size ||
5226 cur->get_projected_inode()->truncate_seq > 1) {
5227 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5228 return;
5229 }
5230
5231 // validate layout
5232 file_layout_t layout = cur->get_projected_inode()->layout;
5233 // save existing layout for later
5234 const auto old_layout = layout;
5235
5236 int access = MAY_WRITE;
5237
5238 if (req->head.args.setlayout.layout.fl_object_size > 0)
5239 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5240 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5241 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5242 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5243 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5244 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5245 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5246
5247 // make sure we have as new a map as the client
5248 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5249 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5250 return;
5251 }
5252 }
5253
5254 // Don't permit layout modifications without 'p' caps
5255 if (layout != old_layout) {
5256 access |= MAY_SET_VXATTR;
5257 }
5258
5259 if (!layout.is_valid()) {
5260 dout(10) << "bad layout" << dendl;
5261 respond_to_request(mdr, -CEPHFS_EINVAL);
5262 return;
5263 }
5264 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5265 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5266 respond_to_request(mdr, -CEPHFS_EINVAL);
5267 return;
5268 }
5269
5270 MutationImpl::LockOpVec lov;
5271 lov.add_xlock(&cur->filelock);
5272 if (!mds->locker->acquire_locks(mdr, lov))
5273 return;
5274
5275 if (!check_access(mdr, cur, access))
5276 return;
5277
5278 // project update
5279 auto pi = cur->project_inode(mdr);
5280 pi.inode->layout = layout;
5281 // add the old pool to the inode
5282 pi.inode->add_old_pool(old_layout.pool_id);
5283 pi.inode->version = cur->pre_dirty();
5284 pi.inode->ctime = mdr->get_op_stamp();
5285 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5286 pi.inode->rstat.rctime = mdr->get_op_stamp();
5287 pi.inode->change_attr++;
5288
5289 // log + wait
5290 mdr->ls = mdlog->get_current_segment();
5291 EUpdate *le = new EUpdate(mdlog, "setlayout");
5292 mdlog->start_entry(le);
5293 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5294 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5295 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5296
5297 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5298 }
5299
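/*
 * Take the locks needed for policy (layout/vxattr) changes: xlock the
 * policylock, rd- or xlock the snaplock, and rdlock the parent's snap/layout
 * via try_rdlock_snap_layout() when there is a projected parent. Returns true
 * once everything is held; false means the request was re-queued and the
 * caller should simply return.
 */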
5300 bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
5301 {
5302 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5303 return true;
5304
5305 MutationImpl::LockOpVec lov;
5306 lov.add_xlock(&in->policylock);
5307 if (xlock_snaplock)
5308 lov.add_xlock(&in->snaplock);
5309 else
5310 lov.add_rdlock(&in->snaplock);
5311 if (!mds->locker->acquire_locks(mdr, lov))
5312 return false;
5313
5314 if (want_layout && in->get_projected_inode()->has_layout()) {
5315 mdr->dir_layout = in->get_projected_inode()->layout;
5316 want_layout = false;
5317 }
5318 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5319 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5320 return false;
5321 }
5322
5323 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5324 return true;
5325 }
5326
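// Look up an inode by number and verify we are its auth MDS. On failure the
// request is either answered with CEPHFS_ESTALE or forwarded to the auth
// rank, and nullptr is returned so the caller can bail out.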
5327 CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5328 {
5329 CInode *in = mdcache->get_inode(ino);
5330 if (!in || in->state_test(CInode::STATE_PURGING)) {
5331 respond_to_request(mdr, -CEPHFS_ESTALE);
5332 return nullptr;
5333 }
5334 if (!in->is_auth()) {
5335 mdcache->request_forward(mdr, in->authority().first);
5336 return nullptr;
5337 }
5338
5339 return in;
5340 }
5341
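/*
 * Handle setting the default file layout policy on a directory
 * (CEPH_MDS_OP_SETDIRLAYOUT). Unlike setlayout on a file this only needs the
 * policylock, and the change is journalled with no early reply.
 */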
5342 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5343 {
5344 const cref_t<MClientRequest> &req = mdr->client_request;
5345
5346 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5347 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5348 if (!cur)
5349 return;
5350
5351 if (!cur->is_dir()) {
5352 respond_to_request(mdr, -CEPHFS_ENOTDIR);
5353 return;
5354 }
5355
5356 if (!xlock_policylock(mdr, cur, true))
5357 return;
5358
5359 // validate layout
5360 const auto& old_pi = cur->get_projected_inode();
5361 file_layout_t layout;
5362 if (old_pi->has_layout())
5363 layout = old_pi->layout;
5364 else if (mdr->dir_layout != file_layout_t())
5365 layout = mdr->dir_layout;
5366 else
5367 layout = mdcache->default_file_layout;
5368
5369 // Level of access required to complete
5370 int access = MAY_WRITE;
5371
5372 const auto old_layout = layout;
5373
5374 if (req->head.args.setlayout.layout.fl_object_size > 0)
5375 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5376 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5377 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5378 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5379 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5380 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5381 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5382 // make sure we have as new a map as the client
5383 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5384 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5385 return;
5386 }
5387 }
5388
5389 if (layout != old_layout) {
5390 access |= MAY_SET_VXATTR;
5391 }
5392
5393 if (!layout.is_valid()) {
5394 dout(10) << "bad layout" << dendl;
5395 respond_to_request(mdr, -CEPHFS_EINVAL);
5396 return;
5397 }
5398 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5399 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5400 respond_to_request(mdr, -CEPHFS_EINVAL);
5401 return;
5402 }
5403
5404 if (!check_access(mdr, cur, access))
5405 return;
5406
5407 auto pi = cur->project_inode(mdr);
5408 pi.inode->layout = layout;
5409 pi.inode->version = cur->pre_dirty();
5410
5411 // log + wait
5412 mdr->ls = mdlog->get_current_segment();
5413 EUpdate *le = new EUpdate(mdlog, "setlayout");
5414 mdlog->start_entry(le);
5415 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5416 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5417 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5418
5419 mdr->no_early_reply = true;
5420 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5421 }
5422
5423 // XATTRS
5424
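/*
 * Parse one "ceph.{dir,file}.layout*" vxattr into *layout. The bare "layout"
 * name accepts a space-separated key=value list that is parsed recursively,
 * e.g. (illustrative values only):
 *   setfattr -n ceph.dir.layout -v "stripe_unit=1048576 stripe_count=2 object_size=4194304 pool=cephfs_data"
 * Individual fields can also be set one at a time via layout.object_size,
 * layout.stripe_unit, layout.stripe_count, layout.pool and
 * layout.pool_namespace.
 */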
5425 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5426 file_layout_t *layout, bool validate)
5427 {
5428 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
5429 try {
5430 if (name == "layout") {
5431 string::iterator begin = value.begin();
5432 string::iterator end = value.end();
5433 keys_and_values<string::iterator> p; // create instance of parser
5434 std::map<string, string> m; // map to receive results
5435 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5436 return -CEPHFS_EINVAL;
5437 }
5438 string left(begin, end);
5439 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5440 if (begin != end)
5441 return -CEPHFS_EINVAL;
5442 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5443 // Skip validation on each attr; we do it once at the end (to avoid
5444 // rejecting intermediate states if the overall result is ok)
5445 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
5446 osdmap, layout, false);
5447 if (r < 0)
5448 return r;
5449 }
5450 } else if (name == "layout.object_size") {
5451 layout->object_size = boost::lexical_cast<unsigned>(value);
5452 } else if (name == "layout.stripe_unit") {
5453 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5454 } else if (name == "layout.stripe_count") {
5455 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5456 } else if (name == "layout.pool") {
5457 try {
5458 layout->pool_id = boost::lexical_cast<unsigned>(value);
5459 } catch (boost::bad_lexical_cast const&) {
5460 int64_t pool = osdmap.lookup_pg_pool_name(value);
5461 if (pool < 0) {
5462 dout(10) << " unknown pool " << value << dendl;
5463 return -CEPHFS_ENOENT;
5464 }
5465 layout->pool_id = pool;
5466 }
5467 } else if (name == "layout.pool_namespace") {
5468 layout->pool_ns = value;
5469 } else {
5470 dout(10) << " unknown layout vxattr " << name << dendl;
5471 return -CEPHFS_EINVAL;
5472 }
5473 } catch (boost::bad_lexical_cast const&) {
5474 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5475 return -CEPHFS_EINVAL;
5476 }
5477
5478 if (validate && !layout->is_valid()) {
5479 dout(10) << "bad layout" << dendl;
5480 return -CEPHFS_EINVAL;
5481 }
5482 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5483 dout(10) << " invalid data pool " << layout->pool_id << dendl;
5484 return -CEPHFS_EINVAL;
5485 }
5486 return 0;
5487 }
5488
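/*
 * Parse a "ceph.quota*" vxattr into *quota. The bare "quota" name takes a
 * key=value list, e.g. "max_bytes=10000000000 max_files=10000" (values
 * illustrative); an empty value leaves the quota unchanged, which is what
 * create_quota_realm() relies on. Negative values are rejected.
 */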
5489 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5490 {
5491 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5492 try {
5493 if (name == "quota") {
5494 string::iterator begin = value.begin();
5495 string::iterator end = value.end();
5496 if (begin == end) {
5497 // keep quota unchanged. (for create_quota_realm())
5498 return 0;
5499 }
5500 keys_and_values<string::iterator> p; // create instance of parser
5501 std::map<string, string> m; // map to receive results
5502 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5503 return -CEPHFS_EINVAL;
5504 }
5505 string left(begin, end);
5506 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5507 if (begin != end)
5508 return -CEPHFS_EINVAL;
5509 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5510 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5511 if (r < 0)
5512 return r;
5513 }
5514 } else if (name == "quota.max_bytes") {
5515 int64_t q = boost::lexical_cast<int64_t>(value);
5516 if (q < 0)
5517 return -CEPHFS_EINVAL;
5518 quota->max_bytes = q;
5519 } else if (name == "quota.max_files") {
5520 int64_t q = boost::lexical_cast<int64_t>(value);
5521 if (q < 0)
5522 return -CEPHFS_EINVAL;
5523 quota->max_files = q;
5524 } else {
5525 dout(10) << " unknown quota vxattr " << name << dendl;
5526 return -CEPHFS_EINVAL;
5527 }
5528 } catch (boost::bad_lexical_cast const&) {
5529 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5530 return -CEPHFS_EINVAL;
5531 }
5532
5533 if (!quota->is_valid()) {
5534 dout(10) << "bad quota" << dendl;
5535 return -CEPHFS_EINVAL;
5536 }
5537 return 0;
5538 }
5539
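// Ask the auth MDS of this inode to (re)create its quota snaprealm by sending
// it an internal SETXATTR request for "ceph.quota" with an empty value.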
5540 void Server::create_quota_realm(CInode *in)
5541 {
5542 dout(10) << __func__ << " " << *in << dendl;
5543
5544 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
5545 req->set_filepath(filepath(in->ino()));
5546 req->set_string2("ceph.quota");
5547 // empty vxattr value
5548 req->set_tid(mds->issue_tid());
5549
5550 mds->send_message_mds(req, in->authority().first);
5551 }
5552
5553 /*
5554 * Verify that the file layout attribute carried by the client
5555 * is well-formatted.
5556 * Return 0 on success, otherwise this function takes
5557 * responsibility for the passed mdr.
5558 */
5559 int Server::check_layout_vxattr(MDRequestRef& mdr,
5560 string name,
5561 string value,
5562 file_layout_t *layout)
5563 {
5564 const cref_t<MClientRequest> &req = mdr->client_request;
5565 epoch_t epoch;
5566 int r;
5567
5568 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5569 r = parse_layout_vxattr(name, value, osdmap, layout);
5570 epoch = osdmap.get_epoch();
5571 });
5572
5573 if (r == -CEPHFS_ENOENT) {
5574
5575 // we don't have the specified pool; make sure our map
5576 // is at least as new as the client's.
5577 epoch_t req_epoch = req->get_osdmap_epoch();
5578
5579 if (req_epoch > epoch) {
5580
5581 // well, our map is older. consult mds.
5582 auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5583
5584 mds->objecter->wait_for_map(req_epoch, lambdafy(fin));
5585 return r;
5586 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5587
5588 // For compatibility with clients running old code, we still need to get
5589 // the latest map. Once COMPAT_VERSION of MClientRequest is >= 3 everywhere,
5590 // we can remove this code.
5591 mdr->waited_for_osdmap = true;
5592 mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5593 mds, new C_MDS_RetryRequest(mdcache, mdr))));
5594 return r;
5595 }
5596 }
5597
5598 if (r < 0) {
5599
5600 if (r == -CEPHFS_ENOENT)
5601 r = -CEPHFS_EINVAL;
5602
5603 respond_to_request(mdr, r);
5604 return r;
5605 }
5606
5607 // all is well
5608 return 0;
5609 }
5610
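/*
 * Handle setting a ceph.* virtual xattr (dir/file layout, quota, subvolume
 * flag, export pins). The value is parsed, the appropriate locks are taken,
 * the change is projected into the inode (and snaprealm when needed) and
 * journalled via C_MDS_inode_update_finish.
 */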
5611 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
5612 {
5613 const cref_t<MClientRequest> &req = mdr->client_request;
5614 string name(req->get_path2());
5615 bufferlist bl = req->get_data();
5616 string value (bl.c_str(), bl.length());
5617 dout(10) << "handle_set_vxattr " << name
5618 << " val " << value.length()
5619 << " bytes on " << *cur
5620 << dendl;
5621
5622 CInode::mempool_inode *pip = nullptr;
5623 string rest;
5624
5625 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5626 return;
5627 }
5628
5629 bool adjust_realm = false;
5630 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5631 if (!cur->is_dir()) {
5632 respond_to_request(mdr, -CEPHFS_EINVAL);
5633 return;
5634 }
5635
5636 if (!xlock_policylock(mdr, cur, true))
5637 return;
5638
5639 file_layout_t layout;
5640 if (cur->get_projected_inode()->has_layout())
5641 layout = cur->get_projected_inode()->layout;
5642 else if (mdr->dir_layout != file_layout_t())
5643 layout = mdr->dir_layout;
5644 else
5645 layout = mdcache->default_file_layout;
5646
5647 rest = name.substr(name.find("layout"));
5648 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5649 return;
5650
5651 auto pi = cur->project_inode(mdr);
5652 pi.inode->layout = layout;
5653 mdr->no_early_reply = true;
5654 pip = pi.inode.get();
5655 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5656 if (!cur->is_file()) {
5657 respond_to_request(mdr, -CEPHFS_EINVAL);
5658 return;
5659 }
5660 if (cur->get_projected_inode()->size ||
5661 cur->get_projected_inode()->truncate_seq > 1) {
5662 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5663 return;
5664 }
5665 file_layout_t layout = cur->get_projected_inode()->layout;
5666 rest = name.substr(name.find("layout"));
5667 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5668 return;
5669
5670 MutationImpl::LockOpVec lov;
5671 lov.add_xlock(&cur->filelock);
5672 if (!mds->locker->acquire_locks(mdr, lov))
5673 return;
5674
5675 auto pi = cur->project_inode(mdr);
5676 int64_t old_pool = pi.inode->layout.pool_id;
5677 pi.inode->add_old_pool(old_pool);
5678 pi.inode->layout = layout;
5679 pip = pi.inode.get();
5680 } else if (name.compare(0, 10, "ceph.quota") == 0) {
5681 if (!cur->is_dir()) {
5682 respond_to_request(mdr, -CEPHFS_EINVAL);
5683 return;
5684 }
5685
5686 quota_info_t quota = cur->get_projected_inode()->quota;
5687
5688 rest = name.substr(name.find("quota"));
5689 int r = parse_quota_vxattr(rest, value, &quota);
5690 if (r < 0) {
5691 respond_to_request(mdr, r);
5692 return;
5693 }
5694
5695 if (quota.is_enable() && !cur->get_projected_srnode())
5696 adjust_realm = true;
5697
5698 if (!xlock_policylock(mdr, cur, false, adjust_realm))
5699 return;
5700
5701 if (cur->get_projected_inode()->quota == quota) {
5702 respond_to_request(mdr, 0);
5703 return;
5704 }
5705
5706 auto pi = cur->project_inode(mdr, false, adjust_realm);
5707 pi.inode->quota = quota;
5708
5709 if (adjust_realm)
5710 pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
5711
5712 mdr->no_early_reply = true;
5713 pip = pi.inode.get();
5714
5715 client_t exclude_ct = mdr->get_client();
5716 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
5717 } else if (name == "ceph.dir.subvolume"sv) {
5718 if (!cur->is_dir()) {
5719 respond_to_request(mdr, -CEPHFS_EINVAL);
5720 return;
5721 }
5722
5723 bool val;
5724 try {
5725 val = boost::lexical_cast<bool>(value);
5726 } catch (boost::bad_lexical_cast const&) {
5727 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
5728 respond_to_request(mdr, -CEPHFS_EINVAL);
5729 return;
5730 }
5731
5732 /* Verify it's not already a subvolume using a lighter-weight
5733 * rdlock first.
5734 */
5735 if (!mdr->more()->rdonly_checks) {
5736 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
5737 MutationImpl::LockOpVec lov;
5738 lov.add_rdlock(&cur->snaplock);
5739 if (!mds->locker->acquire_locks(mdr, lov))
5740 return;
5741 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5742 }
5743 const auto srnode = cur->get_projected_srnode();
5744 if (val == (srnode && srnode->is_subvolume())) {
5745 dout(20) << "already marked subvolume" << dendl;
5746 respond_to_request(mdr, 0);
5747 return;
5748 }
5749 mdr->more()->rdonly_checks = true;
5750 }
5751
5752 if ((mdr->locking_state & MutationImpl::ALL_LOCKED) && !mdr->is_xlocked(&cur->snaplock)) {
5753 /* drop the rdlock and acquire xlocks */
5754 dout(20) << "dropping rdlocks" << dendl;
5755 mds->locker->drop_locks(mdr.get());
5756 if (!xlock_policylock(mdr, cur, false, true))
5757 return;
5758 }
5759
5760 /* repeat the read-only checks in case things changed between the rdlock and the xlock */
5761 SnapRealm *realm = cur->find_snaprealm();
5762 if (val) {
5763 inodeno_t subvol_ino = realm->get_subvolume_ino();
5764 // can't create subvolume inside another subvolume
5765 if (subvol_ino && subvol_ino != cur->ino()) {
5766 respond_to_request(mdr, -CEPHFS_EINVAL);
5767 return;
5768 }
5769 }
5770
5771 const auto srnode = cur->get_projected_srnode();
5772 if (val == (srnode && srnode->is_subvolume())) {
5773 respond_to_request(mdr, 0);
5774 return;
5775 }
5776
5777 auto pi = cur->project_inode(mdr, false, true);
5778 if (!srnode)
5779 pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
5780 if (val)
5781 pi.snapnode->mark_subvolume();
5782 else
5783 pi.snapnode->clear_subvolume();
5784
5785 mdr->no_early_reply = true;
5786 pip = pi.inode.get();
5787 adjust_realm = true;
5788 } else if (name == "ceph.dir.pin"sv) {
5789 if (!cur->is_dir() || cur->is_root()) {
5790 respond_to_request(mdr, -CEPHFS_EINVAL);
5791 return;
5792 }
5793
5794 mds_rank_t rank;
5795 try {
5796 rank = boost::lexical_cast<mds_rank_t>(value);
5797 if (rank < 0) rank = MDS_RANK_NONE;
5798 else if (rank >= MAX_MDS) {
5799 respond_to_request(mdr, -CEPHFS_EDOM);
5800 return;
5801 }
5802 } catch (boost::bad_lexical_cast const&) {
5803 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5804 respond_to_request(mdr, -CEPHFS_EINVAL);
5805 return;
5806 }
5807
5808 if (!xlock_policylock(mdr, cur))
5809 return;
5810
5811 auto pi = cur->project_inode(mdr);
5812 cur->set_export_pin(rank);
5813 pip = pi.inode.get();
5814 } else if (name == "ceph.dir.pin.random"sv) {
5815 if (!cur->is_dir() || cur->is_root()) {
5816 respond_to_request(mdr, -CEPHFS_EINVAL);
5817 return;
5818 }
5819
5820 double val;
5821 try {
5822 val = boost::lexical_cast<double>(value);
5823 } catch (boost::bad_lexical_cast const&) {
5824 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
5825 respond_to_request(mdr, -CEPHFS_EINVAL);
5826 return;
5827 }
5828
5829 if (val < 0.0 || 1.0 < val) {
5830 respond_to_request(mdr, -CEPHFS_EDOM);
5831 return;
5832 } else if (mdcache->export_ephemeral_random_max < val) {
5833 respond_to_request(mdr, -CEPHFS_EINVAL);
5834 return;
5835 }
5836
5837 if (!xlock_policylock(mdr, cur))
5838 return;
5839
5840 auto pi = cur->project_inode(mdr);
5841 cur->setxattr_ephemeral_rand(val);
5842 pip = pi.inode.get();
5843 } else if (name == "ceph.dir.pin.distributed"sv) {
5844 if (!cur->is_dir() || cur->is_root()) {
5845 respond_to_request(mdr, -CEPHFS_EINVAL);
5846 return;
5847 }
5848
5849 bool val;
5850 try {
5851 val = boost::lexical_cast<bool>(value);
5852 } catch (boost::bad_lexical_cast const&) {
5853 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
5854 respond_to_request(mdr, -CEPHFS_EINVAL);
5855 return;
5856 }
5857
5858 if (!xlock_policylock(mdr, cur))
5859 return;
5860
5861 auto pi = cur->project_inode(mdr);
5862 cur->setxattr_ephemeral_dist(val);
5863 pip = pi.inode.get();
5864 } else {
5865 dout(10) << " unknown vxattr " << name << dendl;
5866 respond_to_request(mdr, -CEPHFS_EINVAL);
5867 return;
5868 }
5869
5870 pip->change_attr++;
5871 pip->ctime = mdr->get_op_stamp();
5872 if (mdr->get_op_stamp() > pip->rstat.rctime)
5873 pip->rstat.rctime = mdr->get_op_stamp();
5874 pip->version = cur->pre_dirty();
5875 if (cur->is_file())
5876 pip->update_backtrace();
5877
5878 // log + wait
5879 mdr->ls = mdlog->get_current_segment();
5880 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
5881 mdlog->start_entry(le);
5882 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5883 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5884 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5885
5886 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5887 false, false, adjust_realm));
5888 return;
5889 }
5890
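/*
 * Handle removal of a ceph.* virtual xattr. Only "ceph.dir.layout" can truly
 * be removed; removing a layout pool_namespace is rewritten as a set with an
 * empty value, and anything else gets CEPHFS_ENODATA.
 */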
5891 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
5892 {
5893 const cref_t<MClientRequest> &req = mdr->client_request;
5894 string name(req->get_path2());
5895
5896 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
5897
5898 if (name == "ceph.dir.layout") {
5899 if (!cur->is_dir()) {
5900 respond_to_request(mdr, -CEPHFS_ENODATA);
5901 return;
5902 }
5903 if (cur->is_root()) {
5904 dout(10) << "can't remove layout policy on the root directory" << dendl;
5905 respond_to_request(mdr, -CEPHFS_EINVAL);
5906 return;
5907 }
5908
5909 if (!cur->get_projected_inode()->has_layout()) {
5910 respond_to_request(mdr, -CEPHFS_ENODATA);
5911 return;
5912 }
5913
5914 MutationImpl::LockOpVec lov;
5915 lov.add_xlock(&cur->policylock);
5916 if (!mds->locker->acquire_locks(mdr, lov))
5917 return;
5918
5919 auto pi = cur->project_inode(mdr);
5920 pi.inode->clear_layout();
5921 pi.inode->version = cur->pre_dirty();
5922
5923 // log + wait
5924 mdr->ls = mdlog->get_current_segment();
5925 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
5926 mdlog->start_entry(le);
5927 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5928 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5929 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5930
5931 mdr->no_early_reply = true;
5932 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5933 return;
5934 } else if (name == "ceph.dir.layout.pool_namespace"
5935 || name == "ceph.file.layout.pool_namespace") {
5936 // Namespace is the only layout field that has a meaningful
5937 // null/none value (an empty string means the default layout). Removing it
5938 // is equivalent to a setxattr with an empty string: pass through the empty
5939 // payload of the rmxattr request to do this.
5940 handle_set_vxattr(mdr, cur);
5941 return;
5942 }
5943
5944 respond_to_request(mdr, -CEPHFS_ENODATA);
5945 }
5946
5947 const Server::XattrHandler Server::xattr_handlers[] = {
5948 {
5949 xattr_name: Server::DEFAULT_HANDLER,
5950 description: "default xattr handler",
5951 validate: &Server::default_xattr_validate,
5952 setxattr: &Server::default_setxattr_handler,
5953 removexattr: &Server::default_removexattr_handler,
5954 },
5955 {
5956 xattr_name: "ceph.mirror.info",
5957 description: "mirror info xattr handler",
5958 validate: &Server::mirror_info_xattr_validate,
5959 setxattr: &Server::mirror_info_setxattr_handler,
5960 removexattr: &Server::mirror_info_removexattr_handler
5961 },
5962 };
5963
5964 const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
5965 const XattrHandler *default_xattr_handler = nullptr;
5966
5967 for (auto &handler : xattr_handlers) {
5968 if (handler.xattr_name == Server::DEFAULT_HANDLER) {
5969 ceph_assert(default_xattr_handler == nullptr);
5970 default_xattr_handler = &handler;
5971 }
5972 if (handler.xattr_name == xattr_name) {
5973 dout(20) << "handler=" << handler.description << dendl;
5974 return &handler;
5975 }
5976 }
5977
5978 ceph_assert(default_xattr_handler != nullptr);
5979 dout(20) << "handler=" << default_xattr_handler->description << dendl;
5980 return default_xattr_handler;
5981 }
5982
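/*
 * Common validation for plain xattr ops: enforce XATTR_CREATE/XATTR_REPLACE
 * semantics for SETXATTR and require the attribute to exist for RMXATTR.
 * Returns 0 or a negative CEPHFS_* error.
 */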
5983 int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
5984 const std::string &xattr_name, int op, int flags) {
5985 if (op == CEPH_MDS_OP_SETXATTR) {
5986 if (xattrs) {
5987 if ((flags & CEPH_XATTR_CREATE) && xattrs->count(mempool::mds_co::string(xattr_name))) {
5988 dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
5989 return -CEPHFS_EEXIST;
5990 }
5991 }
5992 if ((flags & CEPH_XATTR_REPLACE) && !(xattrs && xattrs->count(mempool::mds_co::string(xattr_name)))) {
5993 dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
5994 return -CEPHFS_ENODATA;
5995 }
5996
5997 return 0;
5998 }
5999
6000 if (op == CEPH_MDS_OP_RMXATTR) {
6001 if (!xattrs || xattrs->count(mempool::mds_co::string(xattr_name)) == 0) {
6002 dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
6003 return -CEPHFS_ENODATA;
6004 }
6005
6006 return 0;
6007 }
6008
6009 derr << ": unhandled validation for: " << xattr_name << dendl;
6010 return -CEPHFS_EINVAL;
6011 }
6012
6013 void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
6014 const bufferlist &xattr_value) {
6015 size_t len = xattr_value.length();
6016 bufferptr b = buffer::create(len);
6017 if (len) {
6018 xattr_value.begin().copy(len, b.c_str());
6019 }
6020 auto em = xattrs->emplace(std::piecewise_construct,
6021 std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
6022 std::forward_as_tuple(b));
6023 if (!em.second) {
6024 em.first->second = b;
6025 }
6026 }
6027
6028 void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
6029 xattrs->erase(mempool::mds_co::string(xattr_name));
6030 }
6031
6032 int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6033 XattrOp *xattr_op) {
6034 return xattr_validate(cur, xattrs, xattr_op->xattr_name, xattr_op->op, xattr_op->flags);
6035 }
6036
6037 void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6038 const XattrOp &xattr_op) {
6039 xattr_set(xattrs, xattr_op.xattr_name, xattr_op.xattr_value);
6040 }
6041
6042 void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6043 const XattrOp &xattr_op) {
6044 xattr_rm(xattrs, xattr_op.xattr_name);
6045 }
6046
6047 // mirror info xattr handlers
6048 const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX = "^cluster_id=([a-f0-9]{8}-" \
6049 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6050 "[a-f0-9]{4}-[a-f0-9]{12})" \
6051 " fs_id=(\\d+)$";
6052 const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
6053 const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
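// Split a "ceph.mirror.info" value of the form "cluster_id=<uuid> fs_id=<n>"
// (see MIRROR_INFO_REGEX above) into its cluster_id and fs_id components.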
6054 int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
6055 std::string &cluster_id, std::string &fs_id) {
6056 dout(20) << "parsing name=" << name << ", value=" << value << dendl;
6057
6058 static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
6059 std::smatch match;
6060
6061 std::regex_search(value, match, regex);
6062 if (match.size() != 3) {
6063 derr << "mirror info parse error" << dendl;
6064 return -CEPHFS_EINVAL;
6065 }
6066
6067 cluster_id = match[1];
6068 fs_id = match[2];
6069 dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
6070 return 0;
6071 }
6072
6073 int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6074 XattrOp *xattr_op) {
6075 if (!cur->is_root()) {
6076 return -CEPHFS_EINVAL;
6077 }
6078
6079 int v1 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, xattr_op->op, xattr_op->flags);
6080 int v2 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, xattr_op->op, xattr_op->flags);
6081 if (v1 != v2) {
6082 derr << "inconsistent mirror info state (" << v1 << "," << v2 << ")" << dendl;
6083 return -CEPHFS_EINVAL;
6084 }
6085
6086 if (v1 < 0) {
6087 return v1;
6088 }
6089
6090 if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
6091 return 0;
6092 }
6093
6094 std::string cluster_id;
6095 std::string fs_id;
6096 int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
6097 cluster_id, fs_id);
6098 if (r < 0) {
6099 return r;
6100 }
6101
6102 xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
6103 return 0;
6104 }
6105
6106 void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6107 const XattrOp &xattr_op) {
6108 auto mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
6109
6110 bufferlist bl;
6111 bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
6112 xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
6113
6114 bl.clear();
6115 bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
6116 xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
6117 }
6118
6119 void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6120 const XattrOp &xattr_op) {
6121 xattr_rm(xattrs, Server::MirrorXattrInfo::CLUSTER_ID);
6122 xattr_rm(xattrs, Server::MirrorXattrInfo::FS_ID);
6123 }
6124
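/*
 * Handle CEPH_MDS_OP_SETXATTR. ceph.* virtual xattrs are dispatched to
 * handle_set_vxattr(); everything else goes through the per-xattr handler
 * table, with the total key/value size capped by mds_max_xattr_pairs_size.
 */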
6125 void Server::handle_client_setxattr(MDRequestRef& mdr)
6126 {
6127 const cref_t<MClientRequest> &req = mdr->client_request;
6128 string name(req->get_path2());
6129
6130 // is a ceph virtual xattr?
6131 if (is_ceph_vxattr(name)) {
6132 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6133 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6134 if (!cur)
6135 return;
6136
6137 handle_set_vxattr(mdr, cur);
6138 return;
6139 }
6140
6141 if (!is_allowed_ceph_xattr(name)) {
6142 respond_to_request(mdr, -CEPHFS_EINVAL);
6143 return;
6144 }
6145
6146 CInode *cur = rdlock_path_pin_ref(mdr, true);
6147 if (!cur)
6148 return;
6149
6150 if (mdr->snapid != CEPH_NOSNAP) {
6151 respond_to_request(mdr, -CEPHFS_EROFS);
6152 return;
6153 }
6154
6155 int flags = req->head.args.setxattr.flags;
6156
6157 MutationImpl::LockOpVec lov;
6158 lov.add_xlock(&cur->xattrlock);
6159 if (!mds->locker->acquire_locks(mdr, lov))
6160 return;
6161
6162 if (!check_access(mdr, cur, MAY_WRITE))
6163 return;
6164
6165 size_t len = req->get_data().length();
6166 size_t inc = len + name.length();
6167
6168 auto handler = Server::get_xattr_or_default_handler(name);
6169 const auto& pxattrs = cur->get_projected_xattrs();
6170 if (pxattrs) {
6171 // check xattrs kv pairs size
6172 size_t cur_xattrs_size = 0;
6173 for (const auto& p : *pxattrs) {
6174 if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
6175 continue;
6176 }
6177 cur_xattrs_size += p.first.length() + p.second.length();
6178 }
6179
6180 if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
6181 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6182 << cur_xattrs_size << ", inc " << inc << dendl;
6183 respond_to_request(mdr, -CEPHFS_ENOSPC);
6184 return;
6185 }
6186 }
6187
6188 XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
6189 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6190 if (r < 0) {
6191 respond_to_request(mdr, r);
6192 return;
6193 }
6194
6195 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
6196
6197 // project update
6198 auto pi = cur->project_inode(mdr, true);
6199 pi.inode->version = cur->pre_dirty();
6200 pi.inode->ctime = mdr->get_op_stamp();
6201 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6202 pi.inode->rstat.rctime = mdr->get_op_stamp();
6203 if (name == "encryption.ctx"sv)
6204 pi.inode->fscrypt = true;
6205 pi.inode->change_attr++;
6206 pi.inode->xattr_version++;
6207
6208 if ((flags & CEPH_XATTR_REMOVE)) {
6209 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6210 } else {
6211 std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
6212 }
6213
6214 // log + wait
6215 mdr->ls = mdlog->get_current_segment();
6216 EUpdate *le = new EUpdate(mdlog, "setxattr");
6217 mdlog->start_entry(le);
6218 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6219 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6220 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6221
6222 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6223 }
6224
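/*
 * Handle CEPH_MDS_OP_RMXATTR, the removal counterpart of the above: virtual
 * xattrs are routed to handle_remove_vxattr(), plain xattrs are validated and
 * erased through their handler and the change is journalled.
 */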
6225 void Server::handle_client_removexattr(MDRequestRef& mdr)
6226 {
6227 const cref_t<MClientRequest> &req = mdr->client_request;
6228 std::string name(req->get_path2());
6229
6230 // is a ceph virtual xattr?
6231 if (is_ceph_vxattr(name)) {
6232 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6233 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6234 if (!cur)
6235 return;
6236
6237 handle_remove_vxattr(mdr, cur);
6238 return;
6239 }
6240
6241 if (!is_allowed_ceph_xattr(name)) {
6242 respond_to_request(mdr, -CEPHFS_EINVAL);
6243 return;
6244 }
6245
6246 CInode* cur = rdlock_path_pin_ref(mdr, true);
6247 if (!cur)
6248 return;
6249
6250 if (mdr->snapid != CEPH_NOSNAP) {
6251 respond_to_request(mdr, -CEPHFS_EROFS);
6252 return;
6253 }
6254
6255 MutationImpl::LockOpVec lov;
6256 lov.add_xlock(&cur->xattrlock);
6257 if (!mds->locker->acquire_locks(mdr, lov))
6258 return;
6259
6260
6261 auto handler = Server::get_xattr_or_default_handler(name);
6262 bufferlist bl;
6263 XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
6264
6265 const auto& pxattrs = cur->get_projected_xattrs();
6266 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6267 if (r < 0) {
6268 respond_to_request(mdr, r);
6269 return;
6270 }
6271
6272 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
6273
6274 // project update
6275 auto pi = cur->project_inode(mdr, true);
6276 pi.inode->version = cur->pre_dirty();
6277 pi.inode->ctime = mdr->get_op_stamp();
6278 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6279 pi.inode->rstat.rctime = mdr->get_op_stamp();
6280 pi.inode->change_attr++;
6281 pi.inode->xattr_version++;
6282 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6283
6284 // log + wait
6285 mdr->ls = mdlog->get_current_segment();
6286 EUpdate *le = new EUpdate(mdlog, "removexattr");
6287 mdlog->start_entry(le);
6288 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6289 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6290 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6291
6292 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6293 }
6294
6295
6296 // =================================================================
6297 // DIRECTORY and NAMESPACE OPS
6298
6299
6300 // ------------------------------------------------
6301
6302 // MKNOD
6303
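// Journal-commit finisher shared by mknod, mkdir and symlink: links the new
// inode into its dentry, marks everything dirty in the log segment and sends
// the reply.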
6304 class C_MDS_mknod_finish : public ServerLogContext {
6305 CDentry *dn;
6306 CInode *newi;
6307 public:
6308 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
6309 ServerLogContext(s, r), dn(d), newi(ni) {}
6310 void finish(int r) override {
6311 ceph_assert(r == 0);
6312
6313 // link the inode
6314 dn->pop_projected_linkage();
6315
6316 // be a bit hacky with the inode version here.. we decrement it
6317 // just to keep mark_dirty() happy. (we didn't bother projecting
6318 // a new version of the inode since it's just been created)
6319 newi->mark_dirty(mdr->ls);
6320 newi->mark_dirty_parent(mdr->ls, true);
6321
6322 // mkdir?
6323 if (newi->is_dir()) {
6324 CDir *dir = newi->get_dirfrag(frag_t());
6325 ceph_assert(dir);
6326 dir->mark_dirty(mdr->ls);
6327 dir->mark_new(mdr->ls);
6328 }
6329
6330 mdr->apply();
6331
6332 MDRequestRef null_ref;
6333 get_mds()->mdcache->send_dentry_link(dn, null_ref);
6334
6335 if (newi->is_file()) {
6336 get_mds()->locker->share_inode_max_size(newi);
6337 } else if (newi->is_dir()) {
6338 // We do this now so that the linkages on the new directory are stable.
6339 newi->maybe_ephemeral_rand();
6340 }
6341
6342 // hit pop
6343 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
6344
6345 // reply
6346 server->respond_to_request(mdr, 0);
6347 }
6348 };
6349
6350
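/*
 * Handle CEPH_MDS_OP_MKNOD: create a new (typically regular or device) inode
 * under the xlocked dentry. For regular files we optimistically issue RDWR
 * caps, since an NFS-style creator is very likely to write next.
 */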
6351 void Server::handle_client_mknod(MDRequestRef& mdr)
6352 {
6353 const cref_t<MClientRequest> &req = mdr->client_request;
6354 client_t client = mdr->get_client();
6355
6356 unsigned mode = req->head.args.mknod.mode;
6357 if ((mode & S_IFMT) == 0)
6358 mode |= S_IFREG;
6359
6360 mdr->disable_lock_cache();
6361 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
6362 if (!dn)
6363 return;
6364
6365 CDir *dir = dn->get_dir();
6366 CInode *diri = dir->get_inode();
6367 if (!check_access(mdr, diri, MAY_WRITE))
6368 return;
6369 if (!check_fragment_space(mdr, dir))
6370 return;
6371 if (!check_dir_max_entries(mdr, dir))
6372 return;
6373
6374 ceph_assert(dn->get_projected_linkage()->is_null());
6375 if (req->get_alternate_name().size() > alternate_name_max) {
6376 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6377 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6378 return;
6379 }
6380 dn->set_alternate_name(req->get_alternate_name());
6381
6382 // set layout
6383 file_layout_t layout;
6384 if (mdr->dir_layout != file_layout_t())
6385 layout = mdr->dir_layout;
6386 else
6387 layout = mdcache->default_file_layout;
6388
6389 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6390 ceph_assert(newi);
6391
6392 dn->push_projected_linkage(newi);
6393
6394 auto _inode = newi->_get_inode();
6395 _inode->version = dn->pre_dirty();
6396 _inode->rdev = req->head.args.mknod.rdev;
6397 _inode->rstat.rfiles = 1;
6398 _inode->accounted_rstat = _inode->rstat;
6399 if (layout.pool_id != mdcache->default_file_layout.pool_id)
6400 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
6401 _inode->update_backtrace();
6402
6403 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6404 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6405 ceph_assert(follows >= realm->get_newest_seq());
6406
6407 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6408 // want to write to it (e.g., if they are reexporting NFS)
6409 if (S_ISREG(_inode->mode)) {
6410 // issue a cap on the file
6411 int cmode = CEPH_FILE_MODE_RDWR;
6412 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6413 if (cap) {
6414 cap->set_wanted(0);
6415
6416 // put locks in excl mode
6417 newi->filelock.set_state(LOCK_EXCL);
6418 newi->authlock.set_state(LOCK_EXCL);
6419 newi->xattrlock.set_state(LOCK_EXCL);
6420
6421 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
6422 _inode->client_ranges[client].range.first = 0;
6423 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
6424 _inode->client_ranges[client].follows = follows;
6425 newi->mark_clientwriteable();
6426 cap->mark_clientwriteable();
6427 }
6428 }
6429
6430 ceph_assert(dn->first == follows + 1);
6431 newi->first = dn->first;
6432
6433 dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
6434
6435 // prepare finisher
6436 mdr->ls = mdlog->get_current_segment();
6437 EUpdate *le = new EUpdate(mdlog, "mknod");
6438 mdlog->start_entry(le);
6439 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6440 journal_allocated_inos(mdr, &le->metablob);
6441
6442 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6443 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6444 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6445
6446 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6447 mds->balancer->maybe_fragment(dn->get_dir(), false);
6448 }
6449
6450
6451
6452 // MKDIR
6453 /* This function takes responsibility for the passed mdr */
6454 void Server::handle_client_mkdir(MDRequestRef& mdr)
6455 {
6456 const cref_t<MClientRequest> &req = mdr->client_request;
6457
6458 mdr->disable_lock_cache();
6459 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6460 if (!dn)
6461 return;
6462
6463 CDir *dir = dn->get_dir();
6464 CInode *diri = dir->get_inode();
6465
6466 // mkdir check access
6467 if (!check_access(mdr, diri, MAY_WRITE))
6468 return;
6469
6470 if (!check_fragment_space(mdr, dir))
6471 return;
6472 if (!check_dir_max_entries(mdr, dir))
6473 return;
6474
6475 ceph_assert(dn->get_projected_linkage()->is_null());
6476 if (req->get_alternate_name().size() > alternate_name_max) {
6477 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6478 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6479 return;
6480 }
6481 dn->set_alternate_name(req->get_alternate_name());
6482
6483 // new inode
6484 unsigned mode = req->head.args.mkdir.mode;
6485 mode &= ~S_IFMT;
6486 mode |= S_IFDIR;
6487 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6488 ceph_assert(newi);
6489
6490 // it's a directory.
6491 dn->push_projected_linkage(newi);
6492
6493 auto _inode = newi->_get_inode();
6494 _inode->version = dn->pre_dirty();
6495 _inode->rstat.rsubdirs = 1;
6496 _inode->accounted_rstat = _inode->rstat;
6497 _inode->update_backtrace();
6498
6499 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6500 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6501 ceph_assert(follows >= realm->get_newest_seq());
6502
6503 dout(12) << " follows " << follows << dendl;
6504 ceph_assert(dn->first == follows + 1);
6505 newi->first = dn->first;
6506
6507 // ...and that new dir is empty.
6508 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
6509 newdir->state_set(CDir::STATE_CREATING);
6510 newdir->mark_complete();
6511 newdir->_get_fnode()->version = newdir->pre_dirty();
6512
6513 // prepare finisher
6514 mdr->ls = mdlog->get_current_segment();
6515 EUpdate *le = new EUpdate(mdlog, "mkdir");
6516 mdlog->start_entry(le);
6517 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6518 journal_allocated_inos(mdr, &le->metablob);
6519 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6520 le->metablob.add_primary_dentry(dn, newi, true, true);
6521 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
6522
6523 // issue a cap on the directory
6524 int cmode = CEPH_FILE_MODE_RDWR;
6525 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6526 if (cap) {
6527 cap->set_wanted(0);
6528
6529 // put locks in excl mode
6530 newi->filelock.set_state(LOCK_EXCL);
6531 newi->authlock.set_state(LOCK_EXCL);
6532 newi->xattrlock.set_state(LOCK_EXCL);
6533 }
6534
6535 // make sure this inode gets into the journal
6536 le->metablob.add_opened_ino(newi->ino());
6537
6538 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6539
6540 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6541 // have overshot the split size (multiple mkdir in flight), so here is
6542 // an early chance to split the dir if this mkdir makes it oversized.
6543 mds->balancer->maybe_fragment(dir, false);
6544 }
6545
6546
6547 // SYMLINK
6548
6549 void Server::handle_client_symlink(MDRequestRef& mdr)
6550 {
6551 const auto& req = mdr->client_request;
6552
6553 mdr->disable_lock_cache();
6554 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6555 if (!dn)
6556 return;
6557
6558 CDir *dir = dn->get_dir();
6559 CInode *diri = dir->get_inode();
6560
6561 if (!check_access(mdr, diri, MAY_WRITE))
6562 return;
6563 if (!check_fragment_space(mdr, dir))
6564 return;
6565 if (!check_dir_max_entries(mdr, dir))
6566 return;
6567
6568 ceph_assert(dn->get_projected_linkage()->is_null());
6569 if (req->get_alternate_name().size() > alternate_name_max) {
6570 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6571 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6572 }
6573 dn->set_alternate_name(req->get_alternate_name());
6574
6575 unsigned mode = S_IFLNK | 0777;
6576 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6577 ceph_assert(newi);
6578
6579 // it's a symlink
6580 dn->push_projected_linkage(newi);
6581
6582 newi->symlink = req->get_path2();
6583 auto _inode = newi->_get_inode();
6584 _inode->version = dn->pre_dirty();
6585 _inode->size = newi->symlink.length();
6586 _inode->rstat.rbytes = _inode->size;
6587 _inode->rstat.rfiles = 1;
6588 _inode->accounted_rstat = _inode->rstat;
6589 _inode->update_backtrace();
6590
6591 newi->first = dn->first;
6592
6593 // prepare finisher
6594 mdr->ls = mdlog->get_current_segment();
6595 EUpdate *le = new EUpdate(mdlog, "symlink");
6596 mdlog->start_entry(le);
6597 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6598 journal_allocated_inos(mdr, &le->metablob);
6599 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6600 le->metablob.add_primary_dentry(dn, newi, true, true);
6601
6602 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6603 mds->balancer->maybe_fragment(dir, false);
6604 }
6605
6606
6607
6608
6609
6610 // LINK
6611
6612 void Server::handle_client_link(MDRequestRef& mdr)
6613 {
6614 const cref_t<MClientRequest> &req = mdr->client_request;
6615
6616 dout(7) << "handle_client_link " << req->get_filepath()
6617 << " to " << req->get_filepath2()
6618 << dendl;
6619
6620 mdr->disable_lock_cache();
6621
6622 CDentry *destdn;
6623 CInode *targeti;
6624
6625 if (req->get_filepath2().depth() == 0) {
6626 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
6627 if (!targeti) {
6628 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
6629 mdcache->find_ino_peers(req->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr));
6630 return;
6631 }
6632 mdr->pin(targeti);
6633
6634 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
6635 CDentry *pdn = targeti->get_projected_parent_dn();
6636 if (!pdn) {
6637 dout(7) << "target has no parent dn, failing..." << dendl;
6638 respond_to_request(mdr, -CEPHFS_EINVAL);
6639 return;
6640 }
6641 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
6642 return;
6643 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
6644 }
6645
6646 destdn = rdlock_path_xlock_dentry(mdr, false);
6647 if (!destdn)
6648 return;
6649 } else {
6650 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
6651 destdn = ret.first;
6652 if (!destdn)
6653 return;
6654
6655 if (!destdn->get_projected_linkage()->is_null()) {
6656 respond_to_request(mdr, -CEPHFS_EEXIST);
6657 return;
6658 }
6659
6660 targeti = ret.second->get_projected_linkage()->get_inode();
6661 }
6662
6663 ceph_assert(destdn->get_projected_linkage()->is_null());
6664 if (req->get_alternate_name().size() > alternate_name_max) {
6665 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6666 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6667 return;
6668 }
6669 destdn->set_alternate_name(req->get_alternate_name());
6670
6671 if (targeti->is_dir()) {
6672 dout(7) << "target is a dir, failing..." << dendl;
6673 respond_to_request(mdr, -CEPHFS_EINVAL);
6674 return;
6675 }
6676
6677 CDir *dir = destdn->get_dir();
6678 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
6679 dout(7) << "target is " << *targeti << dendl;
6680
6681 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6682 MutationImpl::LockOpVec lov;
6683 lov.add_xlock(&targeti->snaplock);
6684 lov.add_xlock(&targeti->linklock);
6685
6686 if (!mds->locker->acquire_locks(mdr, lov))
6687 return;
6688
6689 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6690 }
6691
6692 if (targeti->get_projected_inode()->nlink == 0) {
6693 dout(7) << "target has no link, failing..." << dendl;
6694 respond_to_request(mdr, -CEPHFS_ENOENT);
6695 return;
6696 }
6697
6698 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6699 if (!check_access(mdr, targeti, MAY_WRITE))
6700 return;
6701
6702 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
6703 return;
6704
6705 if (!check_fragment_space(mdr, dir))
6706 return;
6707
6708 if (!check_dir_max_entries(mdr, dir))
6709 return;
6710 }
6711
6712 CInode* target_pin = targeti->get_projected_parent_dir()->inode;
6713 SnapRealm *target_realm = target_pin->find_snaprealm();
6714 if (target_pin != dir->inode &&
6715 target_realm->get_subvolume_ino() !=
6716 dir->inode->find_snaprealm()->get_subvolume_ino()) {
6717 dout(7) << "target is in different subvolume, failing..." << dendl;
6718 respond_to_request(mdr, -CEPHFS_EXDEV);
6719 return;
6720 }
6721
6722 // go!
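// mds_kill_link_at is a debug/failure-injection option: the numbered asserts
// below abort the MDS at specific points in the link/unlink protocol so that
// recovery of a partially applied operation can be exercised in tests.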
6723 ceph_assert(g_conf()->mds_kill_link_at != 1);
6724
6725 // local or remote?
6726 if (targeti->is_auth())
6727 _link_local(mdr, destdn, targeti, target_realm);
6728 else
6729 _link_remote(mdr, true, destdn, targeti);
6730 mds->balancer->maybe_fragment(dir, false);
6731 }
6732
6733
6734 class C_MDS_link_local_finish : public ServerLogContext {
6735 CDentry *dn;
6736 CInode *targeti;
6737 version_t dnpv;
6738 version_t tipv;
6739 bool adjust_realm;
6740 public:
6741 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
6742 version_t dnpv_, version_t tipv_, bool ar) :
6743 ServerLogContext(s, r), dn(d), targeti(ti),
6744 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
6745 void finish(int r) override {
6746 ceph_assert(r == 0);
6747 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
6748 }
6749 };
6750
6751
6752 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
6753 {
6754 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
6755
6756 mdr->ls = mdlog->get_current_segment();
6757
6758 // predirty NEW dentry
6759 version_t dnpv = dn->pre_dirty();
6760 version_t tipv = targeti->pre_dirty();
6761
6762 // project inode update
6763 auto pi = targeti->project_inode(mdr);
6764 pi.inode->nlink++;
6765 pi.inode->ctime = mdr->get_op_stamp();
6766 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6767 pi.inode->rstat.rctime = mdr->get_op_stamp();
6768 pi.inode->change_attr++;
6769 pi.inode->version = tipv;
6770
6771 bool adjust_realm = false;
6772 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
6773 sr_t *newsnap = targeti->project_snaprealm();
6774 targeti->mark_snaprealm_global(newsnap);
6775 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
6776 adjust_realm = true;
6777 }
6778
6779 // log + wait
6780 EUpdate *le = new EUpdate(mdlog, "link_local");
6781 mdlog->start_entry(le);
6782 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6783 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
6784 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
6785 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6786 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
6787
6788 // do this after predirty_*, to avoid funky extra dnl arg
6789 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6790
6791 journal_and_reply(mdr, targeti, dn, le,
6792 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
6793 }
6794
6795 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
6796 version_t dnpv, version_t tipv, bool adjust_realm)
6797 {
6798 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
6799
6800 // link and unlock the NEW dentry
6801 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6802 if (!dnl->get_inode())
6803 dn->link_remote(dnl, targeti);
6804 dn->mark_dirty(dnpv, mdr->ls);
6805
6806 // target inode
6807 mdr->apply();
6808
6809 MDRequestRef null_ref;
6810 mdcache->send_dentry_link(dn, null_ref);
6811
6812 if (adjust_realm) {
6813 int op = CEPH_SNAP_OP_SPLIT;
6814 mds->mdcache->send_snap_update(targeti, 0, op);
6815 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6816 }
6817
6818 // bump target popularity
6819 mds->balancer->hit_inode(targeti, META_POP_IWR);
6820 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6821
6822 // reply
6823 respond_to_request(mdr, 0);
6824 }
6825
6826
6827 // link / unlink remote
6828
6829 class C_MDS_link_remote_finish : public ServerLogContext {
6830 bool inc;
6831 CDentry *dn;
6832 CInode *targeti;
6833 version_t dpv;
6834 public:
6835 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
6836 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
6837 dpv(d->get_projected_version()) {}
6838 void finish(int r) override {
6839 ceph_assert(r == 0);
6840 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
6841 }
6842 };
6843
6844 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
6845 {
6846 dout(10) << "_link_remote "
6847 << (inc ? "link ":"unlink ")
6848 << *dn << " to " << *targeti << dendl;
6849
6850 // 1. send LinkPrepare to dest (journal nlink++ prepare)
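// remote link/unlink is a two-phase update: this (leader) MDS first asks the
// target inode's auth MDS to prepare and journal the nlink change
// (OP_LINKPREP / OP_UNLINKPREP), waits for the LINKPREPACK witness, then
// journals its own dentry update; the peer later journals a commit (and acks
// with OP_COMMITTED) or undoes the prepare through do_link_rollback().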
6851 mds_rank_t linkauth = targeti->authority().first;
6852 if (mdr->more()->witnessed.count(linkauth) == 0) {
6853 if (mds->is_cluster_degraded() &&
6854 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
6855 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
6856 if (mdr->more()->waiting_on_peer.empty())
6857 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
6858 return;
6859 }
6860
6861 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
6862 int op;
6863 if (inc)
6864 op = MMDSPeerRequest::OP_LINKPREP;
6865 else
6866 op = MMDSPeerRequest::OP_UNLINKPREP;
6867 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, op);
6868 targeti->set_object_info(req->get_object_info());
6869 req->op_stamp = mdr->get_op_stamp();
6870 if (auto& desti_srnode = mdr->more()->desti_srnode)
6871 encode(*desti_srnode, req->desti_snapbl);
6872 mds->send_message_mds(req, linkauth);
6873
6874 ceph_assert(mdr->more()->waiting_on_peer.count(linkauth) == 0);
6875 mdr->more()->waiting_on_peer.insert(linkauth);
6876 return;
6877 }
6878 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
6879
6880 ceph_assert(g_conf()->mds_kill_link_at != 2);
6881
6882 if (auto& desti_srnode = mdr->more()->desti_srnode) {
6883 delete desti_srnode;
6884 desti_srnode = NULL;
6885 }
6886
6887 mdr->set_mds_stamp(ceph_clock_now());
6888
6889 // add to event
6890 mdr->ls = mdlog->get_current_segment();
6891 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
6892 mdlog->start_entry(le);
6893 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6894 if (!mdr->more()->witnessed.empty()) {
6895 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
6896 le->reqid = mdr->reqid;
6897 le->had_peers = true;
6898 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6899 }
6900
6901 if (inc) {
6902 dn->pre_dirty();
6903 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
6904 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6905 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6906 } else {
6907 dn->pre_dirty();
6908 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
6909 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6910 le->metablob.add_null_dentry(dn, true);
6911 dn->push_projected_linkage();
6912 }
6913
6914 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
6915 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
6916 }
6917
6918 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
6919 CDentry *dn, CInode *targeti,
6920 version_t dpv)
6921 {
6922 dout(10) << "_link_remote_finish "
6923 << (inc ? "link ":"unlink ")
6924 << *dn << " to " << *targeti << dendl;
6925
6926 ceph_assert(g_conf()->mds_kill_link_at != 3);
6927
6928 if (!mdr->more()->witnessed.empty())
6929 mdcache->logged_leader_update(mdr->reqid);
6930
6931 if (inc) {
6932 // link the new dentry
6933 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6934 if (!dnl->get_inode())
6935 dn->link_remote(dnl, targeti);
6936 dn->mark_dirty(dpv, mdr->ls);
6937 } else {
6938 // unlink main dentry
6939 dn->get_dir()->unlink_inode(dn);
6940 dn->pop_projected_linkage();
6941 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
6942 }
6943
6944 mdr->apply();
6945
6946 MDRequestRef null_ref;
6947 if (inc)
6948 mdcache->send_dentry_link(dn, null_ref);
6949 else
6950 mdcache->send_dentry_unlink(dn, NULL, null_ref);
6951
6952 // bump target popularity
6953 mds->balancer->hit_inode(targeti, META_POP_IWR);
6954 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6955
6956 // reply
6957 respond_to_request(mdr, 0);
6958
6959 if (!inc)
6960 // removing a new dn?
6961 dn->get_dir()->try_remove_unlinked_dn(dn);
6962 }
6963
6964
6965 // remote linking/unlinking
6966
6967 class C_MDS_PeerLinkPrep : public ServerLogContext {
6968 CInode *targeti;
6969 bool adjust_realm;
6970 public:
6971 C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
6972 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
6973 void finish(int r) override {
6974 ceph_assert(r == 0);
6975 server->_logged_peer_link(mdr, targeti, adjust_realm);
6976 }
6977 };
6978
6979 class C_MDS_PeerLinkCommit : public ServerContext {
6980 MDRequestRef mdr;
6981 CInode *targeti;
6982 public:
6983 C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
6984 ServerContext(s), mdr(r), targeti(t) { }
6985 void finish(int r) override {
6986 server->_commit_peer_link(mdr, r, targeti);
6987 }
6988 };
6989
6990 void Server::handle_peer_link_prep(MDRequestRef& mdr)
6991 {
6992 dout(10) << "handle_peer_link_prep " << *mdr
6993 << " on " << mdr->peer_request->get_object_info()
6994 << dendl;
6995
6996 ceph_assert(g_conf()->mds_kill_link_at != 4);
6997
6998 CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
6999 ceph_assert(targeti);
7000 dout(10) << "targeti " << *targeti << dendl;
7001 CDentry *dn = targeti->get_parent_dn();
7002 CDentry::linkage_t *dnl = dn->get_linkage();
7003 ceph_assert(dnl->is_primary());
7004
7005 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7006
7007 mdr->auth_pin(targeti);
7008
7009 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
7010 ceph_assert(g_conf()->mds_kill_link_at != 5);
7011
7012 // journal it
7013 mdr->ls = mdlog->get_current_segment();
7014 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
7015 EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
7016 mdlog->start_entry(le);
7017
7018 auto pi = dnl->get_inode()->project_inode(mdr);
7019
7020 // update journaled target inode
7021 bool inc;
7022 bool adjust_realm = false;
7023 bool realm_projected = false;
7024 if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
7025 inc = true;
7026 pi.inode->nlink++;
7027
7028 CDentry *target_pdn = targeti->get_projected_parent_dn();
7029 SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
7030 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
7031 sr_t *newsnap = targeti->project_snaprealm();
7032 targeti->mark_snaprealm_global(newsnap);
7033 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
7034 adjust_realm = true;
7035 realm_projected = true;
7036 }
7037 } else {
7038 inc = false;
7039 pi.inode->nlink--;
7040 if (targeti->is_projected_snaprealm_global()) {
7041 ceph_assert(mdr->peer_request->desti_snapbl.length());
7042 auto p = mdr->peer_request->desti_snapbl.cbegin();
7043
7044 sr_t *newsnap = targeti->project_snaprealm();
7045 decode(*newsnap, p);
7046
7047 if (pi.inode->nlink == 0)
7048 ceph_assert(!newsnap->is_parent_global());
7049
7050 realm_projected = true;
7051 } else {
7052 ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
7053 }
7054 }
7055
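// record just enough state to undo this prepare if the leader aborts: the
// target's old ctime, the parent dir's old mtime/rctime, whether nlink was
// incremented, and (if a new snaprealm was projected) the old snaprealm blob.
// do_link_rollback() below replays this, possibly during resolve.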
7056 link_rollback rollback;
7057 rollback.reqid = mdr->reqid;
7058 rollback.ino = targeti->ino();
7059 rollback.old_ctime = targeti->get_inode()->ctime; // we hold versionlock xlock; no concurrent projections
7060 const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
7061 rollback.old_dir_mtime = pf->fragstat.mtime;
7062 rollback.old_dir_rctime = pf->rstat.rctime;
7063 rollback.was_inc = inc;
7064 if (realm_projected) {
7065 if (targeti->snaprealm) {
7066 encode(true, rollback.snapbl);
7067 targeti->encode_snap_blob(rollback.snapbl);
7068 } else {
7069 encode(false, rollback.snapbl);
7070 }
7071 }
7072 encode(rollback, le->rollback);
7073 mdr->more()->rollback_bl = le->rollback;
7074
7075 pi.inode->ctime = mdr->get_op_stamp();
7076 pi.inode->version = targeti->pre_dirty();
7077
7078 dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
7079
7080 // commit case
7081 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
7082 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
7083 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7084
7085 // set up commit waiter
7086 mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
7087
7088 mdr->more()->peer_update_journaled = true;
7089 submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
7090 mdr, __func__);
7091 mdlog->flush();
7092 }
7093
7094 void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
7095 {
7096 dout(10) << "_logged_peer_link " << *mdr
7097 << " " << *targeti << dendl;
7098
7099 ceph_assert(g_conf()->mds_kill_link_at != 6);
7100
7101 // update the target
7102 mdr->apply();
7103
7104 // hit pop
7105 mds->balancer->hit_inode(targeti, META_POP_IWR);
7106
7107 // done.
7108 mdr->reset_peer_request();
7109
7110 if (adjust_realm) {
7111 int op = CEPH_SNAP_OP_SPLIT;
7112 mds->mdcache->send_snap_update(targeti, 0, op);
7113 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7114 }
7115
7116 // ack
7117 if (!mdr->aborted) {
7118 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_LINKPREPACK);
7119 mds->send_message_mds(reply, mdr->peer_to_mds);
7120 } else {
7121 dout(10) << " abort flag set, finishing" << dendl;
7122 mdcache->request_finish(mdr);
7123 }
7124 }
7125
7126
7127 struct C_MDS_CommittedPeer : public ServerLogContext {
7128 C_MDS_CommittedPeer(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
7129 void finish(int r) override {
7130 server->_committed_peer(mdr);
7131 }
7132 };
7133
7134 void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
7135 {
7136 dout(10) << "_commit_peer_link " << *mdr
7137 << " r=" << r
7138 << " " << *targeti << dendl;
7139
7140 ceph_assert(g_conf()->mds_kill_link_at != 7);
7141
7142 if (r == 0) {
7143 // drop our pins, etc.
7144 mdr->cleanup();
7145
7146 // write a commit to the journal
7147 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid, mdr->peer_to_mds,
7148 EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
7149 mdlog->start_entry(le);
7150 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7151 mdlog->flush();
7152 } else {
7153 do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7154 }
7155 }
7156
7157 void Server::_committed_peer(MDRequestRef& mdr)
7158 {
7159 dout(10) << "_committed_peer " << *mdr << dendl;
7160
7161 ceph_assert(g_conf()->mds_kill_link_at != 8);
7162
7163 bool assert_exist = mdr->more()->peer_update_journaled;
7164 mdcache->finish_uncommitted_peer(mdr->reqid, assert_exist);
7165 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_COMMITTED);
7166 mds->send_message_mds(req, mdr->peer_to_mds);
7167 mdcache->request_finish(mdr);
7168 }
7169
7170 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
7171 MutationRef mut;
7172 map<client_t,ref_t<MClientSnap>> splits;
7173 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
7174 map<client_t,ref_t<MClientSnap>>&& _splits) :
7175 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
7176 }
7177 void finish(int r) override {
7178 server->_link_rollback_finish(mut, mdr, splits);
7179 }
7180 };
7181
7182 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7183 {
7184 link_rollback rollback;
7185 auto p = rbl.cbegin();
7186 decode(rollback, p);
7187
7188 dout(10) << "do_link_rollback on " << rollback.reqid
7189 << (rollback.was_inc ? " inc":" dec")
7190 << " ino " << rollback.ino
7191 << dendl;
7192
7193 ceph_assert(g_conf()->mds_kill_link_at != 9);
7194
7195 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
7196 ceph_assert(mdr || mds->is_resolve());
7197
7198 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7199 mut->ls = mds->mdlog->get_current_segment();
7200
7201 CInode *in = mdcache->get_inode(rollback.ino);
7202 ceph_assert(in);
7203 dout(10) << " target is " << *in << dendl;
7204 ceph_assert(!in->is_projected()); // live peer request holds the versionlock xlock.
7205
7206 auto pi = in->project_inode(mut);
7207 pi.inode->version = in->pre_dirty();
7208
7209 // parent dir rctime
7210 CDir *parent = in->get_projected_parent_dn()->get_dir();
7211 auto pf = parent->project_fnode(mut);
7212 pf->version = parent->pre_dirty();
7213 if (pf->fragstat.mtime == pi.inode->ctime) {
7214 pf->fragstat.mtime = rollback.old_dir_mtime;
7215 if (pf->rstat.rctime == pi.inode->ctime)
7216 pf->rstat.rctime = rollback.old_dir_rctime;
7217 mut->add_updated_lock(&parent->get_inode()->filelock);
7218 mut->add_updated_lock(&parent->get_inode()->nestlock);
7219 }
7220
7221 // inode
7222 pi.inode->ctime = rollback.old_ctime;
7223 if (rollback.was_inc)
7224 pi.inode->nlink--;
7225 else
7226 pi.inode->nlink++;
7227
7228 map<client_t,ref_t<MClientSnap>> splits;
7229 if (rollback.snapbl.length() && in->snaprealm) {
7230 bool hadrealm;
7231 auto p = rollback.snapbl.cbegin();
7232 decode(hadrealm, p);
7233 if (hadrealm) {
7234 if (!mds->is_resolve()) {
7235 sr_t *new_srnode = new sr_t();
7236 decode(*new_srnode, p);
7237 in->project_snaprealm(new_srnode);
7238 } else {
7239 decode(in->snaprealm->srnode, p);
7240 }
7241 } else {
7242 SnapRealm *realm = parent->get_inode()->find_snaprealm();
7243 if (!mds->is_resolve())
7244 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
7245 in->project_snaprealm(NULL);
7246 }
7247 }
7248
7249 // journal it
7250 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
7251 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
7252 mdlog->start_entry(le);
7253 le->commit.add_dir_context(parent);
7254 le->commit.add_dir(parent, true);
7255 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
7256
7257 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
7258 mdr, __func__);
7259 mdlog->flush();
7260 }
7261
7262 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
7263 map<client_t,ref_t<MClientSnap>>& splits)
7264 {
7265 dout(10) << "_link_rollback_finish" << dendl;
7266
7267 ceph_assert(g_conf()->mds_kill_link_at != 10);
7268
7269 mut->apply();
7270
7271 if (!mds->is_resolve())
7272 mdcache->send_snaps(splits);
7273
7274 if (mdr)
7275 mdcache->request_finish(mdr);
7276
7277 mdcache->finish_rollback(mut->reqid, mdr);
7278
7279 mut->cleanup();
7280 }
7281
7282
7283 void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
7284 {
7285 dout(10) << "handle_peer_link_prep_ack " << *mdr
7286 << " " << *m << dendl;
7287 mds_rank_t from = mds_rank_t(m->get_source().num());
7288
7289 ceph_assert(g_conf()->mds_kill_link_at != 11);
7290
7291 // note peer
7292 mdr->more()->peers.insert(from);
7293
7294 // witnessed!
7295 ceph_assert(mdr->more()->witnessed.count(from) == 0);
7296 mdr->more()->witnessed.insert(from);
7297 ceph_assert(!m->is_not_journaled());
7298 mdr->more()->has_journaled_peers = true;
7299
7300 // remove from waiting list
7301 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7302 mdr->more()->waiting_on_peer.erase(from);
7303
7304 ceph_assert(mdr->more()->waiting_on_peer.empty());
7305
7306 dispatch_client_request(mdr); // go again!
7307 }
7308
7309
7310
7311
7312
7313 // UNLINK
7314
7315 void Server::handle_client_unlink(MDRequestRef& mdr)
7316 {
7317 const cref_t<MClientRequest> &req = mdr->client_request;
7318 client_t client = mdr->get_client();
7319
7320 // rmdir or unlink?
7321 bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
7322
7323 if (rmdir)
7324 mdr->disable_lock_cache();
7325 CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
7326 if (!dn)
7327 return;
7328
7329 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
7330 ceph_assert(!dnl->is_null());
7331 CInode *in = dnl->get_inode();
7332
7333 if (rmdir) {
7334 dout(7) << "handle_client_rmdir on " << *dn << dendl;
7335 } else {
7336 dout(7) << "handle_client_unlink on " << *dn << dendl;
7337 }
7338 dout(7) << "dn links to " << *in << dendl;
7339
7340 // rmdir vs is_dir
7341 if (in->is_dir()) {
7342 if (rmdir) {
7343 // do empty directory checks
7344 if (_dir_is_nonempty_unlocked(mdr, in)) {
7345 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7346 return;
7347 }
7348 } else {
7349 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
7350 respond_to_request(mdr, -CEPHFS_EISDIR);
7351 return;
7352 }
7353 } else {
7354 if (rmdir) {
7355 // unlink
7356 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
7357 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7358 return;
7359 }
7360 }
7361
7362 CInode *diri = dn->get_dir()->get_inode();
7363 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7364 if (!check_access(mdr, diri, MAY_WRITE))
7365 return;
7366 }
7367
7368 // -- create stray dentry? --
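// a primary linkage needs a stray dentry: on unlink the inode is moved under
// the MDS's stray directory so it can be purged once nothing references it,
// or reintegrated if a remote link remains.  a remote linkage only drops a
// reference, so no stray dentry is needed.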
7369 CDentry *straydn = NULL;
7370 if (dnl->is_primary()) {
7371 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
7372 if (!straydn)
7373 return;
7374 dout(10) << " straydn is " << *straydn << dendl;
7375 } else if (mdr->straydn) {
7376 mdr->unpin(mdr->straydn);
7377 mdr->straydn = NULL;
7378 }
7379
7380 // lock
7381 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7382 MutationImpl::LockOpVec lov;
7383
7384 lov.add_xlock(&in->linklock);
7385 lov.add_xlock(&in->snaplock);
7386 if (in->is_dir())
7387 lov.add_rdlock(&in->filelock); // to verify it's empty
7388
7389 if (straydn) {
7390 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7391 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7392 lov.add_xlock(&straydn->lock);
7393 }
7394
7395 if (!mds->locker->acquire_locks(mdr, lov))
7396 return;
7397
7398 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7399 }
7400
7401 if (in->is_dir() &&
7402 _dir_is_nonempty(mdr, in)) {
7403 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7404 return;
7405 }
7406
7407 if (straydn)
7408 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7409
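// set up the snaprealm changes this unlink implies before journaling: drop
// the inode out of its global (hard-link shared) realm when the last usable
// linkage goes away, or, for a primary link that existing snapshots may
// cover, record the stray directory's realm as a past parent.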
7410 if (!mdr->more()->desti_srnode) {
7411 if (in->is_projected_snaprealm_global()) {
7412 sr_t *new_srnode = in->prepare_new_srnode(0);
7413 in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
7414 // dropping the last linkage or dropping the last remote linkage,
7415 // detach the inode from the global snaprealm
7416 auto nlink = in->get_projected_inode()->nlink;
7417 if (nlink == 1 ||
7418 (nlink == 2 && !dnl->is_primary() &&
7419 !in->get_projected_parent_dir()->inode->is_stray()))
7420 in->clear_snaprealm_global(new_srnode);
7421 mdr->more()->desti_srnode = new_srnode;
7422 } else if (dnl->is_primary()) {
7423 // prepare snaprealm blob for peer request
7424 SnapRealm *realm = in->find_snaprealm();
7425 snapid_t follows = realm->get_newest_seq();
7426 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
7427 sr_t *new_srnode = in->prepare_new_srnode(follows);
7428 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7429 mdr->more()->desti_srnode = new_srnode;
7430 }
7431 }
7432 }
7433
7434 // yay!
7435 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
7436 // subtree root auths need to be witnesses
7437 set<mds_rank_t> witnesses;
7438 in->list_replicas(witnesses);
7439 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7440
7441 for (set<mds_rank_t>::iterator p = witnesses.begin();
7442 p != witnesses.end();
7443 ++p) {
7444 if (mdr->more()->witnessed.count(*p)) {
7445 dout(10) << " already witnessed by mds." << *p << dendl;
7446 } else if (mdr->more()->waiting_on_peer.count(*p)) {
7447 dout(10) << " already waiting on witness mds." << *p << dendl;
7448 } else {
7449 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
7450 return;
7451 }
7452 }
7453 if (!mdr->more()->waiting_on_peer.empty())
7454 return; // we're waiting for a witness.
7455 }
7456
7457 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
7458 mds->locker->create_lock_cache(mdr, diri);
7459
7460 // ok!
7461 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
7462 _link_remote(mdr, false, dn, dnl->get_inode());
7463 else
7464 _unlink_local(mdr, dn, straydn);
7465 }
7466
7467 class C_MDS_unlink_local_finish : public ServerLogContext {
7468 CDentry *dn;
7469 CDentry *straydn;
7470 version_t dnpv; // deleted dentry
7471 public:
7472 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
7473 ServerLogContext(s, r), dn(d), straydn(sd),
7474 dnpv(d->get_projected_version()) {}
7475 void finish(int r) override {
7476 ceph_assert(r == 0);
7477 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
7478 }
7479 };
7480
7481 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7482 {
7483 dout(10) << "_unlink_local " << *dn << dendl;
7484
7485 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7486 CInode *in = dnl->get_inode();
7487
7488
7489 // ok, let's do it.
7490 mdr->ls = mdlog->get_current_segment();
7491
7492 // prepare log entry
7493 EUpdate *le = new EUpdate(mdlog, "unlink_local");
7494 mdlog->start_entry(le);
7495 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7496 if (!mdr->more()->witnessed.empty()) {
7497 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7498 le->reqid = mdr->reqid;
7499 le->had_peers = true;
7500 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7501 }
7502
7503 if (straydn) {
7504 ceph_assert(dnl->is_primary());
7505 straydn->push_projected_linkage(in);
7506 }
7507
7508 // the unlinked dentry
7509 dn->pre_dirty();
7510
7511 auto pi = in->project_inode(mdr);
7512 {
7513 std::string t;
7514 dn->make_path_string(t, true);
7515 pi.inode->stray_prior_path = std::move(t);
7516 }
7517 pi.inode->version = in->pre_dirty();
7518 pi.inode->ctime = mdr->get_op_stamp();
7519 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7520 pi.inode->rstat.rctime = mdr->get_op_stamp();
7521 pi.inode->change_attr++;
7522 pi.inode->nlink--;
7523 if (pi.inode->nlink == 0)
7524 in->state_set(CInode::STATE_ORPHAN);
7525
7526 if (mdr->more()->desti_srnode) {
7527 auto& desti_srnode = mdr->more()->desti_srnode;
7528 in->project_snaprealm(desti_srnode);
7529 desti_srnode = NULL;
7530 }
7531
7532 if (straydn) {
7533 // will manually pop projected inode
7534
7535 // primary link. add stray dentry.
7536 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
7537 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7538
7539 pi.inode->update_backtrace();
7540 le->metablob.add_primary_dentry(straydn, in, true, true);
7541 } else {
7542 // remote link. update remote inode.
7543 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
7544 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
7545 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
7546 }
7547
7548 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7549 le->metablob.add_null_dentry(dn, true);
7550
7551 if (in->is_dir()) {
7552 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7553 le->metablob.renamed_dirino = in->ino();
7554 }
7555
7556 dn->push_projected_linkage();
7557
7558 if (straydn) {
7559 ceph_assert(in->first <= straydn->first);
7560 in->first = straydn->first;
7561 }
7562
7563 if (in->is_dir()) {
7564 ceph_assert(straydn);
7565 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7566 }
7567
7568 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
7569 }
7570
7571 void Server::_unlink_local_finish(MDRequestRef& mdr,
7572 CDentry *dn, CDentry *straydn,
7573 version_t dnpv)
7574 {
7575 dout(10) << "_unlink_local_finish " << *dn << dendl;
7576
7577 if (!mdr->more()->witnessed.empty())
7578 mdcache->logged_leader_update(mdr->reqid);
7579
7580 CInode *strayin = NULL;
7581 bool hadrealm = false;
7582 if (straydn) {
7583 // if there is a newly created snaprealm, we need to split the old snaprealm's
7584 // inodes_with_caps, so pop the snaprealm before the linkage changes.
7585 strayin = dn->get_linkage()->get_inode();
7586 hadrealm = strayin->snaprealm ? true : false;
7587 strayin->early_pop_projected_snaprealm();
7588 }
7589
7590 // unlink main dentry
7591 dn->get_dir()->unlink_inode(dn);
7592 dn->pop_projected_linkage();
7593 dn->mark_dirty(dnpv, mdr->ls);
7594
7595 // relink as stray? (i.e. was primary link?)
7596 if (straydn) {
7597 dout(20) << " straydn is " << *straydn << dendl;
7598 straydn->pop_projected_linkage();
7599 mdcache->touch_dentry_bottom(straydn);
7600 }
7601
7602 mdr->apply();
7603
7604 mdcache->send_dentry_unlink(dn, straydn, mdr);
7605
7606 if (straydn) {
7607 // update subtree map?
7608 if (strayin->is_dir())
7609 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
7610
7611 if (strayin->snaprealm && !hadrealm)
7612 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
7613 }
7614
7615 // bump pop
7616 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7617
7618 // reply
7619 respond_to_request(mdr, 0);
7620
7621 // removing a new dn?
7622 dn->get_dir()->try_remove_unlinked_dn(dn);
7623
7624 // clean up ?
7625 // respond_to_request() drops locks. So stray reintegration can race with us.
7626 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7627 // Tip off the MDCache that this dentry is a stray that
7628 // might be eligible for purge.
7629 mdcache->notify_stray(straydn);
7630 }
7631 }
7632
7633 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
7634 {
7635 if (mds->is_cluster_degraded() &&
7636 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7637 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
7638 if (mdr->more()->waiting_on_peer.empty())
7639 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7640 return false;
7641 }
7642
7643 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
7644 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREP);
7645 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
7646 for (auto dn : trace)
7647 req->srcdnpath.push_dentry(dn->get_name());
7648 mdcache->encode_replica_stray(straydn, who, req->straybl);
7649 if (mdr->more()->desti_srnode)
7650 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
7651
7652 req->op_stamp = mdr->get_op_stamp();
7653 mds->send_message_mds(req, who);
7654
7655 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
7656 mdr->more()->waiting_on_peer.insert(who);
7657 return true;
7658 }
7659
7660 struct C_MDS_PeerRmdirPrep : public ServerLogContext {
7661 CDentry *dn, *straydn;
7662 C_MDS_PeerRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
7663 : ServerLogContext(s, r), dn(d), straydn(st) {}
7664 void finish(int r) override {
7665 server->_logged_peer_rmdir(mdr, dn, straydn);
7666 }
7667 };
7668
7669 struct C_MDS_PeerRmdirCommit : public ServerContext {
7670 MDRequestRef mdr;
7671 CDentry *straydn;
7672 C_MDS_PeerRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
7673 : ServerContext(s), mdr(r), straydn(sd) { }
7674 void finish(int r) override {
7675 server->_commit_peer_rmdir(mdr, r, straydn);
7676 }
7677 };
7678
7679 void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
7680 {
7681 dout(10) << "handle_peer_rmdir_prep " << *mdr
7682 << " " << mdr->peer_request->srcdnpath
7683 << " to " << mdr->peer_request->destdnpath
7684 << dendl;
7685
7686 vector<CDentry*> trace;
7687 filepath srcpath(mdr->peer_request->srcdnpath);
7688 dout(10) << " src " << srcpath << dendl;
7689 CInode *in;
7690 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
7691 int r = mdcache->path_traverse(mdr, cf, srcpath,
7692 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
7693 &trace, &in);
7694 if (r > 0) return;
7695 if (r == -CEPHFS_ESTALE) {
7696 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7697 mdr->peer_to_mds, true);
7698 return;
7699 }
7700 ceph_assert(r == 0);
7701 CDentry *dn = trace.back();
7702 dout(10) << " dn " << *dn << dendl;
7703 mdr->pin(dn);
7704
7705 ceph_assert(mdr->straydn);
7706 CDentry *straydn = mdr->straydn;
7707 dout(10) << " straydn " << *straydn << dendl;
7708
7709 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7710
7711 rmdir_rollback rollback;
7712 rollback.reqid = mdr->reqid;
7713 rollback.src_dir = dn->get_dir()->dirfrag();
7714 rollback.src_dname = dn->get_name();
7715 rollback.dest_dir = straydn->get_dir()->dirfrag();
7716 rollback.dest_dname = straydn->get_name();
7717 if (mdr->peer_request->desti_snapbl.length()) {
7718 if (in->snaprealm) {
7719 encode(true, rollback.snapbl);
7720 in->encode_snap_blob(rollback.snapbl);
7721 } else {
7722 encode(false, rollback.snapbl);
7723 }
7724 }
7725 encode(rollback, mdr->more()->rollback_bl);
7726 // FIXME: rollback snaprealm
7727 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7728
7729 // set up commit waiter
7730 mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);
7731
7732 straydn->push_projected_linkage(in);
7733 dn->push_projected_linkage();
7734
7735 ceph_assert(straydn->first >= in->first);
7736 in->first = straydn->first;
7737
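// if this MDS is not auth for any subtree rooted inside the dir being
// removed, the journal has nothing to record for this peer (see the comment
// in do_rmdir_rollback below); apply the change in memory and ack right away.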
7738 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
7739 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
7740 _logged_peer_rmdir(mdr, dn, straydn);
7741 return;
7742 }
7743
7744 mdr->ls = mdlog->get_current_segment();
7745 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
7746 EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
7747 mdlog->start_entry(le);
7748 le->rollback = mdr->more()->rollback_bl;
7749
7750 le->commit.add_dir_context(straydn->get_dir());
7751 le->commit.add_primary_dentry(straydn, in, true);
7752 // peer: no need to journal original dentry
7753
7754 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7755 le->commit.renamed_dirino = in->ino();
7756
7757 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7758 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7759
7760 mdr->more()->peer_update_journaled = true;
7761 submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
7762 mdr, __func__);
7763 mdlog->flush();
7764 }
7765
7766 void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7767 {
7768 dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
7769 CInode *in = dn->get_linkage()->get_inode();
7770
7771 bool new_realm;
7772 if (mdr->peer_request->desti_snapbl.length()) {
7773 new_realm = !in->snaprealm;
7774 in->decode_snap_blob(mdr->peer_request->desti_snapbl);
7775 ceph_assert(in->snaprealm);
7776 } else {
7777 new_realm = false;
7778 }
7779
7780 // update our cache now, so we are consistent with what is in the journal
7781 // when we journal a subtree map
7782 dn->get_dir()->unlink_inode(dn);
7783 straydn->pop_projected_linkage();
7784 dn->pop_projected_linkage();
7785
7786 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->peer_update_journaled);
7787
7788 if (new_realm)
7789 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
7790
7791 // done.
7792 mdr->reset_peer_request();
7793 mdr->straydn = 0;
7794
7795 if (!mdr->aborted) {
7796 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREPACK);
7797 if (!mdr->more()->peer_update_journaled)
7798 reply->mark_not_journaled();
7799 mds->send_message_mds(reply, mdr->peer_to_mds);
7800 } else {
7801 dout(10) << " abort flag set, finishing" << dendl;
7802 mdcache->request_finish(mdr);
7803 }
7804 }
7805
7806 void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7807 {
7808 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
7809 << " " << *ack << dendl;
7810
7811 mds_rank_t from = mds_rank_t(ack->get_source().num());
7812
7813 mdr->more()->peers.insert(from);
7814 mdr->more()->witnessed.insert(from);
7815 if (!ack->is_not_journaled())
7816 mdr->more()->has_journaled_peers = true;
7817
7818 // remove from waiting list
7819 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7820 mdr->more()->waiting_on_peer.erase(from);
7821
7822 if (mdr->more()->waiting_on_peer.empty())
7823 dispatch_client_request(mdr); // go again!
7824 else
7825 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
7826 }
7827
7828 void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
7829 {
7830 dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;
7831
7832 if (r == 0) {
7833 if (mdr->more()->peer_update_journaled) {
7834 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7835 if (strayin && !strayin->snaprealm)
7836 mdcache->clear_dirty_bits_for_stray(strayin);
7837 }
7838
7839 mdr->cleanup();
7840
7841 if (mdr->more()->peer_update_journaled) {
7842 // write a commit to the journal
7843 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
7844 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
7845 EPeerUpdate::RMDIR);
7846 mdlog->start_entry(le);
7847 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7848 mdlog->flush();
7849 } else {
7850 _committed_peer(mdr);
7851 }
7852 } else {
7853 // abort
7854 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7855 }
7856 }
7857
7858 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
7859 metareqid_t reqid;
7860 CDentry *dn;
7861 CDentry *straydn;
7862 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
7863 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
7864 void finish(int r) override {
7865 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
7866 }
7867 };
7868
7869 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7870 {
7871 // unlike the other rollback methods, the rmdir rollback is only
7872 // needed to record the subtree changes in the journal for inode
7873 // replicas who are auth for empty dirfrags. no actual changes to
7874 // the file system are taking place here, so there is no Mutation.
7875
7876 rmdir_rollback rollback;
7877 auto p = rbl.cbegin();
7878 decode(rollback, p);
7879
7880 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
7881 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
7882 ceph_assert(mdr || mds->is_resolve());
7883
7884 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
7885 if (!dir)
7886 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
7887 ceph_assert(dir);
7888 CDentry *dn = dir->lookup(rollback.src_dname);
7889 ceph_assert(dn);
7890 dout(10) << " dn " << *dn << dendl;
7891 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
7892 ceph_assert(straydir);
7893 CDentry *straydn = straydir->lookup(rollback.dest_dname);
7894 ceph_assert(straydn);
7895 dout(10) << " straydn " << *straydn << dendl;
7896 CInode *in = straydn->get_linkage()->get_inode();
7897
7898 dn->push_projected_linkage(in);
7899 straydn->push_projected_linkage();
7900
7901 if (rollback.snapbl.length() && in->snaprealm) {
7902 bool hadrealm;
7903 auto p = rollback.snapbl.cbegin();
7904 decode(hadrealm, p);
7905 if (hadrealm) {
7906 decode(in->snaprealm->srnode, p);
7907 } else {
7908 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
7909 }
7910 }
7911
7912 if (mdr && !mdr->more()->peer_update_journaled) {
7913 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
7914
7915 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
7916 return;
7917 }
7918
7919
7920 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
7921 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
7922 mdlog->start_entry(le);
7923
7924 le->commit.add_dir_context(dn->get_dir());
7925 le->commit.add_primary_dentry(dn, in, true);
7926 // peer: no need to journal straydn
7927
7928 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7929 le->commit.renamed_dirino = in->ino();
7930
7931 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
7932
7933 submit_mdlog_entry(le,
7934 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
7935 dn, straydn),
7936 mdr, __func__);
7937 mdlog->flush();
7938 }
7939
7940 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
7941 {
7942 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
7943
7944 straydn->get_dir()->unlink_inode(straydn);
7945 dn->pop_projected_linkage();
7946 straydn->pop_projected_linkage();
7947
7948 CInode *in = dn->get_linkage()->get_inode();
7949 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
7950 !mdr || mdr->more()->peer_update_journaled);
7951
7952 if (mds->is_resolve()) {
7953 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
7954 mdcache->try_trim_non_auth_subtree(root);
7955 }
7956
7957 if (mdr)
7958 mdcache->request_finish(mdr);
7959
7960 mdcache->finish_rollback(reqid, mdr);
7961 }
7962
7963
7964 /** _dir_is_nonempty[_unlocked]
7965 *
7966 * check if a directory is non-empty (i.e. whether we can rmdir it).
7967 *
7968 * the unlocked variant is a fastpath check; we can't really be
7969 * sure until we rdlock the filelock.
7970 */
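// Typical caller pattern (cf. handle_client_unlink / handle_client_rename in
// this file): do the cheap, racy check before taking locks, then repeat with
// the authoritative check once the filelock is rdlocked, e.g.:
//
//   if (in->is_dir() && _dir_is_nonempty_unlocked(mdr, in)) {
//     respond_to_request(mdr, -CEPHFS_ENOTEMPTY);   // fast path, may miss
//     return;
//   }
//   ... acquire rdlock on in->filelock ...
//   if (in->is_dir() && _dir_is_nonempty(mdr, in)) {
//     respond_to_request(mdr, -CEPHFS_ENOTEMPTY);   // now definitive
//     return;
//   }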
7971 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
7972 {
7973 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
7974 ceph_assert(in->is_auth());
7975
7976 if (in->filelock.is_cached())
7977 return false; // there can be pending async create/unlink. don't know.
7978 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
7979 return true; // in a snapshot!
7980
7981 auto&& ls = in->get_dirfrags();
7982 for (const auto& dir : ls) {
7983 // is the frag obviously non-empty?
7984 if (dir->is_auth()) {
7985 if (dir->get_projected_fnode()->fragstat.size()) {
7986 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7987 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
7988 return true;
7989 }
7990 }
7991 }
7992
7993 return false;
7994 }
7995
7996 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
7997 {
7998 dout(10) << "dir_is_nonempty " << *in << dendl;
7999 ceph_assert(in->is_auth());
8000 ceph_assert(in->filelock.can_read(mdr->get_client()));
8001
8002 frag_info_t dirstat;
8003 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
8004
8005 auto&& ls = in->get_dirfrags();
8006 for (const auto& dir : ls) {
8007 const auto& pf = dir->get_projected_fnode();
8008 if (pf->fragstat.size()) {
8009 dout(10) << "dir_is_nonempty dirstat has "
8010 << pf->fragstat.size() << " items " << *dir << dendl;
8011 return true;
8012 }
8013
8014 if (pf->accounted_fragstat.version == dirstat_version)
8015 dirstat.add(pf->accounted_fragstat);
8016 else
8017 dirstat.add(pf->fragstat);
8018 }
8019
8020 return dirstat.size() != in->get_projected_inode()->dirstat.size();
8021 }
8022
8023
8024 // ======================================================
8025
8026
8027 class C_MDS_rename_finish : public ServerLogContext {
8028 CDentry *srcdn;
8029 CDentry *destdn;
8030 CDentry *straydn;
8031 public:
8032 C_MDS_rename_finish(Server *s, MDRequestRef& r,
8033 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
8034 ServerLogContext(s, r),
8035 srcdn(sdn), destdn(ddn), straydn(stdn) { }
8036 void finish(int r) override {
8037 ceph_assert(r == 0);
8038 server->_rename_finish(mdr, srcdn, destdn, straydn);
8039 }
8040 };
8041
8042
8043 /** handle_client_rename
8044 *
8045 * rename leader is the destdn auth. this is because cached inodes
8046 * must remain connected. thus, any replica of srci must also
8047 * replicate destdn, and possibly straydn, so that srci (and
8048 * destdn->inode) remain connected during the rename.
8049 *
8050 * to do this, we freeze srci, then leader (destdn auth) verifies that
8051 * all other nodes have also replicated destdn and straydn. note that
8052 * destdn replicas need not also replicate srci. this only works when
8053 * destdn is leader.
8054 *
8055 * This function takes responsibility for the passed mdr.
8056 */
8057 void Server::handle_client_rename(MDRequestRef& mdr)
8058 {
8059 const auto& req = mdr->client_request;
8060 dout(7) << "handle_client_rename " << *req << dendl;
8061
8062 filepath destpath = req->get_filepath();
8063 filepath srcpath = req->get_filepath2();
8064 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
8065 respond_to_request(mdr, -CEPHFS_EBUSY);
8066 return;
8067 }
8068
8069 if (req->get_alternate_name().size() > alternate_name_max) {
8070 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
8071 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
8072 return;
8073 }
8074
8075 auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
8076 if (!destdn)
8077 return;
8078
8079 dout(10) << " destdn " << *destdn << dendl;
8080 CDir *destdir = destdn->get_dir();
8081 ceph_assert(destdir->is_auth());
8082 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8083
8084 dout(10) << " srcdn " << *srcdn << dendl;
8085 CDir *srcdir = srcdn->get_dir();
8086 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8087 CInode *srci = srcdnl->get_inode();
8088 dout(10) << " srci " << *srci << dendl;
8089
8090 // -- some sanity checks --
8091 if (destdn == srcdn) {
8092 dout(7) << "rename src=dest, noop" << dendl;
8093 respond_to_request(mdr, 0);
8094 return;
8095 }
8096
8097 // dest a child of src?
8098 // e.g. mv /usr /usr/foo
8099 if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
8100 dout(7) << "cannot rename item to be a child of itself" << dendl;
8101 respond_to_request(mdr, -CEPHFS_EINVAL);
8102 return;
8103 }
8104
8105 // is this a stray migration, reintegration or merge? (sanity checks!)
8106 if (mdr->reqid.name.is_mds() &&
8107 !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
8108 MDS_INO_IS_STRAY(destpath.get_ino())) &&
8109 !(destdnl->is_remote() &&
8110 destdnl->get_remote_ino() == srci->ino())) {
8111 respond_to_request(mdr, -CEPHFS_EINVAL); // actually, this won't reply, but whatev.
8112 return;
8113 }
8114
8115 CInode *oldin = 0;
8116 if (!destdnl->is_null()) {
8117 //dout(10) << "dest dn exists " << *destdn << dendl;
8118 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
8119 if (!oldin) return;
8120 dout(10) << " oldin " << *oldin << dendl;
8121
8122 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8123 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
8124 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8125 return;
8126 }
8127
8128 // mv /some/thing /to/some/existing_other_thing
8129 if (oldin->is_dir() && !srci->is_dir()) {
8130 respond_to_request(mdr, -CEPHFS_EISDIR);
8131 return;
8132 }
8133 if (!oldin->is_dir() && srci->is_dir()) {
8134 respond_to_request(mdr, -CEPHFS_ENOTDIR);
8135 return;
8136 }
8137 if (srci == oldin && !srcdir->inode->is_stray()) {
8138 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
8139 return;
8140 }
8141 if (destdn->get_alternate_name() != req->get_alternate_name()) {
8142 /* the dentry exists but the alternate_names do not match, fail... */
8143 respond_to_request(mdr, -CEPHFS_EINVAL);
8144 return;
8145 }
8146 }
8147
8148 vector<CDentry*>& srctrace = mdr->dn[1];
8149 vector<CDentry*>& desttrace = mdr->dn[0];
8150
8151 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
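// e.g. for a rename /a/b/x -> /a/c/y, srctrace is extended upward until its
// base directory is an ancestor of (or equal to) the dest base, and desttrace
// is then extended until both traces start at the same inode, so that locks
// on the two paths are always taken in one consistent top-down order.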
8152 if (destpath.get_ino() != srcpath.get_ino() &&
8153 !(req->get_source().is_mds() &&
8154 MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
8155 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
8156 CInode *destbase = desttrace[0]->get_dir()->get_inode();
8157 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8158 while (srcbase != destbase &&
8159 !srcbase->is_projected_ancestor_of(destbase)) {
8160 CDentry *pdn = srcbase->get_projected_parent_dn();
8161 srctrace.insert(srctrace.begin(), pdn);
8162 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
8163 srcbase = pdn->get_dir()->get_inode();
8164 }
8165
8166 // then, extend destpath until it shares the same parent inode as srcpath.
8167 while (destbase != srcbase) {
8168 CDentry *pdn = destbase->get_projected_parent_dn();
8169 desttrace.insert(desttrace.begin(), pdn);
8170 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
8171 destbase = pdn->get_dir()->get_inode();
8172 }
8173 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
8174 }
8175
8176
8177 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8178 if (linkmerge)
8179 dout(10) << " this is a link merge" << dendl;
8180
8181 // -- create stray dentry? --
8182 CDentry *straydn = NULL;
8183 if (destdnl->is_primary() && !linkmerge) {
8184 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
8185 if (!straydn)
8186 return;
8187 dout(10) << " straydn is " << *straydn << dendl;
8188 } else if (mdr->straydn) {
8189 mdr->unpin(mdr->straydn);
8190 mdr->straydn = NULL;
8191 }
8192
8193
8194 // -- locks --
8195 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
8196 MutationImpl::LockOpVec lov;
8197
8198 // we need to update srci's ctime. xlock its least contended lock to do that...
8199 lov.add_xlock(&srci->linklock);
8200 lov.add_xlock(&srci->snaplock);
8201
8202 if (oldin) {
8203 // xlock oldin (for nlink--)
8204 lov.add_xlock(&oldin->linklock);
8205 lov.add_xlock(&oldin->snaplock);
8206 if (oldin->is_dir()) {
8207 ceph_assert(srci->is_dir());
8208 lov.add_rdlock(&oldin->filelock); // to verify it's empty
8209
8210 // adjust locking order?
8211 int cmp = mdr->compare_paths();
8212 if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
8213 std::reverse(lov.begin(), lov.end());
8214 } else {
8215 ceph_assert(!srci->is_dir());
8216 // adjust locking order;
8217 if (srci->ino() > oldin->ino())
8218 std::reverse(lov.begin(), lov.end());
8219 }
8220 }
8221
8222 // straydn?
8223 if (straydn) {
8224 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
8225 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
8226 lov.add_xlock(&straydn->lock);
8227 }
8228
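// if the primary srcdn lives on another MDS, its inode will migrate to us as part of
// the rename; ask that MDS to freeze auth pins on srci while we take our locks.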
8229 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
8230 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
8231 return;
8232
8233 mdr->locking_state |= MutationImpl::ALL_LOCKED;
8234 }
8235
8236 if (linkmerge)
8237 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
8238
8239 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
8240 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
8241 return;
8242
8243 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
8244 return;
8245
8246 if (!linkmerge && !check_fragment_space(mdr, destdn->get_dir()))
8247 return;
8248
8249 if (!linkmerge && !check_dir_max_entries(mdr, destdn->get_dir()))
8250 return;
8251
8252 if (!check_access(mdr, srci, MAY_WRITE))
8253 return;
8254 }
8255
8256 // with read lock, really verify oldin is empty
8257 if (oldin &&
8258 oldin->is_dir() &&
8259 _dir_is_nonempty(mdr, oldin)) {
8260 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8261 return;
8262 }
8263
8264 /* project_snaprealm_past_parent() will do this job
8265 *
8266 // moving between snaprealms?
8267 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8268 SnapRealm *srcrealm = srci->find_snaprealm();
8269 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8270 if (srcrealm != destrealm &&
8271 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8272 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8273 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8274 mdcache->snaprealm_create(mdr, srci);
8275 return;
8276 }
8277 }
8278 */
8279
8280 SnapRealm *dest_realm = nullptr;
8281 SnapRealm *src_realm = nullptr;
8282 if (!linkmerge) {
8283 dest_realm = destdir->inode->find_snaprealm();
8284 if (srcdir->inode == destdir->inode)
8285 src_realm = dest_realm;
8286 else
8287 src_realm = srcdir->inode->find_snaprealm();
8288 if (src_realm != dest_realm &&
8289 src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
8290 respond_to_request(mdr, -CEPHFS_EXDEV);
8291 return;
8292 }
8293 }
8294
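// failure-injection point for testing: abort here when mds_kill_rename_at == 1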
8295 ceph_assert(g_conf()->mds_kill_rename_at != 1);
8296
8297 // -- open all srcdn inode frags, if any --
8298 // we need these open so that auth can properly delegate from inode to dirfrags
8299 // after the inode is _ours_.
8300 if (srcdnl->is_primary() &&
8301 !srcdn->is_auth() &&
8302 srci->is_dir()) {
8303 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
8304 mdr->set_stickydirs(srci);
8305
8306 frag_vec_t leaves;
8307 srci->dirfragtree.get_leaves(leaves);
8308 for (const auto& leaf : leaves) {
8309 CDir *dir = srci->get_dirfrag(leaf);
8310 if (!dir) {
8311 dout(10) << " opening " << leaf << " under " << *srci << dendl;
8312 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
8313 return;
8314 }
8315 }
8316 }
8317
8318 // -- prepare snaprealm ---
8319
8320 if (linkmerge) {
8321 if (!mdr->more()->srci_srnode &&
8322 srci->get_projected_inode()->nlink == 1 &&
8323 srci->is_projected_snaprealm_global()) {
8324 sr_t *new_srnode = srci->prepare_new_srnode(0);
8325 srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
8326
8327 srci->clear_snaprealm_global(new_srnode);
8328 mdr->more()->srci_srnode = new_srnode;
8329 }
8330 } else {
8331 if (oldin && !mdr->more()->desti_srnode) {
8332 if (oldin->is_projected_snaprealm_global()) {
8333 sr_t *new_srnode = oldin->prepare_new_srnode(0);
8334 oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
8335 // dropping the last linkage or dropping the last remote linkage,
8336 // detach the inode from the global snaprealm
8337 auto nlink = oldin->get_projected_inode()->nlink;
8338 if (nlink == 1 ||
8339 (nlink == 2 && !destdnl->is_primary() &&
8340 !oldin->get_projected_parent_dir()->inode->is_stray()))
8341 oldin->clear_snaprealm_global(new_srnode);
8342 mdr->more()->desti_srnode = new_srnode;
8343 } else if (destdnl->is_primary()) {
8344 snapid_t follows = dest_realm->get_newest_seq();
8345 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
8346 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
8347 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
8348 mdr->more()->desti_srnode = new_srnode;
8349 }
8350 }
8351 }
8352 if (!mdr->more()->srci_srnode) {
8353 if (srci->is_projected_snaprealm_global()) {
8354 sr_t *new_srnode = srci->prepare_new_srnode(0);
8355 srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
8356 mdr->more()->srci_srnode = new_srnode;
8357 } else if (srcdnl->is_primary()) {
8358 snapid_t follows = src_realm->get_newest_seq();
8359 if (src_realm != dest_realm &&
8360 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
8361 sr_t *new_srnode = srci->prepare_new_srnode(follows);
8362 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
8363 mdr->more()->srci_srnode = new_srnode;
8364 }
8365 }
8366 }
8367 }
8368
8369 // -- prepare witnesses --
8370
8371 /*
8372 * NOTE: we use _all_ replicas as witnesses.
8373 * this probably isn't totally necessary (esp for file renames),
8374 * but if/when we change that, we have to make sure rejoin is
8375 * sufficiently robust to handle strong rejoins from survivors
8376 * with totally wrong dentry->inode linkage.
8377 * (currently, it can ignore rename effects, because the resolve
8378 * stage will sort them out.)
8379 */
8380 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
8381 if (srcdn->is_auth())
8382 srcdn->list_replicas(witnesses);
8383 else
8384 witnesses.insert(srcdn->authority().first);
8385 if (srcdnl->is_remote() && !srci->is_auth())
8386 witnesses.insert(srci->authority().first);
8387 destdn->list_replicas(witnesses);
8388 if (destdnl->is_remote() && !oldin->is_auth())
8389 witnesses.insert(oldin->authority().first);
8390 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
8391
8392 if (!witnesses.empty()) {
8393 // Replicas can't see projected dentry linkages and will get confused.
8394 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8395 // can't project these inodes' linkages.
8396 bool need_flush = false;
8397 for (auto& dn : srctrace) {
8398 if (dn->is_projected()) {
8399 need_flush = true;
8400 break;
8401 }
8402 }
8403 if (!need_flush) {
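// nothing projected along srctrace; also walk destdn's ancestry up to the root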
8404 CDentry *dn = destdn;
8405 do {
8406 if (dn->is_projected()) {
8407 need_flush = true;
8408 break;
8409 }
8410 CInode *diri = dn->get_dir()->get_inode();
8411 dn = diri->get_projected_parent_dn();
8412 } while (dn);
8413 }
8414 if (need_flush) {
8415 mdlog->wait_for_safe(
8416 new MDSInternalContextWrapper(mds,
8417 new C_MDS_RetryRequest(mdcache, mdr)));
8418 mdlog->flush();
8419 return;
8420 }
8421 }
8422
8423 // do srcdn auth last
8424 mds_rank_t last = MDS_RANK_NONE;
8425 if (!srcdn->is_auth()) {
8426 last = srcdn->authority().first;
8427 mdr->more()->srcdn_auth_mds = last;
8428 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8429 // are involved in the rename operation.
8430 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
8431 dout(10) << " preparing ambiguous auth for srci" << dendl;
8432 ceph_assert(mdr->more()->is_remote_frozen_authpin);
8433 ceph_assert(mdr->more()->rename_inode == srci);
8434 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8435 return;
8436 }
8437 }
8438
8439 for (set<mds_rank_t>::iterator p = witnesses.begin();
8440 p != witnesses.end();
8441 ++p) {
8442 if (*p == last) continue; // do it last!
8443 if (mdr->more()->witnessed.count(*p)) {
8444 dout(10) << " already witnessed by mds." << *p << dendl;
8445 } else if (mdr->more()->waiting_on_peer.count(*p)) {
8446 dout(10) << " already waiting on witness mds." << *p << dendl;
8447 } else {
8448 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
8449 return;
8450 }
8451 }
8452 if (!mdr->more()->waiting_on_peer.empty())
8453 return; // we're waiting for a witness.
8454
8455 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
8456 dout(10) << " preparing last witness (srcdn auth)" << dendl;
8457 ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
8458 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8459 return;
8460 }
8461
8462 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
8463 if (!mdr->more()->peers.empty() && !srci->is_dir())
8464 ceph_assert(g_conf()->mds_kill_rename_at != 3);
8465 if (!mdr->more()->peers.empty() && srci->is_dir())
8466 ceph_assert(g_conf()->mds_kill_rename_at != 4);
8467
8468 // -- declare now --
8469 mdr->set_mds_stamp(ceph_clock_now());
8470
8471 // -- prepare journal entry --
8472 mdr->ls = mdlog->get_current_segment();
8473 EUpdate *le = new EUpdate(mdlog, "rename");
8474 mdlog->start_entry(le);
8475 le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
8476 if (!mdr->more()->witnessed.empty()) {
8477 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
8478
8479 le->reqid = mdr->reqid;
8480 le->had_peers = true;
8481
8482 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
8483 // no need to send frozen auth pin to the recovering auth MDS of srci
8484 mdr->more()->is_remote_frozen_authpin = false;
8485 }
8486
8487 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
8488 if (le->client_map.length())
8489 le->cmapv = mds->sessionmap.get_projected();
8490
8491 // -- commit locally --
8492 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
8493
8494 journal_and_reply(mdr, srci, destdn, le, fin);
8495 mds->balancer->maybe_fragment(destdn->get_dir(), false);
8496 }
8497
8498
8499 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8500 {
8501 dout(10) << "_rename_finish " << *mdr << dendl;
8502
8503 if (!mdr->more()->witnessed.empty())
8504 mdcache->logged_leader_update(mdr->reqid);
8505
8506 // apply
8507 _rename_apply(mdr, srcdn, destdn, straydn);
8508
8509 mdcache->send_dentry_link(destdn, mdr);
8510
8511 CDentry::linkage_t *destdnl = destdn->get_linkage();
8512 CInode *in = destdnl->get_inode();
8513 bool need_eval = mdr->more()->cap_imports.count(in);
8514
8515 // test hack: test peer commit
8516 if (!mdr->more()->peers.empty() && !in->is_dir())
8517 ceph_assert(g_conf()->mds_kill_rename_at != 5);
8518 if (!mdr->more()->peers.empty() && in->is_dir())
8519 ceph_assert(g_conf()->mds_kill_rename_at != 6);
8520
8521 // bump popularity
8522 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
8523 if (destdnl->is_remote() && in->is_auth())
8524 mds->balancer->hit_inode(in, META_POP_IWR);
8525
8526 // did we import srci? if so, explicitly ack that import before we unlock and reply.
8527
8528 ceph_assert(g_conf()->mds_kill_rename_at != 7);
8529
8530 // reply
8531 respond_to_request(mdr, 0);
8532
8533 if (need_eval)
8534 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
8535
8536 // clean up?
8537 // respond_to_request() drops locks. So stray reintegration can race with us.
8538 if (straydn && !straydn->get_projected_linkage()->is_null()) {
8539 mdcache->notify_stray(straydn);
8540 }
8541 }
8542
8543
8544
8545 // helpers
8546
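// ask mds.who to witness the rename: send it an OP_RENAMEPREP peer request carrying
// the src/dest paths, the stray dentry, and any projected snaprealms, then wait for
// its ack. returns false if the target MDS is not yet active and we must retry later.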
8547 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
8548 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
8549 {
8550 const auto& client_req = mdr->client_request;
8551 ceph_assert(client_req);
8552
8553 if (mds->is_cluster_degraded() &&
8554 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
8555 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
8556 if (mdr->more()->waiting_on_peer.empty())
8557 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
8558 return false;
8559 }
8560
8561 dout(10) << "_rename_prepare_witness mds." << who << dendl;
8562 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);
8563
8564 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
8565 for (auto dn : srctrace)
8566 req->srcdnpath.push_dentry(dn->get_name());
8567 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
8568 for (auto dn : dsttrace)
8569 req->destdnpath.push_dentry(dn->get_name());
8570 req->alternate_name = client_req->alternate_name;
8571 if (straydn)
8572 mdcache->encode_replica_stray(straydn, who, req->straybl);
8573
8574 if (mdr->more()->srci_srnode)
8575 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
8576 if (mdr->more()->desti_srnode)
8577 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
8578
8579 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
8580
8581 // srcdn auth will verify our current witness list is sufficient
8582 req->witnesses = witnesses;
8583
8584 req->op_stamp = mdr->get_op_stamp();
8585 mds->send_message_mds(req, who);
8586
8587 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
8588 mdr->more()->waiting_on_peer.insert(who);
8589 return true;
8590 }
8591
8592 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
8593 {
8594 version_t oldpv = mdr->more()->inode_import_v;
8595
8596 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8597
8598 /* import node */
8599 auto blp = mdr->more()->inode_import.cbegin();
8600
8601 // imported caps
8602 map<client_t,entity_inst_t> client_map;
8603 map<client_t, client_metadata_t> client_metadata_map;
8604 decode(client_map, blp);
8605 decode(client_metadata_map, blp);
8606 prepare_force_open_sessions(client_map, client_metadata_map,
8607 mdr->more()->imported_session_map);
8608 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
8609 encode(client_metadata_map, *client_map_bl);
8610
8611 list<ScatterLock*> updated_scatterlocks;
8612 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
8613 mdr->more()->cap_imports, updated_scatterlocks);
8614
8615 // hack: force back to !auth and clean, temporarily
8616 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
8617 srcdnl->get_inode()->mark_clean();
8618
8619 return oldpv;
8620 }
8621
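// a rename involving a directory we are not auth for must still be journaled locally
// if we hold auth subtrees beneath it: either one of diri's dirfrags is itself an auth
// subtree root here (empty == true), or one of its dirfrags contains a nested auth
// subtree of ours (empty == false).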
8622 bool Server::_need_force_journal(CInode *diri, bool empty)
8623 {
8624 auto&& dirs = diri->get_dirfrags();
8625
8626 bool force_journal = false;
8627 if (empty) {
8628 for (const auto& dir : dirs) {
8629 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
8630 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
8631 force_journal = true;
8632 break;
8633 } else
8634 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
8635 }
8636 } else {
8637 // see if any children of our frags are auth subtrees.
8638 std::vector<CDir*> subtrees;
8639 mdcache->get_subtrees(subtrees);
8640 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
8641 for (const auto& dir : dirs) {
8642 for (const auto& subtree : subtrees) {
8643 if (dir->contains(subtree)) {
8644 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
8645 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
8646 << *subtree << dendl;
8647 force_journal = true;
8648 break;
8649 } else
8650 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
8651 } else
8652 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
8653 }
8654 if (force_journal)
8655 break;
8656 }
8657 }
8658 return force_journal;
8659 }
8660
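// project all dentry/inode changes for the rename and record them in the journal
// metablob: bump version/ctime on the source (and any overwritten) inode, relink
// destdn, null out srcdn, move an overwritten primary inode to the stray dir, and
// predirty the affected dirfrags. runs on the leader and, via handle_peer_rename_prep,
// on witness peers.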
8661 void Server::_rename_prepare(MDRequestRef& mdr,
8662 EMetaBlob *metablob, bufferlist *client_map_bl,
8663 CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
8664 CDentry *straydn)
8665 {
8666 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8667 if (straydn)
8668 dout(10) << " straydn " << *straydn << dendl;
8669
8670 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8671 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8672 CInode *srci = srcdnl->get_inode();
8673 CInode *oldin = destdnl->get_inode();
8674
8675 // primary+remote link merge?
8676 bool linkmerge = (srci == oldin);
8677 if (linkmerge)
8678 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8679 bool silent = srcdn->get_dir()->inode->is_stray();
8680
8681 bool force_journal_dest = false;
8682 if (srci->is_dir() && !destdn->is_auth()) {
8683 if (srci->is_auth()) {
8684 // if we are auth for srci and exporting it, force journal because journal replay needs
8685 // the source inode to create auth subtrees.
8686 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
8687 force_journal_dest = true;
8688 } else
8689 force_journal_dest = _need_force_journal(srci, false);
8690 }
8691
8692 bool force_journal_stray = false;
8693 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
8694 force_journal_stray = _need_force_journal(oldin, true);
8695
8696 if (linkmerge)
8697 dout(10) << " merging remote and primary links to the same inode" << dendl;
8698 if (silent)
8699 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
8700 if (force_journal_dest)
8701 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
8702 if (force_journal_stray)
8703 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
8704
8705 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
8706 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
8707 metablob->renamed_dirino = srci->ino();
8708 } else if (oldin && oldin->is_dir() && force_journal_stray) {
8709 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
8710 metablob->renamed_dirino = oldin->ino();
8711 }
8712
8713 // prepare
8714 CInode::mempool_inode *spi = 0; // renamed inode
8715 CInode::mempool_inode *tpi = 0; // target/overwritten inode
8716
8717 // target inode
8718 if (!linkmerge) {
8719 if (destdnl->is_primary()) {
8720 ceph_assert(straydn); // moving to straydn.
8721 // link--, and move.
8722 if (destdn->is_auth()) {
8723 auto pi= oldin->project_inode(mdr); //project_snaprealm
8724 pi.inode->version = straydn->pre_dirty(pi.inode->version);
8725 pi.inode->update_backtrace();
8726 tpi = pi.inode.get();
8727 }
8728 straydn->push_projected_linkage(oldin);
8729 } else if (destdnl->is_remote()) {
8730 // nlink-- targeti
8731 if (oldin->is_auth()) {
8732 auto pi = oldin->project_inode(mdr);
8733 pi.inode->version = oldin->pre_dirty();
8734 tpi = pi.inode.get();
8735 }
8736 }
8737 }
8738
8739 // dest
8740 if (destdnl->is_null()) {
8741 /* handle_client_rename checks that alternate_name matches for existing destdn */
8742 destdn->set_alternate_name(alternate_name);
8743 }
8744 if (srcdnl->is_remote()) {
8745 if (!linkmerge) {
8746 // destdn
8747 if (destdn->is_auth())
8748 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
8749 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8750 // srci
8751 if (srci->is_auth()) {
8752 auto pi = srci->project_inode(mdr);
8753 pi.inode->version = srci->pre_dirty();
8754 spi = pi.inode.get();
8755 }
8756 } else {
8757 dout(10) << " will merge remote onto primary link" << dendl;
8758 if (destdn->is_auth()) {
8759 auto pi = oldin->project_inode(mdr);
8760 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
8761 spi = pi.inode.get();
8762 }
8763 }
8764 } else { // primary
8765 if (destdn->is_auth()) {
8766 version_t oldpv;
8767 if (srcdn->is_auth())
8768 oldpv = srci->get_projected_version();
8769 else {
8770 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
8771
8772 // note which dirfrags have child subtrees in the journal
8773 // event, so that we can open those (as bounds) during replay.
8774 if (srci->is_dir()) {
8775 auto&& ls = srci->get_dirfrags();
8776 for (const auto& dir : ls) {
8777 if (!dir->is_auth())
8778 metablob->renamed_dir_frags.push_back(dir->get_frag());
8779 }
8780 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
8781 }
8782 }
8783 auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
8784 // & srcdnl->snaprealm
8785 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
8786 pi.inode->update_backtrace();
8787 spi = pi.inode.get();
8788 }
8789 destdn->push_projected_linkage(srci);
8790 }
8791
8792 // src
8793 if (srcdn->is_auth())
8794 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
8795 srcdn->push_projected_linkage(); // push null linkage
8796
8797 if (!silent) {
8798 if (spi) {
8799 spi->ctime = mdr->get_op_stamp();
8800 if (mdr->get_op_stamp() > spi->rstat.rctime)
8801 spi->rstat.rctime = mdr->get_op_stamp();
8802 spi->change_attr++;
8803 if (linkmerge)
8804 spi->nlink--;
8805 }
8806 if (tpi) {
8807 tpi->ctime = mdr->get_op_stamp();
8808 if (mdr->get_op_stamp() > tpi->rstat.rctime)
8809 tpi->rstat.rctime = mdr->get_op_stamp();
8810 tpi->change_attr++;
8811 {
8812 std::string t;
8813 destdn->make_path_string(t, true);
8814 tpi->stray_prior_path = std::move(t);
8815 }
8816 tpi->nlink--;
8817 if (tpi->nlink == 0)
8818 oldin->state_set(CInode::STATE_ORPHAN);
8819 }
8820 }
8821
8822 // prepare nesting, mtime updates
8823 int predirty_dir = silent ? 0:PREDIRTY_DIR;
8824
8825 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8826 // then link the source inode to destdn
8827 if (destdnl->is_primary()) {
8828 ceph_assert(straydn);
8829 if (straydn->is_auth()) {
8830 metablob->add_dir_context(straydn->get_dir());
8831 metablob->add_dir(straydn->get_dir(), true);
8832 }
8833 }
8834
8835 if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
8836 CDir *oldin_dir = oldin->get_projected_parent_dir();
8837 if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
8838 mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
8839 }
8840
8841 // sub off target
8842 if (destdn->is_auth() && !destdnl->is_null()) {
8843 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
8844 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
8845 if (destdnl->is_primary()) {
8846 ceph_assert(straydn);
8847 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
8848 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
8849 }
8850 }
8851
8852 if (srcdnl->is_remote() && srci->is_auth()) {
8853 CDir *srci_dir = srci->get_projected_parent_dir();
8854 if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
8855 mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
8856 }
8857
8858 // move srcdn
8859 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
8860 int flags = predirty_dir | predirty_primary;
8861 if (srcdn->is_auth())
8862 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
8863 if (destdn->is_auth())
8864 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
8865
8866 // add it all to the metablob
8867 // target inode
8868 if (!linkmerge) {
8869 if (destdnl->is_primary()) {
8870 ceph_assert(straydn);
8871 if (destdn->is_auth()) {
8872 // project snaprealm, too
8873 if (auto& desti_srnode = mdr->more()->desti_srnode) {
8874 oldin->project_snaprealm(desti_srnode);
8875 if (tpi->nlink == 0)
8876 ceph_assert(!desti_srnode->is_parent_global());
8877 desti_srnode = NULL;
8878 }
8879 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8880 metablob->add_primary_dentry(straydn, oldin, true, true);
8881 } else if (force_journal_stray) {
8882 dout(10) << " forced journaling straydn " << *straydn << dendl;
8883 metablob->add_dir_context(straydn->get_dir());
8884 metablob->add_primary_dentry(straydn, oldin, true);
8885 }
8886 } else if (destdnl->is_remote()) {
8887 if (oldin->is_auth()) {
8888 sr_t *new_srnode = NULL;
8889 if (mdr->peer_request) {
8890 if (mdr->peer_request->desti_snapbl.length() > 0) {
8891 new_srnode = new sr_t();
8892 auto p = mdr->peer_request->desti_snapbl.cbegin();
8893 decode(*new_srnode, p);
8894 }
8895 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8896 new_srnode = desti_srnode;
8897 desti_srnode = NULL;
8898 }
8899 if (new_srnode) {
8900 oldin->project_snaprealm(new_srnode);
8901 if (tpi->nlink == 0)
8902 ceph_assert(!new_srnode->is_parent_global());
8903 }
8904 // auth for targeti
8905 CDentry *oldin_pdn = oldin->get_projected_parent_dn();
8906 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
8907 metablob->add_primary_dentry(oldin_pdn, oldin, true);
8908 }
8909 }
8910 }
8911
8912 // dest
8913 if (srcdnl->is_remote()) {
8914 ceph_assert(!linkmerge);
8915 if (destdn->is_auth() && !destdnl->is_null())
8916 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8917 else
8918 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8919
8920 if (destdn->is_auth())
8921 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8922
8923 if (srci->is_auth()) { // it's remote
8924 if (mdr->peer_request) {
8925 if (mdr->peer_request->srci_snapbl.length() > 0) {
8926 sr_t *new_srnode = new sr_t();
8927 auto p = mdr->peer_request->srci_snapbl.cbegin();
8928 decode(*new_srnode, p);
8929 srci->project_snaprealm(new_srnode);
8930 }
8931 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8932 srci->project_snaprealm(srci_srnode);
8933 srci_srnode = NULL;
8934 }
8935
8936 CDentry *srci_pdn = srci->get_projected_parent_dn();
8937 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
8938 metablob->add_primary_dentry(srci_pdn, srci, true);
8939 }
8940 } else if (srcdnl->is_primary()) {
8941 // project snap parent update?
8942 if (destdn->is_auth()) {
8943 if (auto& srci_srnode = mdr->more()->srci_srnode) {
8944 srci->project_snaprealm(srci_srnode);
8945 srci_srnode = NULL;
8946 }
8947 }
8948
8949 if (destdn->is_auth() && !destdnl->is_null())
8950 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8951
8952 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8953
8954 if (destdn->is_auth())
8955 metablob->add_primary_dentry(destdn, srci, true, true);
8956 else if (force_journal_dest) {
8957 dout(10) << " forced journaling destdn " << *destdn << dendl;
8958 metablob->add_dir_context(destdn->get_dir());
8959 metablob->add_primary_dentry(destdn, srci, true);
8960 if (srcdn->is_auth() && srci->is_dir()) {
8961 // journal new subtrees root dirfrags
8962 auto&& ls = srci->get_dirfrags();
8963 for (const auto& dir : ls) {
8964 if (dir->is_auth())
8965 metablob->add_dir(dir, true);
8966 }
8967 }
8968 }
8969 }
8970
8971 // src
8972 if (srcdn->is_auth()) {
8973 dout(10) << " journaling srcdn " << *srcdn << dendl;
8974 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
8975 // also journal the inode in case we need to do a peer rename rollback. It is OK to add
8976 // both primary and NULL dentries, because during journal replay the null dentry is
8977 // processed after the primary dentry.
8978 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
8979 metablob->add_primary_dentry(srcdn, srci, true);
8980 metablob->add_null_dentry(srcdn, true);
8981 } else
8982 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
8983
8984 // make renamed inode first track the dn
8985 if (srcdnl->is_primary() && destdn->is_auth()) {
8986 ceph_assert(srci->first <= destdn->first);
8987 srci->first = destdn->first;
8988 }
8989 // make stray inode first track the straydn
8990 if (straydn && straydn->is_auth()) {
8991 ceph_assert(oldin->first <= straydn->first);
8992 oldin->first = straydn->first;
8993 }
8994
8995 if (oldin && oldin->is_dir()) {
8996 ceph_assert(straydn);
8997 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
8998 }
8999 if (srci->is_dir())
9000 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
9001
9002 }
9003
9004
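// apply the projected rename: unlink the old dest (moving an overwritten primary inode
// to straydn), unlink srcdn and relink the inode at destdn, finish any cap/inode import
// if srci changed auth, pop the projected inodes, and fix up the subtree map for
// renamed directories.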
9005 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9006 {
9007 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
9008 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
9009
9010 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9011 CDentry::linkage_t *destdnl = destdn->get_linkage();
9012
9013 CInode *oldin = destdnl->get_inode();
9014
9015 // primary+remote link merge?
9016 bool linkmerge = (srcdnl->get_inode() == oldin);
9017 if (linkmerge)
9018 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
9019
9020 bool new_in_snaprealm = false;
9021 bool new_oldin_snaprealm = false;
9022
9023 // target inode
9024 if (!linkmerge) {
9025 if (destdnl->is_primary()) {
9026 ceph_assert(straydn);
9027 dout(10) << "straydn is " << *straydn << dendl;
9028
9029 // if there is newly created snaprealm, need to split old snaprealm's
9030 // inodes_with_caps. So pop snaprealm before linkage changes.
9031 if (destdn->is_auth()) {
9032 bool hadrealm = (oldin->snaprealm ? true : false);
9033 oldin->early_pop_projected_snaprealm();
9034 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
9035 } else {
9036 ceph_assert(mdr->peer_request);
9037 if (mdr->peer_request->desti_snapbl.length()) {
9038 new_oldin_snaprealm = !oldin->snaprealm;
9039 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
9040 ceph_assert(oldin->snaprealm);
9041 }
9042 }
9043
9044 destdn->get_dir()->unlink_inode(destdn, false);
9045
9046 straydn->pop_projected_linkage();
9047 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9048 ceph_assert(!straydn->is_projected()); // no other projected
9049
9050 // nlink-- targeti
9051 if (destdn->is_auth())
9052 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9053
9054 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
9055 } else if (destdnl->is_remote()) {
9056 destdn->get_dir()->unlink_inode(destdn, false);
9057 if (oldin->is_auth()) {
9058 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9059 } else if (mdr->peer_request) {
9060 if (mdr->peer_request->desti_snapbl.length() > 0) {
9061 ceph_assert(oldin->snaprealm);
9062 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
9063 }
9064 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9065 delete desti_srnode;
9066 desti_srnode = NULL;
9067 }
9068 }
9069 }
9070
9071 // unlink src before we relink it at dest
9072 CInode *in = srcdnl->get_inode();
9073 ceph_assert(in);
9074
9075 bool srcdn_was_remote = srcdnl->is_remote();
9076 if (!srcdn_was_remote) {
9077 // if there is newly created snaprealm, need to split old snaprealm's
9078 // inodes_with_caps. So pop snaprealm before linkage changes.
9079 if (destdn->is_auth()) {
9080 bool hadrealm = (in->snaprealm ? true : false);
9081 in->early_pop_projected_snaprealm();
9082 new_in_snaprealm = (in->snaprealm && !hadrealm);
9083 } else {
9084 ceph_assert(mdr->peer_request);
9085 if (mdr->peer_request->srci_snapbl.length()) {
9086 new_in_snaprealm = !in->snaprealm;
9087 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
9088 ceph_assert(in->snaprealm);
9089 }
9090 }
9091 }
9092
9093 srcdn->get_dir()->unlink_inode(srcdn);
9094
9095 // dest
9096 if (srcdn_was_remote) {
9097 if (!linkmerge) {
9098 // destdn
9099 destdnl = destdn->pop_projected_linkage();
9100 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9101 ceph_assert(!destdn->is_projected()); // no other projected
9102
9103 destdn->link_remote(destdnl, in);
9104 if (destdn->is_auth())
9105 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
9106 // in
9107 if (in->is_auth()) {
9108 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9109 } else if (mdr->peer_request) {
9110 if (mdr->peer_request->srci_snapbl.length() > 0) {
9111 ceph_assert(in->snaprealm);
9112 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
9113 }
9114 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9115 delete srci_srnode;
9116 srci_srnode = NULL;
9117 }
9118 } else {
9119 dout(10) << "merging remote onto primary link" << dendl;
9120 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9121 }
9122 } else { // primary
9123 if (linkmerge) {
9124 dout(10) << "merging primary onto remote link" << dendl;
9125 destdn->get_dir()->unlink_inode(destdn, false);
9126 }
9127 destdnl = destdn->pop_projected_linkage();
9128 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9129 ceph_assert(!destdn->is_projected()); // no other projected
9130
9131 // srcdn inode import?
9132 if (!srcdn->is_auth() && destdn->is_auth()) {
9133 ceph_assert(mdr->more()->inode_import.length() > 0);
9134
9135 map<client_t,Capability::Import> imported_caps;
9136
9137 // finish cap imports
9138 finish_force_open_sessions(mdr->more()->imported_session_map);
9139 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
9140 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
9141 mdr->more()->srcdn_auth_mds, true,
9142 mdr->more()->imported_session_map,
9143 mdr->more()->cap_imports[destdnl->get_inode()],
9144 imported_caps);
9145 }
9146
9147 mdr->more()->inode_import.clear();
9148 encode(imported_caps, mdr->more()->inode_import);
9149
9150 /* hack: add an auth pin for each xlock we hold. These were
9151 * remote xlocks previously but now they're local and
9152 * we're going to try and unpin when we xlock_finish. */
9153
9154 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
9155 i != mdr->locks.end();
9156 ++i) {
9157 SimpleLock *lock = i->lock;
9158 if (lock->get_parent() != destdnl->get_inode())
9159 break;
9160 if (i->is_xlock() && !lock->is_locallock())
9161 mds->locker->xlock_import(lock);
9162 }
9163
9164 // hack: fix auth bit
9165 in->state_set(CInode::STATE_AUTH);
9166
9167 mdr->clear_ambiguous_auth();
9168 }
9169
9170 if (destdn->is_auth())
9171 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9172 }
9173
9174 // src
9175 if (srcdn->is_auth())
9176 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
9177 srcdn->pop_projected_linkage();
9178 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9179 ceph_assert(!srcdn->is_projected()); // no other projected
9180
9181 // apply remaining projected inodes (nested)
9182 mdr->apply();
9183
9184 // update subtree map?
9185 if (destdnl->is_primary() && in->is_dir())
9186 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
9187
9188 if (straydn && oldin->is_dir())
9189 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
9190
9191 if (new_oldin_snaprealm)
9192 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
9193 if (new_in_snaprealm)
9194 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
9195
9196 // removing a new dn?
9197 if (srcdn->is_auth())
9198 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
9199 }
9200
9201
9202
9203 // ------------
9204 // PEER
9205
9206 class C_MDS_PeerRenamePrep : public ServerLogContext {
9207 CDentry *srcdn, *destdn, *straydn;
9208 public:
9209 C_MDS_PeerRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9210 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
9211 void finish(int r) override {
9212 server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
9213 }
9214 };
9215
9216 class C_MDS_PeerRenameCommit : public ServerContext {
9217 MDRequestRef mdr;
9218 CDentry *srcdn, *destdn, *straydn;
9219 public:
9220 C_MDS_PeerRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9221 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
9222 void finish(int r) override {
9223 server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
9224 }
9225 };
9226
9227 class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
9228 MDRequestRef mdr;
9229 public:
9230 C_MDS_PeerRenameSessionsFlushed(Server *s, MDRequestRef& r) :
9231 ServerContext(s), mdr(r) {}
9232 void finish(int r) override {
9233 server->_peer_rename_sessions_flushed(mdr);
9234 }
9235 };
9236
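// peer-side handler for OP_RENAMEPREP from the rename leader: discover srcdn/destdn,
// freeze and mark srci ambiguous-auth if we are exporting it, verify the witness list
// covers all srcdn replicas, record rollback state, and journal the prepared rename
// before acking.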
9237 void Server::handle_peer_rename_prep(MDRequestRef& mdr)
9238 {
9239 dout(10) << "handle_peer_rename_prep " << *mdr
9240 << " " << mdr->peer_request->srcdnpath
9241 << " to " << mdr->peer_request->destdnpath
9242 << dendl;
9243
9244 if (mdr->peer_request->is_interrupted()) {
9245 dout(10) << " peer request interrupted, sending noop reply" << dendl;
9246 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9247 reply->mark_interrupted();
9248 mds->send_message_mds(reply, mdr->peer_to_mds);
9249 mdr->reset_peer_request();
9250 return;
9251 }
9252
9253 // discover destdn
9254 filepath destpath(mdr->peer_request->destdnpath);
9255 dout(10) << " dest " << destpath << dendl;
9256 vector<CDentry*> trace;
9257 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
9258 int r = mdcache->path_traverse(mdr, cf, destpath,
9259 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
9260 &trace);
9261 if (r > 0) return;
9262 if (r == -CEPHFS_ESTALE) {
9263 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
9264 mdr->peer_to_mds, true);
9265 return;
9266 }
9267 ceph_assert(r == 0); // we shouldn't get an error here!
9268
9269 CDentry *destdn = trace.back();
9270 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9271 dout(10) << " destdn " << *destdn << dendl;
9272 mdr->pin(destdn);
9273
9274 // discover srcdn
9275 filepath srcpath(mdr->peer_request->srcdnpath);
9276 dout(10) << " src " << srcpath << dendl;
9277 CInode *srci = nullptr;
9278 r = mdcache->path_traverse(mdr, cf, srcpath,
9279 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
9280 &trace, &srci);
9281 if (r > 0) return;
9282 ceph_assert(r == 0);
9283
9284 CDentry *srcdn = trace.back();
9285 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9286 dout(10) << " srcdn " << *srcdn << dendl;
9287 mdr->pin(srcdn);
9288 mdr->pin(srci);
9289
9290 // stray?
9291 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
9292 if (linkmerge)
9293 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
9294 CDentry *straydn = mdr->straydn;
9295 if (destdnl->is_primary() && !linkmerge)
9296 ceph_assert(straydn);
9297
9298 mdr->set_op_stamp(mdr->peer_request->op_stamp);
9299 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
9300
9301 // set up commit waiter (early, to clean up any freezing etc we do)
9302 if (!mdr->more()->peer_commit)
9303 mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);
9304
9305 // am i srcdn auth?
9306 if (srcdn->is_auth()) {
9307 set<mds_rank_t> srcdnrep;
9308 srcdn->list_replicas(srcdnrep);
9309
9310 bool reply_witness = false;
9311 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
9312 // freeze?
9313 // we need this to
9314 // - avoid conflicting lock state changes
9315 // - avoid concurrent updates to the inode
9316 // (this could also be accomplished with the versionlock)
9317 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
9318 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
9319 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
9320
9321 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9322 if (srcdnl->get_inode()->is_frozen_auth_pin())
9323 mdr->unfreeze_auth_pin();
9324
9325 if (!frozen_inode) {
9326 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
9327 return;
9328 }
9329
9330 /*
9331 * set ambiguous auth for srci
9332 * NOTE: we don't worry about ambiguous cache expire as we do
9333 * with subtree migrations because all peers will pin
9334 * srcdn->get_inode() for duration of this rename.
9335 */
9336 mdr->set_ambiguous_auth(srcdnl->get_inode());
9337
9338 // just mark the source inode as ambiguous auth if more than two MDS are involved.
9339 // the leader will send another OP_RENAMEPREP peer request later.
9340 if (mdr->peer_request->witnesses.size() > 1) {
9341 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
9342 reply_witness = true;
9343 }
9344
9345 // make sure bystanders have received all lock related messages
9346 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9347 if (*p == mdr->peer_to_mds ||
9348 (mds->is_cluster_degraded() &&
9349 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
9350 continue;
9351 auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
9352 mds->send_message_mds(notify, *p);
9353 mdr->more()->waiting_on_peer.insert(*p);
9354 }
9355
9356 // make sure clients have received all cap related messages
9357 set<client_t> export_client_set;
9358 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
9359
9360 MDSGatherBuilder gather(g_ceph_context);
9361 flush_client_sessions(export_client_set, gather);
9362 if (gather.has_subs()) {
9363 mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
9364 gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
9365 gather.activate();
9366 }
9367 }
9368
9369 // is witness list sufficient?
9370 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9371 if (*p == mdr->peer_to_mds ||
9372 mdr->peer_request->witnesses.count(*p)) continue;
9373 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
9374 reply_witness = true;
9375 break;
9376 }
9377
9378 if (reply_witness) {
9379 ceph_assert(!srcdnrep.empty());
9380 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9381 reply->witnesses.swap(srcdnrep);
9382 mds->send_message_mds(reply, mdr->peer_to_mds);
9383 mdr->reset_peer_request();
9384 return;
9385 }
9386 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
9387 if (!mdr->more()->waiting_on_peer.empty()) {
9388 dout(10) << " still waiting for rename notify acks from "
9389 << mdr->more()->waiting_on_peer << dendl;
9390 return;
9391 }
9392 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
9393 // set ambiguous auth for srci on witnesses
9394 mdr->set_ambiguous_auth(srcdnl->get_inode());
9395 }
9396
9397 // encode everything we'd need to roll this back... basically, just the original state.
9398 rename_rollback rollback;
9399
9400 rollback.reqid = mdr->reqid;
9401
9402 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
9403 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9404 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
9405 rollback.orig_src.dname = srcdn->get_name();
9406 if (srcdnl->is_primary())
9407 rollback.orig_src.ino = srcdnl->get_inode()->ino();
9408 else {
9409 ceph_assert(srcdnl->is_remote());
9410 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
9411 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
9412 }
9413
9414 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
9415 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9416 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
9417 rollback.orig_dest.dname = destdn->get_name();
9418 if (destdnl->is_primary())
9419 rollback.orig_dest.ino = destdnl->get_inode()->ino();
9420 else if (destdnl->is_remote()) {
9421 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
9422 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
9423 }
9424
9425 if (straydn) {
9426 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
9427 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
9428 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
9429 rollback.stray.dname = straydn->get_name();
9430 }
9431 if (mdr->peer_request->desti_snapbl.length()) {
9432 CInode *oldin = destdnl->get_inode();
9433 if (oldin->snaprealm) {
9434 encode(true, rollback.desti_snapbl);
9435 oldin->encode_snap_blob(rollback.desti_snapbl);
9436 } else {
9437 encode(false, rollback.desti_snapbl);
9438 }
9439 }
9440 if (mdr->peer_request->srci_snapbl.length()) {
9441 if (srci->snaprealm) {
9442 encode(true, rollback.srci_snapbl);
9443 srci->encode_snap_blob(rollback.srci_snapbl);
9444 } else {
9445 encode(false, rollback.srci_snapbl);
9446 }
9447 }
9448 encode(rollback, mdr->more()->rollback_bl);
9449 // FIXME: rollback snaprealm
9450 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
9451
9452 // journal.
9453 mdr->ls = mdlog->get_current_segment();
9454 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
9455 EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
9456 mdlog->start_entry(le);
9457 le->rollback = mdr->more()->rollback_bl;
9458
9459 bufferlist blah; // inode import data... obviously not used if we're the peer
9460 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);
9461
9462 if (le->commit.empty()) {
9463 dout(10) << " empty metablob, skipping journal" << dendl;
9464 mdlog->cancel_entry(le);
9465 mdr->ls = NULL;
9466 _logged_peer_rename(mdr, srcdn, destdn, straydn);
9467 } else {
9468 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
9469 mdr->more()->peer_update_journaled = true;
9470 submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
9471 mdr, __func__);
9472 mdlog->flush();
9473 }
9474 }
9475
9476 void Server::_logged_peer_rename(MDRequestRef& mdr,
9477 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9478 {
9479 dout(10) << "_logged_peer_rename " << *mdr << dendl;
9480
9481 // prepare ack
9482 ref_t<MMDSPeerRequest> reply;
9483 if (!mdr->aborted) {
9484 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9485 if (!mdr->more()->peer_update_journaled)
9486 reply->mark_not_journaled();
9487 }
9488
9489 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9490 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
9491
9492 // export srci?
9493 if (srcdn->is_auth() && srcdnl->is_primary()) {
9494 // set export bounds for CInode::encode_export()
9495 if (reply) {
9496 std::vector<CDir*> bounds;
9497 if (srcdnl->get_inode()->is_dir()) {
9498 srcdnl->get_inode()->get_dirfrags(bounds);
9499 for (const auto& bound : bounds) {
9500 bound->state_set(CDir::STATE_EXPORTBOUND);
9501 }
9502 }
9503
9504 map<client_t,entity_inst_t> exported_client_map;
9505 map<client_t, client_metadata_t> exported_client_metadata_map;
9506 bufferlist inodebl;
9507 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
9508 exported_client_map,
9509 exported_client_metadata_map);
9510
9511 for (const auto& bound : bounds) {
9512 bound->state_clear(CDir::STATE_EXPORTBOUND);
9513 }
9514
9515 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
9516 encode(exported_client_metadata_map, reply->inode_export);
9517 reply->inode_export.claim_append(inodebl);
9518 reply->inode_export_v = srcdnl->get_inode()->get_version();
9519 }
9520
9521 // remove mdr auth pin
9522 mdr->auth_unpin(srcdnl->get_inode());
9523 mdr->more()->is_inode_exporter = true;
9524
9525 if (srcdnl->get_inode()->is_dirty())
9526 srcdnl->get_inode()->mark_clean();
9527
9528 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
9529 }
9530
9531 // apply
9532 _rename_apply(mdr, srcdn, destdn, straydn);
9533
9534 CDentry::linkage_t *destdnl = destdn->get_linkage();
9535
9536 // bump popularity
9537 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
9538 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
9539 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
9540
9541 // done.
9542 mdr->reset_peer_request();
9543 mdr->straydn = 0;
9544
9545 if (reply) {
9546 mds->send_message_mds(reply, mdr->peer_to_mds);
9547 } else {
9548 ceph_assert(mdr->aborted);
9549 dout(10) << " abort flag set, finishing" << dendl;
9550 mdcache->request_finish(mdr);
9551 }
9552 }
9553
9554 void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
9555 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9556 {
9557 dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;
9558
9559 CInode *in = destdn->get_linkage()->get_inode();
9560
9561 inodeno_t migrated_stray;
9562 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
9563 migrated_stray = in->ino();
9564
9565 MDSContext::vec finished;
9566 if (r == 0) {
9567 // unfreeze+singleauth inode
9568 // hmm, do i really need to delay this?
9569 if (mdr->more()->is_inode_exporter) {
9570 // drop our pins
9571 // we exported, clear out any xlocks that we moved to another MDS
9572
9573 for (auto i = mdr->locks.lower_bound(&in->versionlock);
9574 i != mdr->locks.end(); ) {
9575 SimpleLock *lock = i->lock;
9576 if (lock->get_parent() != in)
9577 break;
9578 // we only care about xlocks on the exported inode
9579 if (i->is_xlock() && !lock->is_locallock())
9580 mds->locker->xlock_export(i++, mdr.get());
9581 else
9582 ++i;
9583 }
9584
9585 map<client_t,Capability::Import> peer_imported;
9586 auto bp = mdr->more()->inode_import.cbegin();
9587 decode(peer_imported, bp);
9588
9589 dout(10) << " finishing inode export on " << *in << dendl;
9590 mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
9591 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
9592
9593 // unfreeze
9594 ceph_assert(in->is_frozen_inode());
9595 in->unfreeze_inode(finished);
9596 }
9597
9598 // singleauth
9599 if (mdr->more()->is_ambiguous_auth) {
9600 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9601 mdr->more()->is_ambiguous_auth = false;
9602 }
9603
9604 if (straydn && mdr->more()->peer_update_journaled) {
9605 CInode *strayin = straydn->get_projected_linkage()->get_inode();
9606 if (strayin && !strayin->snaprealm)
9607 mdcache->clear_dirty_bits_for_stray(strayin);
9608 }
9609
9610 mds->queue_waiters(finished);
9611 mdr->cleanup();
9612
9613 if (mdr->more()->peer_update_journaled) {
9614 // write a commit to the journal
9615 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
9616 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
9617 EPeerUpdate::RENAME);
9618 mdlog->start_entry(le);
9619 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
9620 mdlog->flush();
9621 } else {
9622 _committed_peer(mdr);
9623 }
9624 } else {
9625
9626 // abort
9627 // rollback_bl may be empty if we froze the inode but had to provide an expanded
9628 // witness list to the leader, and the leader failed before we tried prep again.
9629 if (mdr->more()->rollback_bl.length()) {
9630 if (mdr->more()->is_inode_exporter) {
9631 dout(10) << " reversing inode export of " << *in << dendl;
9632 in->abort_export();
9633 }
9634 if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
9635 mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
9636 // rollback but preserve the peer request
9637 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
9638 mdr->more()->rollback_bl.clear();
9639 } else
9640 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
9641 } else {
9642 dout(10) << " rollback_bl empty, not rolling back rename (leader failed after getting extra witnesses?)" << dendl;
9643 // singleauth
9644 if (mdr->more()->is_ambiguous_auth) {
9645 if (srcdn->is_auth())
9646 mdr->more()->rename_inode->unfreeze_inode(finished);
9647
9648 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9649 mdr->more()->is_ambiguous_auth = false;
9650 }
9651 mds->queue_waiters(finished);
9652 mdcache->request_finish(mdr);
9653 }
9654 }
9655
9656 if (migrated_stray && mds->is_stopping())
9657 mdcache->shutdown_export_stray_finish(migrated_stray);
9658 }
9659
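// helper for rename rollback: re-add the restored dentry's contribution to the
// directory's fragstat/rstat and put back the old mtime/rctime if the rename had
// bumped them.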
9660 static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
9661 rename_rollback::drec &r, utime_t ctime,
9662 bool isdir, const nest_info_t &rstat)
9663 {
9664 auto pf = dir->project_fnode(mut);
9665 pf->version = dir->pre_dirty();
9666
9667 if (isdir) {
9668 pf->fragstat.nsubdirs += 1;
9669 } else {
9670 pf->fragstat.nfiles += 1;
9671 }
9672 if (r.ino) {
9673 pf->rstat.rbytes += rstat.rbytes;
9674 pf->rstat.rfiles += rstat.rfiles;
9675 pf->rstat.rsubdirs += rstat.rsubdirs;
9676 pf->rstat.rsnaps += rstat.rsnaps;
9677 }
9678 if (pf->fragstat.mtime == ctime) {
9679 pf->fragstat.mtime = r.dirfrag_old_mtime;
9680 if (pf->rstat.rctime == ctime)
9681 pf->rstat.rctime = r.dirfrag_old_rctime;
9682 }
9683 mut->add_updated_lock(&dir->get_inode()->filelock);
9684 mut->add_updated_lock(&dir->get_inode()->nestlock);
9685 }
9686
9687 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
9688 MutationRef mut;
9689 CDentry *srcdn;
9690 version_t srcdnpv;
9691 CDentry *destdn;
9692 CDentry *straydn;
9693 map<client_t,ref_t<MClientSnap>> splits[2];
9694 bool finish_mdr;
9695 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
9696 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
9697 map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
9698 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
9699 straydn(st), finish_mdr(f) {
9700 splits[0].swap(_splits[0]);
9701 splits[1].swap(_splits[1]);
9702 }
9703 void finish(int r) override {
9704 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
9705 destdn, straydn, splits, finish_mdr);
9706 }
9707 };
9708
9709 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
9710 bool finish_mdr)
9711 {
9712 rename_rollback rollback;
9713 auto p = rbl.cbegin();
9714 decode(rollback, p);
9715
9716 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
9717 // need to finish this update before sending resolve to claim the subtree
9718 mdcache->add_rollback(rollback.reqid, leader);
9719
9720 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
9721 mut->ls = mds->mdlog->get_current_segment();
9722
9723 CDentry *srcdn = NULL;
9724 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
9725 if (!srcdir)
9726 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
9727 if (srcdir) {
9728 dout(10) << " srcdir " << *srcdir << dendl;
9729 srcdn = srcdir->lookup(rollback.orig_src.dname);
9730 if (srcdn) {
9731 dout(10) << " srcdn " << *srcdn << dendl;
9732 ceph_assert(srcdn->get_linkage()->is_null());
9733 } else
9734 dout(10) << " srcdn not found" << dendl;
9735 } else
9736 dout(10) << " srcdir not found" << dendl;
9737
9738 CDentry *destdn = NULL;
9739 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
9740 if (!destdir)
9741 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
9742 if (destdir) {
9743 dout(10) << " destdir " << *destdir << dendl;
9744 destdn = destdir->lookup(rollback.orig_dest.dname);
9745 if (destdn)
9746 dout(10) << " destdn " << *destdn << dendl;
9747 else
9748 dout(10) << " destdn not found" << dendl;
9749 } else
9750 dout(10) << " destdir not found" << dendl;
9751
9752 CInode *in = NULL;
9753 if (rollback.orig_src.ino) {
9754 in = mdcache->get_inode(rollback.orig_src.ino);
9755 if (in && in->is_dir())
9756 ceph_assert(srcdn && destdn);
9757 } else
9758 in = mdcache->get_inode(rollback.orig_src.remote_ino);
9759
9760 CDir *straydir = NULL;
9761 CDentry *straydn = NULL;
9762 if (rollback.stray.dirfrag.ino) {
9763 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
9764 if (straydir) {
9765 dout(10) << "straydir " << *straydir << dendl;
9766 straydn = straydir->lookup(rollback.stray.dname);
9767 if (straydn) {
9768 dout(10) << " straydn " << *straydn << dendl;
9769 ceph_assert(straydn->get_linkage()->is_primary());
9770 } else
9771 dout(10) << " straydn not found" << dendl;
9772 } else
9773 dout(10) << "straydir not found" << dendl;
9774 }
9775
9776 CInode *target = NULL;
9777 if (rollback.orig_dest.ino) {
9778 target = mdcache->get_inode(rollback.orig_dest.ino);
9779 if (target)
9780 ceph_assert(destdn && straydn);
9781 } else if (rollback.orig_dest.remote_ino)
9782 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
9783
9784 // can't use is_auth() in the resolve stage
9785 mds_rank_t whoami = mds->get_nodeid();
9786 // peer
9787 ceph_assert(!destdn || destdn->authority().first != whoami);
9788 ceph_assert(!straydn || straydn->authority().first != whoami);
9789
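// even if another rank is auth for the dentry, we may still need to journal the
// rename locally, e.g. when we are auth for dirfrags under the renamed directory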
9790 bool force_journal_src = false;
9791 bool force_journal_dest = false;
9792 if (in && in->is_dir() && srcdn->authority().first != whoami)
9793 force_journal_src = _need_force_journal(in, false);
9794 if (in && target && target->is_dir())
9795 force_journal_dest = _need_force_journal(in, true);
9796
9797 version_t srcdnpv = 0;
9798 // repair src
9799 if (srcdn) {
9800 if (srcdn->authority().first == whoami)
9801 srcdnpv = srcdn->pre_dirty();
9802 if (rollback.orig_src.ino) {
9803 ceph_assert(in);
9804 srcdn->push_projected_linkage(in);
9805 } else
9806 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
9807 rollback.orig_src.remote_d_type);
9808 }
9809
9810 map<client_t,ref_t<MClientSnap>> splits[2];
9811
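// roll back the source inode: restore its old ctime and undo any snaprealm
// change made by the rename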
9812 const CInode::mempool_inode *pip = nullptr;
9813 if (in) {
9814 bool projected;
9815 CDir *pdir = in->get_projected_parent_dir();
9816 if (pdir->authority().first == whoami) {
9817 auto pi = in->project_inode(mut);
9818 pi.inode->version = in->pre_dirty();
9819 if (pdir != srcdir) {
9820 auto pf = pdir->project_fnode(mut);
9821 pf->version = pdir->pre_dirty();
9822 }
9823 if (pi.inode->ctime == rollback.ctime)
9824 pi.inode->ctime = rollback.orig_src.old_ctime;
9825 projected = true;
9826 } else {
9827 if (in->get_inode()->ctime == rollback.ctime) {
9828 auto _inode = CInode::allocate_inode(*in->get_inode());
9829 _inode->ctime = rollback.orig_src.old_ctime;
9830 in->reset_inode(_inode);
9831 }
9832 projected = false;
9833 }
9834 pip = in->get_projected_inode().get();
9835
9836 if (rollback.srci_snapbl.length() && in->snaprealm) {
9837 bool hadrealm;
9838 auto p = rollback.srci_snapbl.cbegin();
9839 decode(hadrealm, p);
9840 if (hadrealm) {
9841 if (projected && !mds->is_resolve()) {
9842 sr_t *new_srnode = new sr_t();
9843 decode(*new_srnode, p);
9844 in->project_snaprealm(new_srnode);
9845 } else
9846 decode(in->snaprealm->srnode, p);
9847 } else {
9848 SnapRealm *realm;
9849 if (rollback.orig_src.ino) {
9850 ceph_assert(srcdir);
9851 realm = srcdir->get_inode()->find_snaprealm();
9852 } else {
9853 realm = in->snaprealm->parent;
9854 }
9855 if (!mds->is_resolve())
9856 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
9857 if (projected)
9858 in->project_snaprealm(NULL);
9859 else
9860 in->snaprealm->merge_to(realm);
9861 }
9862 }
9863 }
9864
9865 // repair dest
9866 if (destdn) {
9867 if (rollback.orig_dest.ino && target) {
9868 destdn->push_projected_linkage(target);
9869 } else if (rollback.orig_dest.remote_ino) {
9870 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
9871 rollback.orig_dest.remote_d_type);
9872 } else {
9873 // the dentry will be trimmed soon; it's OK for it to have the wrong linkage
9874 if (rollback.orig_dest.ino)
9875 ceph_assert(mds->is_resolve());
9876 destdn->push_projected_linkage();
9877 }
9878 }
9879
9880 if (straydn)
9881 straydn->push_projected_linkage();
9882
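// roll back the overwritten target inode (ctime, link count, snaprealm)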
9883 if (target) {
9884 bool projected;
9885 CInode::inode_ptr ti;
9886 CDir *pdir = target->get_projected_parent_dir();
9887 if (pdir->authority().first == whoami) {
9888 auto pi = target->project_inode(mut);
9889 pi.inode->version = target->pre_dirty();
9890 if (pdir != srcdir) {
9891 auto pf = pdir->project_fnode(mut);
9892 pf->version = pdir->pre_dirty();
9893 }
9894 ti = pi.inode;
9895 projected = true;
9896 } else {
9897 ti = CInode::allocate_inode(*target->get_inode());
9898 projected = false;
9899 }
9900
9901 if (ti->ctime == rollback.ctime)
9902 ti->ctime = rollback.orig_dest.old_ctime;
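// only restore the target's link count for a normal overwrite; a rename out of
// a stray dir (reintegration/migration) did not drop a link on the target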
9903 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
9904 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
9905 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
9906 else
9907 ceph_assert(rollback.orig_dest.remote_ino &&
9908 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
9909 } else
9910 ti->nlink++;
9911
9912 if (!projected)
9913 target->reset_inode(ti);
9914
9915 if (rollback.desti_snapbl.length() && target->snaprealm) {
9916 bool hadrealm;
9917 auto p = rollback.desti_snapbl.cbegin();
9918 decode(hadrealm, p);
9919 if (hadrealm) {
9920 if (projected && !mds->is_resolve()) {
9921 sr_t *new_srnode = new sr_t();
9922 decode(*new_srnode, p);
9923 target->project_snaprealm(new_srnode);
9924 } else
9925 decode(target->snaprealm->srnode, p);
9926 } else {
9927 SnapRealm *realm;
9928 if (rollback.orig_dest.ino) {
9929 ceph_assert(destdir);
9930 realm = destdir->get_inode()->find_snaprealm();
9931 } else {
9932 realm = target->snaprealm->parent;
9933 }
9934 if (!mds->is_resolve())
9935 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
9936 if (projected)
9937 target->project_snaprealm(NULL);
9938 else
9939 target->snaprealm->merge_to(realm);
9940 }
9941 }
9942 }
9943
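// undo the fragstat/rstat changes on the source dirfrag if we are auth for srcdn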
9944 if (srcdn && srcdn->authority().first == whoami) {
9945 nest_info_t empty_rstat;
9946 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
9947 in && in->is_dir(), pip ? pip->accounted_rstat : empty_rstat);
9948 }
9949
9950 if (srcdn)
9951 dout(0) << " srcdn back to " << *srcdn << dendl;
9952 if (in)
9953 dout(0) << " srci back to " << *in << dendl;
9954 if (destdn)
9955 dout(0) << " destdn back to " << *destdn << dendl;
9956 if (target)
9957 dout(0) << " desti back to " << *target << dendl;
9958
9959 // journal it
9960 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
9961 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
9962 mdlog->start_entry(le);
9963
9964 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
9965 le->commit.add_dir_context(srcdir);
9966 if (rollback.orig_src.ino)
9967 le->commit.add_primary_dentry(srcdn, 0, true);
9968 else
9969 le->commit.add_remote_dentry(srcdn, true);
9970 }
9971
9972 if (!rollback.orig_src.ino && // remote linkage
9973 in && in->authority().first == whoami) {
9974 le->commit.add_dir_context(in->get_projected_parent_dir());
9975 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9976 }
9977
9978 if (force_journal_dest) {
9979 ceph_assert(rollback.orig_dest.ino);
9980 le->commit.add_dir_context(destdir);
9981 le->commit.add_primary_dentry(destdn, 0, true);
9982 }
9983
9984 // peer: no need to journal straydn
9985
9986 if (target && target != in && target->authority().first == whoami) {
9987 ceph_assert(rollback.orig_dest.remote_ino);
9988 le->commit.add_dir_context(target->get_projected_parent_dir());
9989 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
9990 }
9991
9992 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
9993 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
9994 le->commit.renamed_dirino = in->ino();
9995 if (srcdn->authority().first == whoami) {
9996 auto&& ls = in->get_dirfrags();
9997 for (const auto& dir : ls) {
9998 if (!dir->is_auth())
9999 le->commit.renamed_dir_frags.push_back(dir->get_frag());
10000 }
10001 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
10002 }
10003 } else if (force_journal_dest) {
10004 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
10005 le->commit.renamed_dirino = target->ino();
10006 }
10007
10008 if (target && target->is_dir()) {
10009 ceph_assert(destdn);
10010 mdcache->project_subtree_rename(target, straydir, destdir);
10011 }
10012
10013 if (in && in->is_dir()) {
10014 ceph_assert(srcdn);
10015 mdcache->project_subtree_rename(in, destdir, srcdir);
10016 }
10017
10018 if (mdr && !mdr->more()->peer_update_journaled) {
10019 ceph_assert(le->commit.empty());
10020 mdlog->cancel_entry(le);
10021 mut->ls = NULL;
10022 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
10023 } else {
10024 ceph_assert(!le->commit.empty());
10025 if (mdr)
10026 mdr->more()->peer_update_journaled = false;
10027 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
10028 srcdn, srcdnpv, destdn, straydn,
10029 splits, finish_mdr);
10030 submit_mdlog_entry(le, fin, mdr, __func__);
10031 mdlog->flush();
10032 }
10033 }
10034
10035 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
10036 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
10037 map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
10038 {
10039 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
10040
10041 if (straydn) {
10042 straydn->get_dir()->unlink_inode(straydn);
10043 straydn->pop_projected_linkage();
10044 }
10045 if (destdn) {
10046 destdn->get_dir()->unlink_inode(destdn);
10047 destdn->pop_projected_linkage();
10048 }
10049 if (srcdn) {
10050 srcdn->pop_projected_linkage();
10051 if (srcdn->authority().first == mds->get_nodeid()) {
10052 srcdn->mark_dirty(srcdnpv, mut->ls);
10053 if (srcdn->get_linkage()->is_primary())
10054 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
10055 }
10056 }
10057
10058 mut->apply();
10059
10060 if (srcdn && srcdn->get_linkage()->is_primary()) {
10061 CInode *in = srcdn->get_linkage()->get_inode();
10062 if (in && in->is_dir()) {
10063 ceph_assert(destdn);
10064 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
10065 }
10066 }
10067
10068 if (destdn) {
10069 CInode *oldin = destdn->get_linkage()->get_inode();
10070 // update subtree map?
10071 if (oldin && oldin->is_dir()) {
10072 ceph_assert(straydn);
10073 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
10074 }
10075 }
10076
10077 if (mds->is_resolve()) {
10078 CDir *root = NULL;
10079 if (straydn)
10080 root = mdcache->get_subtree_root(straydn->get_dir());
10081 else if (destdn)
10082 root = mdcache->get_subtree_root(destdn->get_dir());
10083 if (root)
10084 mdcache->try_trim_non_auth_subtree(root);
10085 } else {
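// not resolving: send the queued snaprealm updates to the affected clients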
10086 mdcache->send_snaps(splits[1]);
10087 mdcache->send_snaps(splits[0]);
10088 }
10089
10090 if (mdr) {
10091 MDSContext::vec finished;
10092 if (mdr->more()->is_ambiguous_auth) {
10093 if (srcdn->is_auth())
10094 mdr->more()->rename_inode->unfreeze_inode(finished);
10095
10096 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10097 mdr->more()->is_ambiguous_auth = false;
10098 }
10099 mds->queue_waiters(finished);
10100 if (finish_mdr || mdr->aborted)
10101 mdcache->request_finish(mdr);
10102 else
10103 mdr->more()->peer_rolling_back = false;
10104 }
10105
10106 mdcache->finish_rollback(mut->reqid, mdr);
10107
10108 mut->cleanup();
10109 }
10110
10111 void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10112 {
10113 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10114 << " witnessed by " << ack->get_source()
10115 << " " << *ack << dendl;
10116 mds_rank_t from = mds_rank_t(ack->get_source().num());
10117
10118 // note peer
10119 mdr->more()->peers.insert(from);
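// the srcdn auth mds has frozen the renamed inode for us, so treat its
// authority as ambiguous until the rename completes or rolls back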
10120 if (mdr->more()->srcdn_auth_mds == from &&
10121 mdr->more()->is_remote_frozen_authpin &&
10122 !mdr->more()->is_ambiguous_auth) {
10123 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
10124 }
10125
10126 // witnessed? or add extra witnesses?
10127 ceph_assert(mdr->more()->witnessed.count(from) == 0);
10128 if (ack->is_interrupted()) {
10129 dout(10) << " peer request interrupted, noop" << dendl;
10130 } else if (ack->witnesses.empty()) {
10131 mdr->more()->witnessed.insert(from);
10132 if (!ack->is_not_journaled())
10133 mdr->more()->has_journaled_peers = true;
10134 } else {
10135 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
10136 mdr->more()->extra_witnesses = ack->witnesses;
10137 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
10138 }
10139
10140 // srci import?
10141 if (ack->inode_export.length()) {
10142 dout(10) << " got srci import" << dendl;
10143 mdr->more()->inode_import.share(ack->inode_export);
10144 mdr->more()->inode_import_v = ack->inode_export_v;
10145 }
10146
10147 // remove from waiting list
10148 ceph_assert(mdr->more()->waiting_on_peer.count(from));
10149 mdr->more()->waiting_on_peer.erase(from);
10150
10151 if (mdr->more()->waiting_on_peer.empty())
10152 dispatch_client_request(mdr); // go again!
10153 else
10154 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
10155 }
10156
10157 void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10158 {
10159 dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
10160 << ack->get_source() << dendl;
10161 ceph_assert(mdr->is_peer());
10162 mds_rank_t from = mds_rank_t(ack->get_source().num());
10163
10164 if (mdr->more()->waiting_on_peer.count(from)) {
10165 mdr->more()->waiting_on_peer.erase(from);
10166
10167 if (mdr->more()->waiting_on_peer.empty()) {
10168 if (mdr->peer_request)
10169 dispatch_peer_request(mdr);
10170 } else
10171 dout(10) << " still waiting for rename notify acks from "
10172 << mdr->more()->waiting_on_peer << dendl;
10173 }
10174 }
10175
10176 void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
10177 {
10178 dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
10179
10180 if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
10181 mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);
10182
10183 if (mdr->more()->waiting_on_peer.empty()) {
10184 if (mdr->peer_request)
10185 dispatch_peer_request(mdr);
10186 } else
10187 dout(10) << " still waiting for rename notify acks from "
10188 << mdr->more()->waiting_on_peer << dendl;
10189 }
10190 }
10191
10192 // snaps
10193 /* This function takes responsibility for the passed mdr. */
10194 void Server::handle_client_lssnap(MDRequestRef& mdr)
10195 {
10196 const cref_t<MClientRequest> &req = mdr->client_request;
10197
10198 // traverse to path
10199 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10200 if (!diri)
10201 return;
10202
10203 if (!diri->is_dir()) {
10204 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10205 return;
10206 }
10207 dout(10) << "lssnap on " << *diri << dendl;
10208
10209 // lock snap
10210 if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
10211 return;
10212
10213 if (!check_access(mdr, diri, MAY_READ))
10214 return;
10215
10216 SnapRealm *realm = diri->find_snaprealm();
10217 map<snapid_t,const SnapInfo*> infomap;
10218 realm->get_snap_info(infomap, diri->get_oldest_snap());
10219
10220 unsigned max_entries = req->head.args.readdir.max_entries;
10221 if (!max_entries)
10222 max_entries = infomap.size();
10223 int max_bytes = req->head.args.readdir.max_bytes;
10224 if (!max_bytes)
10225 // make sure at least one item can be encoded
10226 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
10227
10228 __u64 last_snapid = 0;
10229 string offset_str = req->get_path2();
10230 if (!offset_str.empty())
10231 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
10232
10233 // empty DirStat
10234 bufferlist dirbl;
10235 static DirStat empty;
10236 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
10237
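// leave room in max_bytes for the DirStat header and the count/flags appended below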
10238 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
10239
10240 __u32 num = 0;
10241 bufferlist dnbl;
10242 auto p = infomap.upper_bound(last_snapid);
10243 for (; p != infomap.end() && num < max_entries; ++p) {
10244 dout(10) << p->first << " -> " << *p->second << dendl;
10245
10246 // actual
10247 string snap_name;
10248 if (p->second->ino == diri->ino())
10249 snap_name = p->second->name;
10250 else
10251 snap_name = p->second->get_long_name();
10252
10253 unsigned start_len = dnbl.length();
10254 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
10255 break;
10256
10257 encode(snap_name, dnbl);
10258 // infinite lease
10259 LeaseStat e(CEPH_LEASE_VALID, -1, 0);
10260 mds->locker->encode_lease(dnbl, mdr->session->info, e);
10261 dout(20) << "encode_infinite_lease" << dendl;
10262
10263 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
10264 if (r < 0) {
10265 bufferlist keep;
10266 keep.substr_of(dnbl, 0, start_len);
10267 dnbl.swap(keep);
10268 break;
10269 }
10270 ++num;
10271 }
10272
10273 encode(num, dirbl);
10274 __u16 flags = 0;
10275 if (p == infomap.end()) {
10276 flags = CEPH_READDIR_FRAG_END;
10277 if (last_snapid == 0)
10278 flags |= CEPH_READDIR_FRAG_COMPLETE;
10279 }
10280 encode(flags, dirbl);
10281 dirbl.claim_append(dnbl);
10282
10283 mdr->reply_extra_bl = dirbl;
10284 mdr->tracei = diri;
10285 respond_to_request(mdr, 0);
10286 }
10287
10288
10289 // MKSNAP
10290
10291 struct C_MDS_mksnap_finish : public ServerLogContext {
10292 CInode *diri;
10293 SnapInfo info;
10294 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
10295 ServerLogContext(s, r), diri(di), info(i) {}
10296 void finish(int r) override {
10297 server->_mksnap_finish(mdr, diri, info);
10298 }
10299 };
10300
10301 /* This function takes responsibility for the passed mdr. */
10302 void Server::handle_client_mksnap(MDRequestRef& mdr)
10303 {
10304 const cref_t<MClientRequest> &req = mdr->client_request;
10305 // make sure we have as new a map as the client
10306 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
10307 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
10308 return;
10309 }
10310 if (!mds->mdsmap->allows_snaps()) {
10311 // snapshot creation is disabled until the fs's allow_new_snaps setting is enabled
10312 dout(5) << "new snapshots are disabled for this fs" << dendl;
10313 respond_to_request(mdr, -CEPHFS_EPERM);
10314 return;
10315 }
10316
10317 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10318 if (!diri)
10319 return;
10320
10321 // dir only
10322 if (!diri->is_dir()) {
10323 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10324 return;
10325 }
10326 if (diri->is_system() && !diri->is_root()) {
10327 // no snaps in system dirs (root is ok)
10328 dout(5) << "is an internal system dir" << dendl;
10329 respond_to_request(mdr, -CEPHFS_EPERM);
10330 return;
10331 }
10332
10333 std::string_view snapname = req->get_filepath().last_dentry();
10334
10335 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10336 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
10337 respond_to_request(mdr, -CEPHFS_EPERM);
10338 return;
10339 }
10340
10341 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
10342
10343 // lock snap
10344 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10345 MutationImpl::LockOpVec lov;
10346 lov.add_xlock(&diri->snaplock);
10347 if (!mds->locker->acquire_locks(mdr, lov))
10348 return;
10349
10350 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10351 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10352 return;
10353 }
10354 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10355 }
10356
10357 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10358 return;
10359
10360 if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
10361 (subvol_ino && subvol_ino != diri->ino())) {
10362 dout(5) << "is a descendant of a subvolume dir" << dendl;
10363 respond_to_request(mdr, -CEPHFS_EPERM);
10364 return;
10365 }
10366
10367 // check if we can create any more snapshots
10368 // we don't allow any more if we are already at or beyond the limit
10369 if (diri->snaprealm &&
10370 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
10371 respond_to_request(mdr, -CEPHFS_EMLINK);
10372 return;
10373 }
10374
10375 // make sure name is unique
10376 if (diri->snaprealm &&
10377 diri->snaprealm->exists(snapname)) {
10378 respond_to_request(mdr, -CEPHFS_EEXIST);
10379 return;
10380 }
10381 if (snapname.length() == 0 ||
10382 snapname[0] == '_') {
10383 respond_to_request(mdr, -CEPHFS_EINVAL);
10384 return;
10385 }
10386
10387 // allocate a snapid
10388 if (!mdr->more()->stid) {
10389 // prepare an stid
10390 mds->snapclient->prepare_create(diri->ino(), snapname,
10391 mdr->get_mds_stamp(),
10392 &mdr->more()->stid, &mdr->more()->snapidbl,
10393 new C_MDS_RetryRequest(mdcache, mdr));
10394 return;
10395 }
10396
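// the snap table prepare has completed; decode the snapid it allocated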
10397 version_t stid = mdr->more()->stid;
10398 snapid_t snapid;
10399 auto p = mdr->more()->snapidbl.cbegin();
10400 decode(snapid, p);
10401 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
10402
10403 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10404
10405 SnapPayload payload;
10406 if (req->get_data().length()) {
10407 try {
10408 auto iter = req->get_data().cbegin();
10409 decode(payload, iter);
10410 } catch (const ceph::buffer::error &e) {
10411 // backward compat -- client sends xattr bufferlist. however,
10412 // that is not used anywhere -- so (log and) ignore.
10413 dout(20) << ": no metadata in payload (old client?)" << dendl;
10414 }
10415 }
10416
10417 // journal
10418 SnapInfo info;
10419 info.ino = diri->ino();
10420 info.snapid = snapid;
10421 info.name = snapname;
10422 info.stamp = mdr->get_op_stamp();
10423 info.metadata = payload.metadata;
10424
10425 auto pi = diri->project_inode(mdr, false, true);
10426 pi.inode->ctime = info.stamp;
10427 if (info.stamp > pi.inode->rstat.rctime)
10428 pi.inode->rstat.rctime = info.stamp;
10429 pi.inode->rstat.rsnaps++;
10430 pi.inode->version = diri->pre_dirty();
10431
10432 // project the snaprealm
10433 auto &newsnap = *pi.snapnode;
10434 newsnap.created = snapid;
10435 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
10436 if (!em.second)
10437 em.first->second = info;
10438 newsnap.seq = snapid;
10439 newsnap.last_created = snapid;
10440
10441 // journal the inode changes
10442 mdr->ls = mdlog->get_current_segment();
10443 EUpdate *le = new EUpdate(mdlog, "mksnap");
10444 mdlog->start_entry(le);
10445
10446 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10447 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10448 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10449 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10450
10451 // journal the snaprealm changes
10452 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
10453 mdr, __func__);
10454 mdlog->flush();
10455 }
10456
10457 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
10458 {
10459 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
10460
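// if the dir had no snaprealm before this snapshot, creating one splits
// clients out of the parent realm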
10461 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
10462
10463 mdr->apply();
10464
10465 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
10466
10467 // create snap
10468 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10469
10470 // notify other mds
10471 mdcache->send_snap_update(diri, mdr->more()->stid, op);
10472
10473 mdcache->do_realm_invalidate_and_update_notify(diri, op);
10474
10475 // yay
10476 mdr->in[0] = diri;
10477 mdr->snapid = info.snapid;
10478 mdr->tracei = diri;
10479 respond_to_request(mdr, 0);
10480 }
10481
10482
10483 // RMSNAP
10484
10485 struct C_MDS_rmsnap_finish : public ServerLogContext {
10486 CInode *diri;
10487 snapid_t snapid;
10488 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10489 ServerLogContext(s, r), diri(di), snapid(sn) {}
10490 void finish(int r) override {
10491 server->_rmsnap_finish(mdr, diri, snapid);
10492 }
10493 };
10494
10495 /* This function takes responsibility for the passed mdr. */
10496 void Server::handle_client_rmsnap(MDRequestRef& mdr)
10497 {
10498 const cref_t<MClientRequest> &req = mdr->client_request;
10499
10500 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10501 if (!diri)
10502 return;
10503
10504 if (!diri->is_dir()) {
10505 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10506 return;
10507 }
10508
10509 std::string_view snapname = req->get_filepath().last_dentry();
10510
10511 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10512 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
10513 respond_to_request(mdr, -CEPHFS_EPERM);
10514 return;
10515 }
10516
10517 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
10518
10519 // does snap exist?
10520 if (snapname.length() == 0 || snapname[0] == '_') {
10521 respond_to_request(mdr, -CEPHFS_EINVAL); // can't prune a parent snap, currently.
10522 return;
10523 }
10524 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
10525 respond_to_request(mdr, -CEPHFS_ENOENT);
10526 return;
10527 }
10528 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
10529 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
10530
10531 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10532 MutationImpl::LockOpVec lov;
10533 lov.add_xlock(&diri->snaplock);
10534 if (!mds->locker->acquire_locks(mdr, lov))
10535 return;
10536 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10537 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10538 return;
10539 }
10540 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10541 }
10542
10543 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10544 return;
10545
10546 // prepare
10547 if (!mdr->more()->stid) {
10548 mds->snapclient->prepare_destroy(diri->ino(), snapid,
10549 &mdr->more()->stid, &mdr->more()->snapidbl,
10550 new C_MDS_RetryRequest(mdcache, mdr));
10551 return;
10552 }
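// the snap table prepare has completed; decode the new snap sequence number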
10553 version_t stid = mdr->more()->stid;
10554 auto p = mdr->more()->snapidbl.cbegin();
10555 snapid_t seq;
10556 decode(seq, p);
10557 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
10558
10559 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10560
10561 // journal
10562 auto pi = diri->project_inode(mdr, false, true);
10563 pi.inode->version = diri->pre_dirty();
10564 pi.inode->ctime = mdr->get_op_stamp();
10565 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
10566 pi.inode->rstat.rctime = mdr->get_op_stamp();
10567 pi.inode->rstat.rsnaps--;
10568
10569 mdr->ls = mdlog->get_current_segment();
10570 EUpdate *le = new EUpdate(mdlog, "rmsnap");
10571 mdlog->start_entry(le);
10572
10573 // project the snaprealm
10574 auto &newnode = *pi.snapnode;
10575 newnode.snaps.erase(snapid);
10576 newnode.seq = seq;
10577 newnode.last_destroyed = seq;
10578
10579 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10580 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10581 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10582 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10583
10584 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
10585 mdr, __func__);
10586 mdlog->flush();
10587 }
10588
10589 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
10590 {
10591 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
10592 snapid_t stid = mdr->more()->stid;
10593 auto p = mdr->more()->snapidbl.cbegin();
10594 snapid_t seq;
10595 decode(seq, p);
10596
10597 mdr->apply();
10598
10599 mds->snapclient->commit(stid, mdr->ls);
10600
10601 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10602
10603 // notify other mds
10604 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
10605
10606 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
10607
10608 // yay
10609 mdr->in[0] = diri;
10610 respond_to_request(mdr, 0);
10611
10612 // purge snapshot data
10613 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
10614 }
10615
10616 struct C_MDS_renamesnap_finish : public ServerLogContext {
10617 CInode *diri;
10618 snapid_t snapid;
10619 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10620 ServerLogContext(s, r), diri(di), snapid(sn) {}
10621 void finish(int r) override {
10622 server->_renamesnap_finish(mdr, diri, snapid);
10623 }
10624 };
10625
10626 /* This function takes responsibility for the passed mdr. */
10627 void Server::handle_client_renamesnap(MDRequestRef& mdr)
10628 {
10629 const cref_t<MClientRequest> &req = mdr->client_request;
10630 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
10631 respond_to_request(mdr, -CEPHFS_EINVAL);
10632 return;
10633 }
10634
10635 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10636 if (!diri)
10637 return;
10638
10639 if (!diri->is_dir()) { // dir only
10640 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10641 return;
10642 }
10643
10644 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
10645 mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10646 respond_to_request(mdr, -CEPHFS_EPERM);
10647 return;
10648 }
10649
10650 std::string_view dstname = req->get_filepath().last_dentry();
10651 std::string_view srcname = req->get_filepath2().last_dentry();
10652 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
10653
10654 if (srcname.length() == 0 || srcname[0] == '_') {
10655 respond_to_request(mdr, -CEPHFS_EINVAL); // can't rename a parent snap.
10656 return;
10657 }
10658 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
10659 respond_to_request(mdr, -CEPHFS_ENOENT);
10660 return;
10661 }
10662 if (dstname.length() == 0 || dstname[0] == '_') {
10663 respond_to_request(mdr, -CEPHFS_EINVAL);
10664 return;
10665 }
10666 if (diri->snaprealm->exists(dstname)) {
10667 respond_to_request(mdr, -CEPHFS_EEXIST);
10668 return;
10669 }
10670
10671 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
10672 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
10673
10674 // lock snap
10675 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10676 MutationImpl::LockOpVec lov;
10677 lov.add_xlock(&diri->snaplock);
10678 if (!mds->locker->acquire_locks(mdr, lov))
10679 return;
10680 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10681 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10682 return;
10683 }
10684 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10685 }
10686
10687 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10688 return;
10689
10690 // prepare
10691 if (!mdr->more()->stid) {
10692 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
10693 &mdr->more()->stid,
10694 new C_MDS_RetryRequest(mdcache, mdr));
10695 return;
10696 }
10697
10698 version_t stid = mdr->more()->stid;
10699 dout(10) << " stid is " << stid << dendl;
10700
10701 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10702
10703 // journal
10704 auto pi = diri->project_inode(mdr, false, true);
10705 pi.inode->ctime = mdr->get_op_stamp();
10706 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
10707 pi.inode->rstat.rctime = mdr->get_op_stamp();
10708 pi.inode->version = diri->pre_dirty();
10709
10710 // project the snaprealm
10711 auto &newsnap = *pi.snapnode;
10712 auto it = newsnap.snaps.find(snapid);
10713 ceph_assert(it != newsnap.snaps.end());
10714 it->second.name = dstname;
10715
10716 // journal the inode changes
10717 mdr->ls = mdlog->get_current_segment();
10718 EUpdate *le = new EUpdate(mdlog, "renamesnap");
10719 mdlog->start_entry(le);
10720
10721 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10722 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10723 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10724 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10725
10726 // journal the snaprealm changes
10727 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
10728 mdr, __func__);
10729 mdlog->flush();
10730 }
10731
10732 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
10733 {
10734 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
10735
10736 mdr->apply();
10737
10738 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
10739
10740 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10741
10742 // notify other mds
10743 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
10744
10745 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
10746
10747 // yay
10748 mdr->in[0] = diri;
10749 mdr->tracei = diri;
10750 mdr->snapid = snapid;
10751 respond_to_request(mdr, 0);
10752 }
10753
10754 /**
10755 * Return true if the server is in state RECONNECT and this
10756 * client has not yet reconnected.
10757 */
10758 bool Server::waiting_for_reconnect(client_t c) const
10759 {
10760 return client_reconnect_gather.count(c) > 0;
10761 }
10762
10763 void Server::dump_reconnect_status(Formatter *f) const
10764 {
10765 f->open_object_section("reconnect_status");
10766 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
10767 f->close_section();
10768 }