1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
17
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
21
22 #include "MDSRank.h"
23 #include "Server.h"
24 #include "Locker.h"
25 #include "MDCache.h"
26 #include "MDLog.h"
27 #include "Migrator.h"
28 #include "MDBalancer.h"
29 #include "InoTable.h"
30 #include "SnapClient.h"
31 #include "Mutation.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
34
35 #include "msg/Messenger.h"
36
37 #include "osdc/Objecter.h"
38
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
45
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
53
54 #include <errno.h>
55
56 #include <list>
57 #include <regex>
58 #include <string_view>
59 #include <functional>
60
61 #include "common/config.h"
62
63 #define dout_context g_ceph_context
64 #define dout_subsys ceph_subsys_mds
65 #undef dout_prefix
66 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
67
68 using namespace std;
69
70 class ServerContext : public MDSContext {
71 protected:
72 Server *server;
73 MDSRank *get_mds() override
74 {
75 return server->mds;
76 }
77
78 public:
79 explicit ServerContext(Server *s) : server(s) {
80 ceph_assert(server != NULL);
81 }
82 };
83
84 class Batch_Getattr_Lookup : public BatchOp {
85 protected:
86 Server* server;
87 ceph::ref_t<MDRequestImpl> mdr;
88 std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
89 int res = 0;
90 public:
91 Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
92 : server(s), mdr(r) {
93 if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP)
94 mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
95 else
96 mdr->batch_op_map = &mdr->in[0]->batch_ops;
97 }
98 void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
99 batch_reqs.push_back(r);
100 }
101 ceph::ref_t<MDRequestImpl> find_new_head() override {
102 while (!batch_reqs.empty()) {
103 auto r = std::move(batch_reqs.back());
104 batch_reqs.pop_back();
105 if (r->killed)
106 continue;
107
108 r->batch_op_map = mdr->batch_op_map;
109 mdr->batch_op_map = nullptr;
110 mdr = r;
111 return mdr;
112 }
113 return nullptr;
114 }
115 void _forward(mds_rank_t t) override {
116 MDCache* mdcache = server->mdcache;
117 mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
118 mdr->set_mds_stamp(ceph_clock_now());
119 for (auto& m : batch_reqs) {
120 if (!m->killed)
121 mdcache->request_forward(m, t);
122 }
123 batch_reqs.clear();
124 }
125 void _respond(int r) override {
126 mdr->set_mds_stamp(ceph_clock_now());
127 for (auto& m : batch_reqs) {
128 if (!m->killed) {
129 m->tracei = mdr->tracei;
130 m->tracedn = mdr->tracedn;
131 server->respond_to_request(m, r);
132 }
133 }
134 batch_reqs.clear();
135 server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
136 }
137 void print(std::ostream& o) {
138 o << "[batch front=" << *mdr << "]";
139 }
140 };
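// A minimal sketch of how a batch head is expected to be used (an assumption
// drawn from the class above and its BatchOp interface, not a verbatim copy
// of the getattr/lookup path; 'later_mdr' is a hypothetical second request):
//
//   auto batch = std::make_unique<Batch_Getattr_Lookup>(this, mdr); // first req becomes the head
//   batch->add_request(later_mdr);  // duplicate getattr/lookup piggybacks on the head
//   ...
//   batch->_respond(0);             // head finishes; every queued request gets the same trace/reply
//
// If the head is killed, find_new_head() promotes one of the queued requests
// so the batch can still make progress.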
141
142 class ServerLogContext : public MDSLogContextBase {
143 protected:
144 Server *server;
145 MDSRank *get_mds() override
146 {
147 return server->mds;
148 }
149
150 MDRequestRef mdr;
151 void pre_finish(int r) override {
152 if (mdr)
153 mdr->mark_event("journal_committed: ");
154 }
155 public:
156 explicit ServerLogContext(Server *s) : server(s) {
157 ceph_assert(server != NULL);
158 }
159 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
160 ceph_assert(server != NULL);
161 }
162 };
163
164 void Server::create_logger()
165 {
166 PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
167
168 plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
169 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
170 plb.add_u64_counter(l_mdss_handle_peer_request, "handle_peer_request",
171 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
172 plb.add_u64_counter(l_mdss_handle_client_session,
173 "handle_client_session", "Client session messages", "hcs",
174 PerfCountersBuilder::PRIO_INTERESTING);
175 plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
176 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
177 plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
178 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
179 PerfCountersBuilder::PRIO_INTERESTING);
180
181 // fop latencies are useful
182 plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
183 plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
184 "Request type lookup hash of inode latency");
185 plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
186 "Request type lookup inode latency");
187 plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
188 "Request type lookup parent latency");
189 plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
190 "Request type lookup name latency");
191 plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
192 "Request type lookup latency");
193 plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
194 "Request type lookup snapshot latency");
195 plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
196 "Request type get attribute latency");
197 plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
198 "Request type set attribute latency");
199 plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
200 "Request type set file layout latency");
201 plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
202 "Request type set directory layout latency");
203 plb.add_time_avg(l_mdss_req_getvxattr_latency, "req_getvxattr_latency",
204 "Request type get virtual extended attribute latency");
205 plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
206 "Request type set extended attribute latency");
207 plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
208 "Request type remove extended attribute latency");
209 plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
210 "Request type read directory latency");
211 plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
212 "Request type set file lock latency");
213 plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
214 "Request type get file lock latency");
215 plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
216 "Request type create latency");
217 plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
218 "Request type open latency");
219 plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
220 "Request type make node latency");
221 plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
222 "Request type link latency");
223 plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
224 "Request type unlink latency");
225 plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
226 "Request type remove directory latency");
227 plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
228 "Request type rename latency");
229 plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
230 "Request type make directory latency");
231 plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
232 "Request type symbolic link latency");
233 plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
234 "Request type list snapshot latency");
235 plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
236 "Request type make snapshot latency");
237 plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
238 "Request type remove snapshot latency");
239 plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
240 "Request type rename snapshot latency");
241
242 plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
243 plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
244 "Client requests dispatched");
245 plb.add_u64_counter(l_mdss_dispatch_peer_request, "dispatch_server_request",
246 "Server requests dispatched");
247
248 logger = plb.create_perf_counters();
249 g_ceph_context->get_perfcounters_collection()->add(logger);
250 }
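// These counters are exported under the "mds_server" logger; on a live daemon
// they can be inspected with something like (illustrative invocation):
//
//   ceph daemon mds.<id> perf dump mds_server
//
// which is how the request-latency and throttle counters registered above
// become visible to operators.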
251
252 Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
253 mds(m),
254 mdcache(mds->mdcache), mdlog(mds->mdlog),
255 recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
256 metrics_handler(metrics_handler)
257 {
258 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
259 replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
260 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
261 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
262 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
263 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
264 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
265 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
266 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
267 dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
268 bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
269 supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
270 supported_metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
271 }
272
273 void Server::dispatch(const cref_t<Message> &m)
274 {
275 switch (m->get_type()) {
276 case CEPH_MSG_CLIENT_RECONNECT:
277 handle_client_reconnect(ref_cast<MClientReconnect>(m));
278 return;
279 }
280
281 /*
282 *In reconnect phase, clients may send unsafe requests to the mds before the reconnect msg. Setting sessionclosed_isok will handle scenarios like this:
283
284 1. In reconnect phase, clients sent unsafe requests to the mds.
285 2. The reconnect timeout was reached. All sessions that did not send a reconnect msg in time, some of which may have sent unsafe requests, are marked as closed.
286 (Another situation is #31668, which will deny all client reconnect msgs to speed up reboot.)
287 3. So these unsafe requests, from sessions that did not reconnect in time or were denied, can still be handled in the clientreplay phase.
288
289 */
290 bool sessionclosed_isok = replay_unsafe_with_closed_session;
291 // active?
292 // handle_peer_request()/handle_client_session() will wait if necessary
293 if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
294 const auto &req = ref_cast<MClientRequest>(m);
295 if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
296 Session *session = mds->get_session(req);
297 if (!session || (!session->is_open() && !sessionclosed_isok)) {
298 dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
299 return;
300 }
301 bool queue_replay = false;
302 if (req->is_replay() || req->is_async()) {
303 dout(3) << "queuing replayed op" << dendl;
304 queue_replay = true;
305 if (req->head.ino &&
306 !session->have_completed_request(req->get_reqid().tid, nullptr)) {
307 inodeno_t ino(req->head.ino);
308 mdcache->add_replay_ino_alloc(ino);
309 if (replay_unsafe_with_closed_session &&
310 session->free_prealloc_inos.contains(ino)) {
311 // don't purge inodes that will be created by later replay
312 session->free_prealloc_inos.erase(ino);
313 session->delegated_inos.insert(ino);
314 }
315 }
316 } else if (req->get_retry_attempt()) {
317 // process completed request in clientreplay stage. The completed request
318 // might have created a new file/directory. This guarantees the MDS sends a reply
319 // to the client before another request modifies the new file/directory.
320 if (session->have_completed_request(req->get_reqid().tid, NULL)) {
321 dout(3) << "queuing completed op" << dendl;
322 queue_replay = true;
323 }
324 // this request was created before the cap reconnect message, drop any embedded
325 // cap releases.
326 req->releases.clear();
327 }
328 if (queue_replay) {
329 req->mark_queued_for_replay();
330 mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
331 return;
332 }
333 }
334
335 bool wait_for_active = true;
336 if (mds->is_stopping()) {
337 wait_for_active = false;
338 } else if (mds->is_clientreplay()) {
339 if (req->is_queued_for_replay()) {
340 wait_for_active = false;
341 }
342 }
343 if (wait_for_active) {
344 dout(3) << "not active yet, waiting" << dendl;
345 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
346 return;
347 }
348 }
349
350 switch (m->get_type()) {
351 case CEPH_MSG_CLIENT_SESSION:
352 handle_client_session(ref_cast<MClientSession>(m));
353 return;
354 case CEPH_MSG_CLIENT_REQUEST:
355 handle_client_request(ref_cast<MClientRequest>(m));
356 return;
357 case CEPH_MSG_CLIENT_RECLAIM:
358 handle_client_reclaim(ref_cast<MClientReclaim>(m));
359 return;
360 case MSG_MDS_PEER_REQUEST:
361 handle_peer_request(ref_cast<MMDSPeerRequest>(m));
362 return;
363 default:
364 derr << "server unknown message " << m->get_type() << dendl;
365 ceph_abort_msg("server unknown message");
366 }
367 }
368
369
370
371 // ----------------------------------------------------------
372 // SESSION management
373
374 class C_MDS_session_finish : public ServerLogContext {
375 Session *session;
376 uint64_t state_seq;
377 bool open;
378 version_t cmapv;
379 interval_set<inodeno_t> inos_to_free;
380 version_t inotablev;
381 interval_set<inodeno_t> inos_to_purge;
382 LogSegment *ls = nullptr;
383 Context *fin;
384 public:
385 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = nullptr) :
386 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
387 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
388 const interval_set<inodeno_t>& to_free, version_t iv,
389 const interval_set<inodeno_t>& to_purge, LogSegment *_ls, Context *fin_ = nullptr) :
390 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv),
391 inos_to_free(to_free), inotablev(iv), inos_to_purge(to_purge), ls(_ls), fin(fin_) {}
392 void finish(int r) override {
393 ceph_assert(r == 0);
394 server->_session_logged(session, state_seq, open, cmapv, inos_to_free, inotablev, inos_to_purge, ls);
395 if (fin) {
396 fin->complete(r);
397 }
398 }
399 };
400
401 Session* Server::find_session_by_uuid(std::string_view uuid)
402 {
403 Session* session = nullptr;
404 for (auto& it : mds->sessionmap.get_sessions()) {
405 auto& metadata = it.second->info.client_metadata;
406
407 auto p = metadata.find("uuid");
408 if (p == metadata.end() || p->second != uuid)
409 continue;
410
411 if (!session) {
412 session = it.second;
413 } else if (!session->reclaiming_from) {
414 ceph_assert(it.second->reclaiming_from == session);
415 session = it.second;
416 } else {
417 ceph_assert(session->reclaiming_from == it.second);
418 }
419 }
420 return session;
421 }
422
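/**
 * Session reclaim (summary of the flow implemented below; wording is
 * descriptive, not normative): a client that restarted with the same "uuid"
 * metadata sends MClientReclaim. reclaim_session() matches the uuid to the
 * old session, checks that both sessions share the same auth_name, records
 * the old session as reclaiming_from and, for CEPH_RECLAIM_RESET, hands off
 * to finish_reclaim_session(), which evicts or kills the old session and
 * finally replies with MClientReclaimReply carrying the old addrs and the
 * current OSD epoch.
 */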
423 void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
424 {
425 if (!session->is_open() && !session->is_stale()) {
426 dout(10) << "session not open, dropping this req" << dendl;
427 return;
428 }
429
430 auto reply = make_message<MClientReclaimReply>(0);
431 if (m->get_uuid().empty()) {
432 dout(10) << __func__ << " invalid message (no uuid)" << dendl;
433 reply->set_result(-CEPHFS_EINVAL);
434 mds->send_message_client(reply, session);
435 return;
436 }
437
438 unsigned flags = m->get_flags();
439 if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
440 dout(10) << __func__ << " unsupported flags" << dendl;
441 reply->set_result(-CEPHFS_EOPNOTSUPP);
442 mds->send_message_client(reply, session);
443 return;
444 }
445
446 Session* target = find_session_by_uuid(m->get_uuid());
447 if (target) {
448 if (session->info.auth_name != target->info.auth_name) {
449 dout(10) << __func__ << " session auth_name " << session->info.auth_name
450 << " != target auth_name " << target->info.auth_name << dendl;
451 reply->set_result(-CEPHFS_EPERM);
452 mds->send_message_client(reply, session);
453 }
454
455 ceph_assert(!target->reclaiming_from);
456 ceph_assert(!session->reclaiming_from);
457 session->reclaiming_from = target;
458 reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
459 }
460
461 if (flags & CEPH_RECLAIM_RESET) {
462 finish_reclaim_session(session, reply);
463 return;
464 }
465
466 ceph_abort();
467 }
468
469 void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
470 {
471 Session *target = session->reclaiming_from;
472 if (target) {
473 session->reclaiming_from = nullptr;
474
475 Context *send_reply;
476 if (reply) {
477 int64_t session_id = session->get_client().v;
478 send_reply = new LambdaContext([this, session_id, reply](int r) {
479 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
480 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
481 if (!session) {
482 return;
483 }
484 auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
485 reply->set_epoch(epoch);
486 mds->send_message_client(reply, session);
487 });
488 } else {
489 send_reply = nullptr;
490 }
491
492 bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
493 return map.is_blocklisted(target->info.inst.addr);
494 });
495
496 if (blocklisted || !g_conf()->mds_session_blocklist_on_evict) {
497 kill_session(target, send_reply);
498 } else {
499 CachedStackStringStream css;
500 mds->evict_client(target->get_client().v, false, true, *css, send_reply);
501 }
502 } else if (reply) {
503 mds->send_message_client(reply, session);
504 }
505 }
506
507 void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
508 {
509 Session *session = mds->get_session(m);
510 dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
511 ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
512
513 if (!session) {
514 dout(0) << " ignoring sessionless msg " << *m << dendl;
515 return;
516 }
517
518 std::string_view fs_name = mds->mdsmap->get_fs_name();
519 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
520 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
521 return;
522 }
523
524 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
525 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
526 return;
527 }
528
529 if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
530 finish_reclaim_session(session);
531 } else {
532 reclaim_session(session, m);
533 }
534 }
535
536 void Server::handle_client_session(const cref_t<MClientSession> &m)
537 {
538 version_t pv;
539 Session *session = mds->get_session(m);
540
541 dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
542 ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
543
544 if (!session) {
545 dout(0) << " ignoring sessionless msg " << *m << dendl;
546 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
547 reply->metadata["error_string"] = "sessionless";
548 mds->send_message(reply, m->get_connection());
549 return;
550 }
551
552 std::string_view fs_name = mds->mdsmap->get_fs_name();
553 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
554 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
555 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
556 reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
557 std::string(fs_name) + "\"";
558 mds->send_message(std::move(reply), m->get_connection());
559 return;
560 }
561
562 if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
563 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
564 } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
565 // close requests need to be handled when mds is active
566 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
567 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
568 return;
569 }
570 } else {
571 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
572 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
573 return;
574 }
575 }
576
577 if (logger)
578 logger->inc(l_mdss_handle_client_session);
579
580 uint64_t sseq = 0;
581 switch (m->get_op()) {
582 case CEPH_SESSION_REQUEST_OPEN:
583 if (session->is_opening() ||
584 session->is_open() ||
585 session->is_stale() ||
586 session->is_killing() ||
587 terminating_sessions) {
588 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
589 return;
590 }
591 ceph_assert(session->is_closed() || session->is_closing());
592
593 if (mds->is_stopping()) {
594 dout(10) << "mds is stopping, dropping open req" << dendl;
595 return;
596 }
597
598 {
599 auto& addr = session->info.inst.addr;
600 session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
601 auto& client_metadata = session->info.client_metadata;
602
603 auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
604 auto now = ceph_clock_now();
605 auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
606 auto elapsed = now - m->get_recv_stamp();
607 CachedStackStringStream css;
608 *css << "New client session:"
609 << " addr=\"" << session->info.inst.addr << "\""
610 << ",elapsed=" << elapsed
611 << ",throttled=" << throttle_elapsed
612 << ",status=\"" << status << "\"";
613 if (!err.empty()) {
614 *css << ",error=\"" << err << "\"";
615 }
616 const auto& metadata = session->info.client_metadata;
617 if (auto it = metadata.find("root"); it != metadata.end()) {
618 *css << ",root=\"" << it->second << "\"";
619 }
620 dout(2) << css->strv() << dendl;
621 };
622
623 auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str, unsigned flags=0) {
624 auto m = make_message<MClientSession>(CEPH_SESSION_REJECT, 0, flags);
625 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
626 m->metadata["error_string"] = err_str;
627 mds->send_message_client(m, session);
628 log_session_status("REJECTED", err_str);
629 };
630
631 bool blocklisted = mds->objecter->with_osdmap(
632 [&addr](const OSDMap &osd_map) -> bool {
633 return osd_map.is_blocklisted(addr);
634 });
635
636 if (blocklisted) {
637 dout(10) << "rejecting blocklisted client " << addr << dendl;
638 // This goes on the wire and the "blacklisted" substring is
639 // depended upon by the kernel client for detecting whether it
640 // has been blocklisted. If mounted with recover_session=clean
641 // (since 5.4), it tries to automatically recover itself from
642 // blocklisting.
643 unsigned flags = 0;
644 flags |= MClientSession::SESSION_BLOCKLISTED;
645 send_reject_message("blocklisted (blacklisted)", flags);
646 session->clear();
647 break;
648 }
649
650 if (client_metadata.features.empty())
651 infer_supported_features(session, client_metadata);
652
653 dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
654 dout(20) << " features: '" << client_metadata.features << "'" << dendl;
655 dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
656 for (const auto& p : client_metadata) {
657 dout(20) << " " << p.first << ": " << p.second << dendl;
658 }
659
660 feature_bitset_t missing_features = required_client_features;
661 missing_features -= client_metadata.features;
662 if (!missing_features.empty()) {
663 CachedStackStringStream css;
664 *css << "missing required features '" << missing_features << "'";
665 send_reject_message(css->strv());
666 mds->clog->warn() << "client session (" << session->info.inst
667 << ") lacks required features " << missing_features
668 << "; client supports " << client_metadata.features;
669 session->clear();
670 break;
671 }
672
673 // Special case for the 'root' metadata path; validate that the claimed
674 // root is actually within the caps of the session
675 if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
676 auto claimed_root = it->second;
677 CachedStackStringStream css;
678 bool denied = false;
679 // claimed_root has a leading "/" which we strip before passing
680 // into caps check
681 if (claimed_root.empty() || claimed_root[0] != '/') {
682 denied = true;
683 *css << "invalid root '" << claimed_root << "'";
684 } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
685 denied = true;
686 *css << "non-allowable root '" << claimed_root << "'";
687 }
688
689 if (denied) {
690 // Tell the client we're rejecting their open
691 send_reject_message(css->strv());
692 mds->clog->warn() << "client session with " << css->strv()
693 << " denied (" << session->info.inst << ")";
694 session->clear();
695 break;
696 }
697 }
698
699 if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
700 if (find_session_by_uuid(it->second)) {
701 send_reject_message("duplicated session uuid");
702 mds->clog->warn() << "client session with duplicated session uuid '"
703 << it->second << "' denied (" << session->info.inst << ")";
704 session->clear();
705 break;
706 }
707 }
708
709 if (session->is_closed()) {
710 mds->sessionmap.add_session(session);
711 }
712
713 pv = mds->sessionmap.mark_projected(session);
714 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
715 mds->sessionmap.touch_session(session);
716 auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
717 ceph_assert(r == 0);
718 log_session_status("ACCEPTED", "");
719 });
720 mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
721 new C_MDS_session_finish(this, session, sseq, true, pv, fin));
722 mdlog->flush();
723 }
724 break;
725
726 case CEPH_SESSION_REQUEST_RENEWCAPS:
727 if (session->is_open() || session->is_stale()) {
728 mds->sessionmap.touch_session(session);
729 if (session->is_stale()) {
730 mds->sessionmap.set_state(session, Session::STATE_OPEN);
731 mds->locker->resume_stale_caps(session);
732 mds->sessionmap.touch_session(session);
733 }
734 auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
735 mds->send_message_client(reply, session);
736 } else {
737 dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
738 }
739 break;
740
741 case CEPH_SESSION_REQUEST_CLOSE:
742 {
743 if (session->is_closed() ||
744 session->is_closing() ||
745 session->is_killing()) {
746 dout(10) << "already closed|closing|killing, dropping this req" << dendl;
747 return;
748 }
749 if (session->is_importing()) {
750 dout(10) << "ignoring close req on importing session" << dendl;
751 return;
752 }
753 ceph_assert(session->is_open() ||
754 session->is_stale() ||
755 session->is_opening());
756 if (m->get_seq() < session->get_push_seq()) {
757 dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
758 << ", dropping" << dendl;
759 return;
760 }
761 // We are getting a seq that is higher than expected.
762 // Handle it the same as any other seq error.
763 //
764 if (m->get_seq() != session->get_push_seq()) {
765 dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
766 << ", BUGGY!" << dendl;
767 mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
768 << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
769 return;
770 }
771 journal_close_session(session, Session::STATE_CLOSING, NULL);
772 }
773 break;
774
775 case CEPH_SESSION_FLUSHMSG_ACK:
776 finish_flush_session(session, m->get_seq());
777 break;
778
779 case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
780 if (mds->is_active())
781 mdlog->flush();
782 break;
783
784 default:
785 ceph_abort();
786 }
787 }
788
789 void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
790 if (!session->is_open() ||
791 !session->get_connection() ||
792 !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
793 return;
794 }
795
796 version_t seq = session->wait_for_flush(gather.new_sub());
797 mds->send_message_client(
798 make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
799 }
800
801 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
802 {
803 for (const auto& client : client_set) {
804 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
805 ceph_assert(session);
806 flush_session(session, gather);
807 }
808 }
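// Illustrative only (not lifted from a specific caller): the expected pattern
// for flushing a set of sessions and resuming once every FLUSHMSG_ACK has
// arrived is the usual gather idiom, e.g.
//
//   MDSGatherBuilder gather(g_ceph_context);
//   flush_client_sessions(clients, gather);
//   if (gather.has_subs()) {
//     gather.set_finisher(new MDSInternalContextWrapper(mds, on_done)); // 'on_done' is hypothetical
//     gather.activate();
//   }
//
// finish_flush_session() below is what wakes those sub-contexts when the ack
// for the corresponding seq comes back.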
809
810 void Server::finish_flush_session(Session *session, version_t seq)
811 {
812 MDSContext::vec finished;
813 session->finish_flush(seq, finished);
814 mds->queue_waiters(finished);
815 }
816
817 void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
818 const interval_set<inodeno_t>& inos_to_free, version_t piv,
819 const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
820 {
821 dout(10) << "_session_logged " << session->info.inst
822 << " state_seq " << state_seq
823 << " " << (open ? "open":"close") << " " << pv
824 << " inos_to_free " << inos_to_free << " inotablev " << piv
825 << " inos_to_purge " << inos_to_purge << dendl;
826
827 if (!open) {
828 if (inos_to_purge.size()){
829 ceph_assert(ls);
830 session->info.prealloc_inos.subtract(inos_to_purge);
831 ls->purging_inodes.insert(inos_to_purge);
832 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())
833 mdcache->purge_inodes(inos_to_purge, ls);
834 }
835
836 if (inos_to_free.size()) {
837 ceph_assert(piv);
838 ceph_assert(session->is_closing() || session->is_killing() ||
839 session->is_opening()); // re-open closing session
840 session->info.prealloc_inos.subtract(inos_to_free);
841 mds->inotable->apply_release_ids(inos_to_free);
842 ceph_assert(mds->inotable->get_version() == piv);
843 }
844 session->free_prealloc_inos = session->info.prealloc_inos;
845 session->delegated_inos.clear();
846 }
847
848 mds->sessionmap.mark_dirty(session);
849
850 // apply
851 if (session->get_state_seq() != state_seq) {
852 dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
853 << ", noop" << dendl;
854 // close must have been canceled (by an import?), or any number of other things..
855 } else if (open) {
856 ceph_assert(session->is_opening());
857 mds->sessionmap.set_state(session, Session::STATE_OPEN);
858 mds->sessionmap.touch_session(session);
859 metrics_handler->add_session(session);
860 ceph_assert(session->get_connection());
861 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
862 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
863 reply->supported_features = supported_features;
864 reply->metric_spec = supported_metric_spec;
865 }
866 mds->send_message_client(reply, session);
867 if (mdcache->is_readonly()) {
868 auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
869 mds->send_message_client(m, session);
870 }
871 } else if (session->is_closing() ||
872 session->is_killing()) {
873 // kill any lingering capabilities, leases, requests
874 bool killing = session->is_killing();
875 while (!session->caps.empty()) {
876 Capability *cap = session->caps.front();
877 CInode *in = cap->get_inode();
878 dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
879 mds->locker->remove_client_cap(in, cap, killing);
880 }
881 while (!session->leases.empty()) {
882 ClientLease *r = session->leases.front();
883 CDentry *dn = static_cast<CDentry*>(r->parent);
884 dout(20) << " killing client lease of " << *dn << dendl;
885 dn->remove_client_lease(r, mds->locker);
886 }
887 if (client_reconnect_gather.erase(session->info.get_client())) {
888 dout(20) << " removing client from reconnect set" << dendl;
889 if (client_reconnect_gather.empty()) {
890 dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
891 reconnect_gather_finish();
892 }
893 }
894 if (client_reclaim_gather.erase(session->info.get_client())) {
895 dout(20) << " removing client from reclaim set" << dendl;
896 if (client_reclaim_gather.empty()) {
897 dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
898 mds->maybe_clientreplay_done();
899 }
900 }
901
902 if (session->is_closing()) {
903 // mark con disposable. if there is a fault, we will get a
904 // reset and clean it up. if the client hasn't received the
905 // CLOSE message yet, they will reconnect and get an
906 // ms_handle_remote_reset() and realize they had in fact closed.
907 // do this *before* sending the message to avoid a possible
908 // race.
909 if (session->get_connection()) {
910 // Conditional because terminate_sessions will indiscriminately
911 // put sessions in CLOSING whether they ever had a conn or not.
912 session->get_connection()->mark_disposable();
913 }
914
915 // reset session
916 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
917 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
918 session->clear();
919 metrics_handler->remove_session(session);
920 mds->sessionmap.remove_session(session);
921 } else if (session->is_killing()) {
922 // destroy session, close connection
923 if (session->get_connection()) {
924 session->get_connection()->mark_down();
925 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
926 session->set_connection(nullptr);
927 }
928 metrics_handler->remove_session(session);
929 mds->sessionmap.remove_session(session);
930 } else {
931 ceph_abort();
932 }
933 } else {
934 ceph_abort();
935 }
936 }
937
938 /**
939 * Inject sessions from some source other than actual connections.
940 *
941 * For example:
942 * - sessions inferred from journal replay
943 * - sessions learned from other MDSs during rejoin
944 * - sessions learned from other MDSs during dir/caps migration
945 * - sessions learned from other MDSs during a cross-MDS rename
946 */
947 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
948 map<client_t,client_metadata_t>& cmm,
949 map<client_t, pair<Session*,uint64_t> >& smap)
950 {
951 version_t pv = mds->sessionmap.get_projected();
952
953 dout(10) << "prepare_force_open_sessions " << pv
954 << " on " << cm.size() << " clients"
955 << dendl;
956
957 mds->objecter->with_osdmap(
958 [this, &cm, &cmm](const OSDMap &osd_map) {
959 for (auto p = cm.begin(); p != cm.end(); ) {
960 if (osd_map.is_blocklisted(p->second.addr)) {
961 dout(10) << " ignoring blocklisted client." << p->first
962 << " (" << p->second.addr << ")" << dendl;
963 cmm.erase(p->first);
964 cm.erase(p++);
965 } else {
966 ++p;
967 }
968 }
969 });
970
971 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
972 Session *session = mds->sessionmap.get_or_add_session(p->second);
973 pv = mds->sessionmap.mark_projected(session);
974 uint64_t sseq;
975 if (session->is_closed() ||
976 session->is_closing() ||
977 session->is_killing()) {
978 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
979 auto q = cmm.find(p->first);
980 if (q != cmm.end())
981 session->info.client_metadata.merge(q->second);
982 } else {
983 ceph_assert(session->is_open() ||
984 session->is_opening() ||
985 session->is_stale());
986 sseq = 0;
987 }
988 smap[p->first] = make_pair(session, sseq);
989 session->inc_importing();
990 }
991 return pv;
992 }
993
994 void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
995 bool dec_import)
996 {
997 /*
998 * FIXME: need to carefully consider the race conditions between a
999 * client trying to close a session and an MDS doing an import
1000 * trying to force open a session...
1001 */
1002 dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
1003 << " initial v " << mds->sessionmap.get_version() << dendl;
1004
1005 for (auto &it : smap) {
1006 Session *session = it.second.first;
1007 uint64_t sseq = it.second.second;
1008 if (sseq > 0) {
1009 if (session->get_state_seq() != sseq) {
1010 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
1011 } else {
1012 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
1013 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1014 mds->sessionmap.touch_session(session);
1015 metrics_handler->add_session(session);
1016
1017 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1018 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
1019 reply->supported_features = supported_features;
1020 reply->metric_spec = supported_metric_spec;
1021 }
1022 mds->send_message_client(reply, session);
1023
1024 if (mdcache->is_readonly())
1025 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1026 }
1027 } else {
1028 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
1029 ceph_assert(session->is_open() || session->is_stale());
1030 }
1031
1032 if (dec_import) {
1033 session->dec_importing();
1034 }
1035
1036 mds->sessionmap.mark_dirty(session);
1037 }
1038
1039 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
1040 }
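// A rough sketch of how the prepare/finish pair above is meant to be driven
// (assumed from the interfaces here; the real callers live in the migration
// and rejoin paths):
//
//   map<client_t, pair<Session*, uint64_t>> smap;
//   version_t pv = prepare_force_open_sessions(client_map, client_metadata_map, smap);
//   // ... journal an event that carries client_map and pv ...
//   // once that event is safe:
//   finish_force_open_sessions(smap, true /* dec_import */);
//
// prepare marks the sessions projected/opening and bumps their importing
// count; finish flips them to OPEN (or leaves already-open ones alone) and
// marks the sessionmap dirty.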
1041
1042 class C_MDS_TerminatedSessions : public ServerContext {
1043 void finish(int r) override {
1044 server->terminating_sessions = false;
1045 }
1046 public:
1047 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
1048 };
1049
1050 void Server::terminate_sessions()
1051 {
1052 dout(5) << "terminating all sessions..." << dendl;
1053
1054 terminating_sessions = true;
1055
1056 // kill them off. clients will retry etc.
1057 set<Session*> sessions;
1058 mds->sessionmap.get_client_session_set(sessions);
1059 for (set<Session*>::const_iterator p = sessions.begin();
1060 p != sessions.end();
1061 ++p) {
1062 Session *session = *p;
1063 if (session->is_closing() ||
1064 session->is_killing() ||
1065 session->is_closed())
1066 continue;
1067 journal_close_session(session, Session::STATE_CLOSING, NULL);
1068 }
1069
1070 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
1071 }
1072
1073
1074 void Server::find_idle_sessions()
1075 {
1076 auto now = clock::now();
1077 auto last_cleared_laggy = mds->last_cleared_laggy();
1078
1079 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
1080
1081 // timeout/stale
1082 // (caps go stale, lease die)
1083 double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
1084 double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
1085
1086 // don't kick clients if we've been laggy
1087 if (last_cleared_laggy < cutoff) {
1088 dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
1089 << "), not marking any client stale" << dendl;
1090 return;
1091 }
1092
1093 std::vector<Session*> to_evict;
1094
1095 bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
1096 const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
1097 if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
1098 std::vector<Session*> new_stale;
1099
1100 for (auto session : *(sessions_p1->second)) {
1101 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1102 if (last_cap_renew_span < cutoff) {
1103 dout(20) << "laggiest active session is " << session->info.inst
1104 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1105 break;
1106 }
1107
1108 if (session->last_seen > session->last_cap_renew) {
1109 last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
1110 if (last_cap_renew_span < cutoff) {
1111 dout(20) << "laggiest active session is " << session->info.inst
1112 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1113 continue;
1114 }
1115 }
1116
1117 if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
1118 dout(20) << "evicting session " << session->info.inst << " since autoclose "
1119 "has arrived" << dendl;
1120 // evict session without marking it stale
1121 to_evict.push_back(session);
1122 continue;
1123 }
1124
1125 if (defer_session_stale &&
1126 !session->is_any_flush_waiter() &&
1127 !mds->locker->is_revoking_any_caps_from(session->get_client())) {
1128 dout(20) << "deferring marking session " << session->info.inst << " stale "
1129 "since it holds no caps" << dendl;
1130 continue;
1131 }
1132
1133 auto it = session->info.client_metadata.find("timeout");
1134 if (it != session->info.client_metadata.end()) {
1135 unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
1136 if (timeout == 0) {
1137 dout(10) << "skipping session " << session->info.inst
1138 << ", infinite timeout specified" << dendl;
1139 continue;
1140 }
1141 double cutoff = queue_max_age + timeout;
1142 if (last_cap_renew_span < cutoff) {
1143 dout(10) << "skipping session " << session->info.inst
1144 << ", timeout (" << timeout << ") specified"
1145 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1146 continue;
1147 }
1148
1149 // do not go through stale, evict it directly.
1150 to_evict.push_back(session);
1151 } else {
1152 dout(10) << "new stale session " << session->info.inst
1153 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1154 new_stale.push_back(session);
1155 }
1156 }
1157
1158 for (auto session : new_stale) {
1159 mds->sessionmap.set_state(session, Session::STATE_STALE);
1160 if (mds->locker->revoke_stale_caps(session)) {
1161 mds->locker->remove_stale_leases(session);
1162 finish_flush_session(session, session->get_push_seq());
1163 auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
1164 mds->send_message_client(m, session);
1165 } else {
1166 to_evict.push_back(session);
1167 }
1168 }
1169 }
1170
1171 // autoclose
1172 cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
1173
1174 // Collect a list of sessions exceeding the autoclose threshold
1175 const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
1176 if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
1177 for (auto session : *(sessions_p2->second)) {
1178 ceph_assert(session->is_stale());
1179 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1180 if (last_cap_renew_span < cutoff) {
1181 dout(20) << "oldest stale session is " << session->info.inst
1182 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
1183 break;
1184 }
1185 to_evict.push_back(session);
1186 }
1187 }
1188
1189 for (auto session: to_evict) {
1190 if (session->is_importing()) {
1191 dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
1192 continue;
1193 }
1194
1195 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1196 mds->clog->warn() << "evicting unresponsive client " << *session
1197 << ", after " << last_cap_renew_span << " seconds";
1198 dout(10) << "autoclosing stale session " << session->info.inst
1199 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1200
1201 if (g_conf()->mds_session_blocklist_on_timeout) {
1202 CachedStackStringStream css;
1203 mds->evict_client(session->get_client().v, false, true, *css, nullptr);
1204 } else {
1205 kill_session(session, NULL);
1206 }
1207 }
1208 }
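// Worked example for the cutoff arithmetic above (numbers are illustrative,
// not configuration defaults): with a dispatch-queue max age of 5s, a session
// timeout of 60s and an autoclose of 300s, a session that last renewed its
// caps 70s ago is past queue_max_age + session_timeout (65s) and becomes a
// candidate for STALE, while one idle for 310s is past queue_max_age +
// session_autoclose (305s) and is queued for eviction directly.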
1209
1210 void Server::evict_cap_revoke_non_responders() {
1211 if (!cap_revoke_eviction_timeout) {
1212 return;
1213 }
1214
1215 auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
1216
1217 for (auto const &client: to_evict) {
1218 mds->clog->warn() << "client id " << client << " has not responded to"
1219 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1220 << " seconds, evicting";
1221 dout(1) << __func__ << ": evicting cap revoke non-responder client id "
1222 << client << dendl;
1223
1224 CachedStackStringStream css;
1225 bool evicted = mds->evict_client(client.v, false,
1226 g_conf()->mds_session_blocklist_on_evict,
1227 *css, nullptr);
1228 if (evicted && logger) {
1229 logger->inc(l_mdss_cap_revoke_eviction);
1230 }
1231 }
1232 }
1233
1234 void Server::handle_conf_change(const std::set<std::string>& changed) {
1235 if (changed.count("mds_forward_all_requests_to_auth")){
1236 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
1237 }
1238 if (changed.count("mds_cap_revoke_eviction_timeout")) {
1239 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
1240 dout(20) << __func__ << " cap revoke eviction timeout changed to "
1241 << cap_revoke_eviction_timeout << dendl;
1242 }
1243 if (changed.count("mds_recall_max_decay_rate")) {
1244 recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
1245 }
1246 if (changed.count("mds_max_snaps_per_dir")) {
1247 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
1248 dout(20) << __func__ << " max snapshots per directory changed to "
1249 << max_snaps_per_dir << dendl;
1250 }
1251 if (changed.count("mds_client_delegate_inos_pct")) {
1252 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
1253 }
1254 if (changed.count("mds_max_caps_per_client")) {
1255 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1256 }
1257 if (changed.count("mds_session_cap_acquisition_throttle")) {
1258 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
1259 }
1260 if (changed.count("mds_session_max_caps_throttle_ratio")) {
1261 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
1262 }
1263 if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1264 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1265 }
1266 if (changed.count("mds_alternate_name_max")) {
1267 alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
1268 }
1269 if (changed.count("mds_dir_max_entries")) {
1270 dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
1271 dout(20) << __func__ << " max entries per directory changed to "
1272 << dir_max_entries << dendl;
1273 }
1274 if (changed.count("mds_bal_fragment_size_max")) {
1275 bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
1276 dout(20) << __func__ << " max fragment size changed to "
1277 << bal_fragment_size_max << dendl;
1278 }
1279 }
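// For reference, the options handled above can be changed at runtime, e.g.
// (illustrative command; the option name is one of those checked above):
//
//   ceph config set mds mds_cap_revoke_eviction_timeout 300
//
// The resulting config-observer notification is what ends up calling
// handle_conf_change() with the set of changed keys.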
1280
1281 /*
1282 * XXX bump in the interface here, not using an MDSContext here
1283 * because all the callers right now happen to use a SaferCond
1284 */
1285 void Server::kill_session(Session *session, Context *on_safe)
1286 {
1287 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
1288
1289 if ((session->is_opening() ||
1290 session->is_open() ||
1291 session->is_stale()) &&
1292 !session->is_importing()) {
1293 dout(10) << "kill_session " << session << dendl;
1294 journal_close_session(session, Session::STATE_KILLING, on_safe);
1295 } else {
1296 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
1297 if (session->is_closing() ||
1298 session->is_killing()) {
1299 if (on_safe)
1300 mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
1301 } else {
1302 ceph_assert(session->is_closed() ||
1303 session->is_importing());
1304 if (on_safe)
1305 on_safe->complete(0);
1306 }
1307 }
1308 }
1309
1310 size_t Server::apply_blocklist()
1311 {
1312 std::vector<Session*> victims;
1313 const auto& sessions = mds->sessionmap.get_sessions();
1314 mds->objecter->with_osdmap(
1315 [&](const OSDMap& o) {
1316 for (const auto& p : sessions) {
1317 if (!p.first.is_client()) {
1318 // Do not apply OSDMap blocklist to MDS daemons, we find out
1319 // about their death via MDSMap.
1320 continue;
1321 }
1322 if (o.is_blocklisted(p.second->info.inst.addr)) {
1323 victims.push_back(p.second);
1324 }
1325 }
1326 });
1327
1328 for (const auto& s : victims) {
1329 kill_session(s, nullptr);
1330 }
1331
1332 dout(10) << "apply_blocklist: killed " << victims.size() << dendl;
1333
1334 return victims.size();
1335 }
1336
1337 void Server::journal_close_session(Session *session, int state, Context *on_safe)
1338 {
1339 dout(10) << __func__ << " : "
1340 << session->info.inst
1341 << " pending_prealloc_inos " << session->pending_prealloc_inos
1342 << " free_prealloc_inos " << session->free_prealloc_inos
1343 << " delegated_inos " << session->delegated_inos << dendl;
1344
1345 uint64_t sseq = mds->sessionmap.set_state(session, state);
1346 version_t pv = mds->sessionmap.mark_projected(session);
1347 version_t piv = 0;
1348
1349 // release alloc and pending-alloc inos for this session
1350 // and wipe out session state, in case the session close aborts for some reason
1351 interval_set<inodeno_t> inos_to_free;
1352 inos_to_free.insert(session->pending_prealloc_inos);
1353 inos_to_free.insert(session->free_prealloc_inos);
1354 if (inos_to_free.size()) {
1355 mds->inotable->project_release_ids(inos_to_free);
1356 piv = mds->inotable->get_projected_version();
1357 } else
1358 piv = 0;
1359
1360 auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
1361 auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
1362 session->delegated_inos, mdlog->get_current_segment(), on_safe);
1363 mdlog->start_submit_entry(le, fin);
1364 mdlog->flush();
1365
1366 // clean up requests, too
1367 while(!session->requests.empty()) {
1368 auto mdr = MDRequestRef(*session->requests.begin());
1369 mdcache->request_kill(mdr);
1370 }
1371
1372 finish_flush_session(session, session->get_push_seq());
1373 }
1374
1375 void Server::reconnect_clients(MDSContext *reconnect_done_)
1376 {
1377 reconnect_done = reconnect_done_;
1378
1379 auto now = clock::now();
1380 set<Session*> sessions;
1381 mds->sessionmap.get_client_session_set(sessions);
1382 for (auto session : sessions) {
1383 if (session->is_open()) {
1384 client_reconnect_gather.insert(session->get_client());
1385 session->set_reconnecting(true);
1386 session->last_cap_renew = now;
1387 }
1388 }
1389
1390 if (client_reconnect_gather.empty()) {
1391 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
1392 reconnect_gather_finish();
1393 return;
1394 }
1395
1396 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1397
1398 reconnect_start = now;
1399 dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
1400 mds->sessionmap.dump();
1401 }
1402
1403 void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
1404 {
1405 dout(7) << "handle_client_reconnect " << m->get_source()
1406 << (m->has_more() ? " (more)" : "") << dendl;
1407 client_t from = m->get_source().num();
1408 Session *session = mds->get_session(m);
1409 if (!session) {
1410 dout(0) << " ignoring sessionless msg " << *m << dendl;
1411 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
1412 reply->metadata["error_string"] = "sessionless";
1413 mds->send_message(reply, m->get_connection());
1414 return;
1415 }
1416
1417 if (!session->is_open()) {
1418 dout(0) << " ignoring msg from not-open session" << *m << dendl;
1419 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1420 mds->send_message(reply, m->get_connection());
1421 return;
1422 }
1423
1424 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1425
1426 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1427 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1428 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1429 return;
1430 }
1431
1432 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
1433 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1434
1435 bool deny = false;
1436 if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
1437 // XXX maybe in the future we can do better than this?
1438 if (reconnect_all_deny) {
1439 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
1440 } else {
1441 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1442 }
1443 mds->clog->info() << "denied reconnect attempt (mds is "
1444 << ceph_mds_state_name(mds->get_state())
1445 << ") from " << m->get_source_inst()
1446 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
1447 deny = true;
1448 } else {
1449 std::string error_str;
1450 if (!session->is_open()) {
1451 error_str = "session is closed";
1452 } else if (mdcache->is_readonly()) {
1453 error_str = "mds is readonly";
1454 } else {
1455 if (session->info.client_metadata.features.empty())
1456 infer_supported_features(session, session->info.client_metadata);
1457
1458 feature_bitset_t missing_features = required_client_features;
1459 missing_features -= session->info.client_metadata.features;
1460 if (!missing_features.empty()) {
1461 CachedStackStringStream css;
1462 *css << "missing required features '" << missing_features << "'";
1463 error_str = css->strv();
1464 }
1465 }
1466
1467 if (!error_str.empty()) {
1468 deny = true;
1469 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1470 mds->clog->info() << "denied reconnect attempt from "
1471 << m->get_source_inst() << " (" << error_str << ")";
1472 }
1473 }
1474
1475 if (deny) {
1476 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1477 mds->send_message_client(r, session);
1478 if (session->is_open()) {
1479 client_reconnect_denied.insert(session->get_client());
1480 }
1481 return;
1482 }
1483
1484 if (!m->has_more()) {
1485 metrics_handler->add_session(session);
1486 // notify client of success with an OPEN
1487 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1488 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
1489 reply->supported_features = supported_features;
1490 reply->metric_spec = supported_metric_spec;
1491 }
1492 mds->send_message_client(reply, session);
1493 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1494 }
1495
1496 session->last_cap_renew = clock::now();
1497
1498 // snaprealms
1499 for (const auto &r : m->realms) {
1500 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
1501 if (in && in->state_test(CInode::STATE_PURGING))
1502 continue;
1503 if (in) {
1504 if (in->snaprealm) {
1505 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
1506 } else {
1507 // this can happen if we are non-auth or we rollback snaprealm
1508 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
1509 }
1510 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1511 } else {
1512 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1513 << " seq " << r.realm.seq << dendl;
1514 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1515 }
1516 }
1517
1518 // caps
1519 for (const auto &p : m->caps) {
1520 // make sure our last_cap_id is MAX over all issued caps
1521 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1522 mdcache->last_cap_id = p.second.capinfo.cap_id;
1523
1524 CInode *in = mdcache->get_inode(p.first);
1525 if (in && in->state_test(CInode::STATE_PURGING))
1526 continue;
1527 if (in && in->is_auth()) {
1528 // we recovered it, and it's ours. take note.
1529 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
1530 << " on " << *in << dendl;
1531 in->reconnect_cap(from, p.second, session);
1532 mdcache->add_reconnected_cap(from, p.first, p.second);
1533 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
1534 continue;
1535 }
1536
1537 if (in && !in->is_auth()) {
1538 // not mine.
1539 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1540 // add to cap export list.
1541 mdcache->rejoin_export_caps(p.first, from, p.second,
1542 in->authority().first, true);
1543 } else {
1544 // don't know if the inode is mine
1545 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1546 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
1547 }
1548 }
1549
1550 reconnect_last_seen = clock::now();
1551
1552 if (!m->has_more()) {
1553 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1554
1555 // remove from gather set
1556 client_reconnect_gather.erase(from);
1557 session->set_reconnecting(false);
1558 if (client_reconnect_gather.empty())
1559 reconnect_gather_finish();
1560 }
1561 }
1562
1563 void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1564 {
1565 int supported = -1;
1566 auto it = client_metadata.find("ceph_version");
1567 if (it != client_metadata.end()) {
1568 // user space client
1569 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1570 supported = CEPHFS_FEATURE_LUMINOUS;
1571 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1572 supported = CEPHFS_FEATURE_KRAKEN;
1573 } else {
1574 it = client_metadata.find("kernel_version");
1575 if (it != client_metadata.end()) {
1576 // kernel client
1577 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1578 supported = CEPHFS_FEATURE_LUMINOUS;
1579 }
1580 }
1581 if (supported == -1 &&
1582 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1583 supported = CEPHFS_FEATURE_JEWEL;
1584
1585 if (supported >= 0) {
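// Descriptive note (added; not in the original source): the expression below
// builds a mask with bits [0, supported] set, i.e. the client is assumed to
// have every feature up to and including 'supported'; e.g. supported == 2
// gives (1UL << 3) - 1 == 0b111.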
1586 unsigned long value = (1UL << (supported + 1)) - 1;
1587 client_metadata.features = feature_bitset_t(value);
1588 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1589 }
1590 }
1591
1592 void Server::update_required_client_features()
1593 {
1594 required_client_features = mds->mdsmap->get_required_client_features();
1595 dout(7) << "required_client_features: " << required_client_features << dendl;
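// Hedged operational note (not in the original source): the map-level
// requirement read above is typically managed by the operator, e.g. with a
// command along the lines of
//   ceph fs required_client_features <fs_name> add <feature>
// though the exact CLI may vary between releases.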
1596
1597 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1598 set<Session*> sessions;
1599 mds->sessionmap.get_client_session_set(sessions);
1600 for (auto session : sessions) {
1601 feature_bitset_t missing_features = required_client_features;
1602 missing_features -= session->info.client_metadata.features;
1603 if (!missing_features.empty()) {
1604 bool blocklisted = mds->objecter->with_osdmap(
1605 [session](const OSDMap &osd_map) -> bool {
1606 return osd_map.is_blocklisted(session->info.inst.addr);
1607 });
1608 if (blocklisted)
1609 continue;
1610
1611 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1612 << missing_features << "'";
1613 CachedStackStringStream css;
1614 mds->evict_client(session->get_client().v, false,
1615 g_conf()->mds_session_blocklist_on_evict, *css);
1616 }
1617 }
1618 }
1619 }
1620
1621 void Server::reconnect_gather_finish()
1622 {
1623 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
1624 ceph_assert(reconnect_done);
1625
1626 if (!mds->snapclient->is_synced()) {
1627 // make sure snaptable cache is populated. snaprealms will be
1628 // extensively used in rejoin stage.
1629 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1630 mds->snapclient->wait_for_sync(reconnect_done);
1631 } else {
1632 reconnect_done->complete(0);
1633 }
1634 reconnect_done = NULL;
1635 }
1636
1637 void Server::reconnect_tick()
1638 {
1639 bool reject_all_reconnect = false;
1640 if (reconnect_evicting) {
1641 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
1642 return;
1643 }
1644
1645 /*
1646 * Set mds_deny_all_reconnect to reject all reconnect requests, so that
1647 * less metadata is loaded in the rejoin phase. This shortens the reboot time.
1648 * Moreover, loading less metadata increases the chance that a standby with less memory can fail over.
1649 *
1650 * Why not shorten the reconnect period instead?
1651 * Clients may send unsafe or retried requests, which were not
1652 * completed before the old mds stopped, to the new mds. These requests may
1653 * need to be processed during the new mds's clientreplay phase,
1654 * see: https://github.com/ceph/ceph/pull/29059.
1655 */
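/*
 * Hedged illustration (not in the original source): an operator could enable
 * this behaviour with something like
 *   ceph config set mds mds_deny_all_reconnect true
 * after which reconnect attempts are answered with CEPH_SESSION_CLOSE in the
 * reconnect handler above.
 */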
1656 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1657 if (client_reconnect_gather.empty())
1658 return;
1659
1660 if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
1661 reject_all_reconnect = true;
1662
1663 auto now = clock::now();
1664 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
1665 if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
1666 return;
1667
1668 vector<Session*> remaining_sessions;
1669 remaining_sessions.reserve(client_reconnect_gather.size());
1670 for (auto c : client_reconnect_gather) {
1671 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1672 ceph_assert(session);
1673 remaining_sessions.push_back(session);
1674 // client re-sends cap flush messages before the reconnect message
1675 if (session->last_seen > reconnect_last_seen)
1676 reconnect_last_seen = session->last_seen;
1677 }
1678
1679 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
1680 if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
1681 dout(7) << "reconnect_tick: last seen " << elapse2
1682 << " seconds ago, extending reconnect interval" << dendl;
1683 return;
1684 }
1685
1686 dout(7) << "reconnect timed out, " << remaining_sessions.size()
1687 << " clients have not reconnected in time" << dendl;
1688
1689 // If we're doing blocklist evictions, use this to wait for them before
1690 // proceeding to reconnect_gather_finish
1691 MDSGatherBuilder gather(g_ceph_context);
1692
1693 for (auto session : remaining_sessions) {
1694 // Keep sessions that have a specified timeout. These sessions will prevent
1695 // the mds from going active. The MDS goes active after they have all been
1696 // killed or reclaimed.
1697 if (session->info.client_metadata.find("timeout") !=
1698 session->info.client_metadata.end()) {
1699 dout(1) << "reconnect keeps " << session->info.inst
1700 << ", need to be reclaimed" << dendl;
1701 client_reclaim_gather.insert(session->get_client());
1702 continue;
1703 }
1704
1705 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
1706
1707 mds->clog->warn() << "evicting unresponsive client " << *session
1708 << ", after waiting " << elapse1
1709 << " seconds during MDS startup";
1710
1711 // make _session_logged() purge orphan objects of lost async/unsafe requests
1712 session->delegated_inos.swap(session->free_prealloc_inos);
1713
1714 if (g_conf()->mds_session_blocklist_on_timeout) {
1715 CachedStackStringStream css;
1716 mds->evict_client(session->get_client().v, false, true, *css,
1717 gather.new_sub());
1718 } else {
1719 kill_session(session, NULL);
1720 }
1721
1722 failed_reconnects++;
1723 }
1724 client_reconnect_gather.clear();
1725 client_reconnect_denied.clear();
1726
1727 if (gather.has_subs()) {
1728 dout(1) << "reconnect will complete once clients are evicted" << dendl;
1729 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
1730 [this](int r){reconnect_gather_finish();})));
1731 gather.activate();
1732 reconnect_evicting = true;
1733 } else {
1734 reconnect_gather_finish();
1735 }
1736 }
1737
1738 void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1739 {
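// Layout of the 'locks' bufferlist as consumed below: a count of fcntl
// (POSIX) locks followed by that many ceph_filelock records, then a count of
// flock locks followed by that many ceph_filelock records.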
1740 if (!locks.length()) return;
1741 int numlocks;
1742 ceph_filelock lock;
1743 auto p = locks.cbegin();
1744 decode(numlocks, p);
1745 for (int i = 0; i < numlocks; ++i) {
1746 decode(lock, p);
1747 lock.client = client;
1748 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1749 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1750 }
1751 decode(numlocks, p);
1752 for (int i = 0; i < numlocks; ++i) {
1753 decode(lock, p);
1754 lock.client = client;
1755 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1756 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1757 }
1758 }
1759
1760 /**
1761 * Call this when the MDCache is oversized, to send requests to the clients
1762 * to trim some caps, and consequently unpin some inodes in the MDCache so
1763 * that it can trim too.
1764 */
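/*
 * Hedged usage sketch (hypothetical caller, not from this file): a cache
 * pressure tick might invoke this roughly as
 *
 *   MDSGatherBuilder gather(g_ceph_context);
 *   auto [throttled, recalled] =
 *     recall_client_state(&gather, Server::RecallFlags::ENFORCE_MAX);
 *   if (gather.has_subs())
 *     gather.activate();
 *
 * where ENFORCE_MAX limits recall to sessions above mds_max_caps_per_client.
 */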
1765 std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1766 {
1767 const auto now = clock::now();
1768 const bool steady = !!(flags&RecallFlags::STEADY);
1769 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1770 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1771 const bool trim = !!(flags&RecallFlags::TRIM);
1772
1773 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1774 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1775 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1776 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1777 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
1778 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
1779
1780 dout(7) << __func__ << ":"
1781 << " min=" << min_caps_per_client
1782 << " max=" << max_caps_per_client
1783 << " total=" << Capability::count()
1784 << " flags=" << flags
1785 << dendl;
1786
1787 /* trim caps of sessions with the most caps first */
1788 std::multimap<uint64_t, Session*> caps_session;
1789 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
1790 auto num_caps = s->caps.size();
1791 auto cache_liveness = s->get_session_cache_liveness();
1792 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
1793 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1794 }
1795 };
1796 mds->sessionmap.get_client_sessions(std::move(f));
1797
1798 std::pair<bool, uint64_t> result = {false, 0};
1799 auto& [throttled, caps_recalled] = result;
1800 last_recall_state = now;
1801 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
1802 if (!session->is_open() ||
1803 !session->get_connection() ||
1804 !session->info.inst.name.is_client())
1805 continue;
1806
1807 dout(10) << __func__ << ":"
1808 << " session " << session->info.inst
1809 << " caps " << num_caps
1810 << ", leases " << session->leases.size()
1811 << dendl;
1812
1813 uint64_t newlim;
1814 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1815 newlim = min_caps_per_client;
1816 } else {
1817 newlim = num_caps-recall_max_caps;
1818 }
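// Worked example with illustrative (non-default) numbers: if num_caps == 20000,
// recall_max_caps == 5000 and min_caps_per_client == 100, then
// newlim = 20000 - 5000 = 15000, i.e. the client is asked to drop to 15000 caps.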
1819 if (num_caps > newlim) {
1820 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1821 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1822 newlim = num_caps-recall;
1823 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1824 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1825 const uint64_t global_recall_throttle = recall_throttle.get();
1826 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1827 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1828 throttled = true;
1829 continue;
1830 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1831 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1832 throttled = true;
1833 continue;
1834 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1835 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1836 throttled = true;
1837 break;
1838 }
1839
1840 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1841 if (steady) {
1842 const auto session_recall = session->get_recall_caps();
1843 const auto session_release = session->get_release_caps();
1844 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1845 /* The session has been unable to keep up with the number of caps
1846 * recalled (by half); additionally, to prevent marking sessions
1847 * we've just begun to recall from, the session_recall counter
1848 * (decayed count of caps recently recalled) is **greater** than the
1849 * session threshold for the session's cap recall throttle.
1850 */
1851 dout(15) << " 2*session_release < session_recall"
1852 " (2*" << session_release << " < " << session_recall << ") &&"
1853 " 2*session_recall < recall_max_decay_threshold"
1854 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
1855 " Skipping because we are unlikely to get more released." << dendl;
1856 continue;
1857 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1858 /* The number of caps to recall is less than the number we *could*
1859 * recall (so there isn't much left to recall?) and it is less than
1860 * half of the current recall_caps counter (decayed count of caps
1861 * recently recalled).
1862 */
1863 dout(15) << " 2*recall < session_recall "
1864 " (2*" << recall << " < " << session_recall << ") &&"
1865 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1866 " Skipping because we are unlikely to get more released." << dendl;
1867 continue;
1868 }
1869 }
1870
1871 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1872
1873 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
1874 m->head.max_caps = newlim;
1875 mds->send_message_client(m, session);
1876 if (gather) {
1877 flush_session(session, *gather);
1878 }
1879 caps_recalled += session->notify_recall_sent(newlim);
1880 recall_throttle.hit(recall);
1881 }
1882 }
1883
1884 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1885
1886 return result;
1887 }
1888
1889 void Server::force_clients_readonly()
1890 {
1891 dout(10) << "force_clients_readonly" << dendl;
1892 set<Session*> sessions;
1893 mds->sessionmap.get_client_session_set(sessions);
1894 for (set<Session*>::const_iterator p = sessions.begin();
1895 p != sessions.end();
1896 ++p) {
1897 Session *session = *p;
1898 if (!session->info.inst.name.is_client() ||
1899 !(session->is_open() || session->is_stale()))
1900 continue;
1901 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1902 }
1903 }
1904
1905 /*******
1906 * some generic stuff for finishing off requests
1907 */
1908 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1909 {
1910 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1911 ceph_assert(!mdr->has_completed);
1912
1913 // note trace items for eventual reply.
1914 mdr->tracei = in;
1915 if (in)
1916 mdr->pin(in);
1917
1918 mdr->tracedn = dn;
1919 if (dn)
1920 mdr->pin(dn);
1921
1922 early_reply(mdr, in, dn);
1923
1924 mdr->committing = true;
1925 submit_mdlog_entry(le, fin, mdr, __func__);
1926
1927 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1928 if (mds->queue_one_replay()) {
1929 dout(10) << " queued next replay op" << dendl;
1930 } else {
1931 dout(10) << " journaled last replay op" << dendl;
1932 }
1933 } else if (mdr->did_early_reply)
1934 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1935 else
1936 mdlog->flush();
1937 }
1938
1939 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1940 std::string_view event)
1941 {
1942 if (mdr) {
1943 string event_str("submit entry: ");
1944 event_str += event;
1945 mdr->mark_event(event_str);
1946 }
1947 mdlog->submit_entry(le, fin);
1948 }
1949
1950 /*
1951 * send response built from mdr contents and error code; clean up mdr
1952 */
1953 void Server::respond_to_request(MDRequestRef& mdr, int r)
1954 {
1955 if (mdr->client_request) {
1956 if (mdr->is_batch_head()) {
1957 dout(20) << __func__ << " batch head " << *mdr << dendl;
1958 mdr->release_batch_op()->respond(r);
1959 } else {
1960 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
1961 }
1962 } else if (mdr->internal_op > -1) {
1963 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1964 if (!mdr->internal_op_finish)
1965 ceph_abort_msg("trying to respond to internal op without finisher");
1966 mdr->internal_op_finish->complete(r);
1967 mdcache->request_finish(mdr);
1968 }
1969 }
1970
1971 // gather per-op mds request count and latency statistics
1972 void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
1973 {
1974 int code = l_mdss_first;
1975 switch(req->get_op()) {
1976 case CEPH_MDS_OP_LOOKUPHASH:
1977 code = l_mdss_req_lookuphash_latency;
1978 break;
1979 case CEPH_MDS_OP_LOOKUPINO:
1980 code = l_mdss_req_lookupino_latency;
1981 break;
1982 case CEPH_MDS_OP_LOOKUPPARENT:
1983 code = l_mdss_req_lookupparent_latency;
1984 break;
1985 case CEPH_MDS_OP_LOOKUPNAME:
1986 code = l_mdss_req_lookupname_latency;
1987 break;
1988 case CEPH_MDS_OP_LOOKUP:
1989 code = l_mdss_req_lookup_latency;
1990 break;
1991 case CEPH_MDS_OP_LOOKUPSNAP:
1992 code = l_mdss_req_lookupsnap_latency;
1993 break;
1994 case CEPH_MDS_OP_GETATTR:
1995 code = l_mdss_req_getattr_latency;
1996 break;
1997 case CEPH_MDS_OP_SETATTR:
1998 code = l_mdss_req_setattr_latency;
1999 break;
2000 case CEPH_MDS_OP_SETLAYOUT:
2001 code = l_mdss_req_setlayout_latency;
2002 break;
2003 case CEPH_MDS_OP_SETDIRLAYOUT:
2004 code = l_mdss_req_setdirlayout_latency;
2005 break;
2006 case CEPH_MDS_OP_GETVXATTR:
2007 code = l_mdss_req_getvxattr_latency;
2008 break;
2009 case CEPH_MDS_OP_SETXATTR:
2010 code = l_mdss_req_setxattr_latency;
2011 break;
2012 case CEPH_MDS_OP_RMXATTR:
2013 code = l_mdss_req_rmxattr_latency;
2014 break;
2015 case CEPH_MDS_OP_READDIR:
2016 code = l_mdss_req_readdir_latency;
2017 break;
2018 case CEPH_MDS_OP_SETFILELOCK:
2019 code = l_mdss_req_setfilelock_latency;
2020 break;
2021 case CEPH_MDS_OP_GETFILELOCK:
2022 code = l_mdss_req_getfilelock_latency;
2023 break;
2024 case CEPH_MDS_OP_CREATE:
2025 code = l_mdss_req_create_latency;
2026 break;
2027 case CEPH_MDS_OP_OPEN:
2028 code = l_mdss_req_open_latency;
2029 break;
2030 case CEPH_MDS_OP_MKNOD:
2031 code = l_mdss_req_mknod_latency;
2032 break;
2033 case CEPH_MDS_OP_LINK:
2034 code = l_mdss_req_link_latency;
2035 break;
2036 case CEPH_MDS_OP_UNLINK:
2037 code = l_mdss_req_unlink_latency;
2038 break;
2039 case CEPH_MDS_OP_RMDIR:
2040 code = l_mdss_req_rmdir_latency;
2041 break;
2042 case CEPH_MDS_OP_RENAME:
2043 code = l_mdss_req_rename_latency;
2044 break;
2045 case CEPH_MDS_OP_MKDIR:
2046 code = l_mdss_req_mkdir_latency;
2047 break;
2048 case CEPH_MDS_OP_SYMLINK:
2049 code = l_mdss_req_symlink_latency;
2050 break;
2051 case CEPH_MDS_OP_LSSNAP:
2052 code = l_mdss_req_lssnap_latency;
2053 break;
2054 case CEPH_MDS_OP_MKSNAP:
2055 code = l_mdss_req_mksnap_latency;
2056 break;
2057 case CEPH_MDS_OP_RMSNAP:
2058 code = l_mdss_req_rmsnap_latency;
2059 break;
2060 case CEPH_MDS_OP_RENAMESNAP:
2061 code = l_mdss_req_renamesnap_latency;
2062 break;
2063 default:
2064 dout(1) << ": unknown client op" << dendl;
2065 return;
2066 }
2067 logger->tinc(code, lat);
2068 }
2069
2070 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
2071 {
2072 if (!g_conf()->mds_early_reply)
2073 return;
2074
2075 if (mdr->no_early_reply) {
2076 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
2077 return;
2078 }
2079
2080 if (mdr->has_more() && mdr->more()->has_journaled_peers) {
2081 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
2082 return;
2083 }
2084
2085 if (mdr->alloc_ino) {
2086 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2087 return;
2088 }
2089
2090 const cref_t<MClientRequest> &req = mdr->client_request;
2091 entity_inst_t client_inst = req->get_source_inst();
2092 if (client_inst.name.is_mds())
2093 return;
2094
2095 if (req->is_replay()) {
2096 dout(10) << " no early reply on replay op" << dendl;
2097 return;
2098 }
2099
2100
2101 auto reply = make_message<MClientReply>(*req, 0);
2102 reply->set_unsafe();
2103
2104 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2105 //
2106 // _rename_finish() does not send dentry link/unlink messages to replicas,
2107 // so do not mark xlocks on dentries "done"; the xlocks prevent dentries
2108 // that have projected linkages from getting new replicas.
2109 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2110
2111 dout(10) << "early_reply " << reply->get_result()
2112 << " (" << cpp_strerror(reply->get_result())
2113 << ") " << *req << dendl;
2114
2115 if (tracei || tracedn) {
2116 if (tracei)
2117 mdr->cap_releases.erase(tracei->vino());
2118 if (tracedn)
2119 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2120
2121 set_trace_dist(reply, tracei, tracedn, mdr);
2122 }
2123
2124 reply->set_extra_bl(mdr->reply_extra_bl);
2125 mds->send_message_client(reply, mdr->session);
2126
2127 mdr->did_early_reply = true;
2128
2129 mds->logger->inc(l_mds_reply);
2130 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2131 mds->logger->tinc(l_mds_reply_latency, lat);
2132 if (lat >= g_conf()->mds_op_complaint_time) {
2133 mds->logger->inc(l_mds_slow_reply);
2134 }
2135 if (client_inst.name.is_client()) {
2136 mds->sessionmap.hit_session(mdr->session);
2137 }
2138 perf_gather_op_latency(req, lat);
2139 dout(20) << "lat " << lat << dendl;
2140
2141 mdr->mark_event("early_replied");
2142 }
2143
2144 /*
2145 * send given reply
2146 * include a trace to tracei
2147 * Clean up mdr
2148 */
2149 void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
2150 {
2151 ceph_assert(mdr.get());
2152 const cref_t<MClientRequest> &req = mdr->client_request;
2153
2154 dout(7) << "reply_client_request " << reply->get_result()
2155 << " (" << cpp_strerror(reply->get_result())
2156 << ") " << *req << dendl;
2157
2158 mdr->mark_event("replying");
2159
2160 Session *session = mdr->session;
2161
2162 // note successful request in session map?
2163 //
2164 // setfilelock requests are special: they only modify state in MDS memory.
2165 // That state is lost when the MDS fails. If a client re-sends a completed
2166 // setfilelock request, it means the client did not receive the corresponding
2167 // setfilelock reply, so the MDS should re-execute the setfilelock request.
2168 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2169 reply->get_result() == 0 && session) {
2170 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2171 session->add_completed_request(mdr->reqid.tid, created);
2172 if (mdr->ls) {
2173 mdr->ls->touched_sessions.insert(session->info.inst.name);
2174 }
2175 }
2176
2177 // give any preallocated inos to the session
2178 apply_allocated_inos(mdr, session);
2179
2180 // get tracei/tracedn from mdr?
2181 CInode *tracei = mdr->tracei;
2182 CDentry *tracedn = mdr->tracedn;
2183
2184 bool is_replay = mdr->client_request->is_replay();
2185 bool did_early_reply = mdr->did_early_reply;
2186 entity_inst_t client_inst = req->get_source_inst();
2187
2188 if (!did_early_reply && !is_replay) {
2189
2190 mds->logger->inc(l_mds_reply);
2191 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2192 mds->logger->tinc(l_mds_reply_latency, lat);
2193 if (lat >= g_conf()->mds_op_complaint_time) {
2194 mds->logger->inc(l_mds_slow_reply);
2195 }
2196 if (session && client_inst.name.is_client()) {
2197 mds->sessionmap.hit_session(session);
2198 }
2199 perf_gather_op_latency(req, lat);
2200 dout(20) << "lat " << lat << dendl;
2201
2202 if (tracei)
2203 mdr->cap_releases.erase(tracei->vino());
2204 if (tracedn)
2205 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2206 }
2207
2208 // drop non-rdlocks before replying, so that we can issue leases
2209 mdcache->request_drop_non_rdlocks(mdr);
2210
2211 // reply at all?
2212 if (session && !client_inst.name.is_mds()) {
2213 // send reply.
2214 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2215 (tracei || tracedn)) {
2216 if (is_replay) {
2217 if (tracei)
2218 mdcache->try_reconnect_cap(tracei, session);
2219 } else {
2220 // include metadata in reply
2221 set_trace_dist(reply, tracei, tracedn, mdr);
2222 }
2223 }
2224
2225 // We can set the extra bl unconditionally: if it's already been sent in the
2226 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2227 reply->set_extra_bl(mdr->reply_extra_bl);
2228
2229 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
2230 mds->send_message_client(reply, session);
2231 }
2232
2233 if (req->is_queued_for_replay() &&
2234 (mdr->has_completed || reply->get_result() < 0)) {
2235 if (reply->get_result() < 0) {
2236 int r = reply->get_result();
2237 derr << "reply_client_request: failed to replay " << *req
2238 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2239 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2240 }
2241 mds->queue_one_replay();
2242 }
2243
2244 // clean up request
2245 mdcache->request_finish(mdr);
2246
2247 // take a closer look at tracei, if it happens to be a remote link
2248 if (tracei &&
2249 tracedn &&
2250 tracedn->get_projected_linkage()->is_remote()) {
2251 mdcache->eval_remote(tracedn);
2252 }
2253 }
2254
2255 /*
2256 * pass inode OR dentry (not both, or we may get confused)
2257 *
2258 * trace is in reverse order (i.e. root inode comes last)
2259 */
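// Sketch of the trace built below (when both a dentry and the target inode are
// present): the snap trace for the realm goes in reply->snapbl, and 'bl' gets
// the parent dir inode stat, the dirstat, the dentry name plus lease, and
// finally the target inode stat.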
2260 void Server::set_trace_dist(const ref_t<MClientReply> &reply,
2261 CInode *in, CDentry *dn,
2262 MDRequestRef& mdr)
2263 {
2264 // skip doing this for debugging purposes?
2265 if (g_conf()->mds_inject_traceless_reply_probability &&
2266 mdr->ls && !mdr->o_trunc &&
2267 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2268 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2269 return;
2270 }
2271
2272 // inode, dentry, dir, ..., inode
2273 bufferlist bl;
2274 mds_rank_t whoami = mds->get_nodeid();
2275 Session *session = mdr->session;
2276 snapid_t snapid = mdr->snapid;
2277 utime_t now = ceph_clock_now();
2278
2279 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2280
2281 // realm
2282 if (snapid == CEPH_NOSNAP) {
2283 SnapRealm *realm;
2284 if (in)
2285 realm = in->find_snaprealm();
2286 else
2287 realm = dn->get_dir()->get_inode()->find_snaprealm();
2288 reply->snapbl = realm->get_snap_trace();
2289 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2290 }
2291
2292 // dir + dentry?
2293 if (dn) {
2294 reply->head.is_dentry = 1;
2295 CDir *dir = dn->get_dir();
2296 CInode *diri = dir->get_inode();
2297
2298 diri->encode_inodestat(bl, session, NULL, snapid);
2299 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2300
2301 #ifdef MDS_VERIFY_FRAGSTAT
2302 if (dir->is_complete())
2303 dir->verify_fragstat();
2304 #endif
2305 DirStat ds;
2306 ds.frag = dir->get_frag();
2307 ds.auth = dir->get_dir_auth().first;
2308 if (dir->is_auth() && !forward_all_requests_to_auth)
2309 dir->get_dist_spec(ds.dist, whoami);
2310
2311 dir->encode_dirstat(bl, session->info, ds);
2312 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2313
2314 encode(dn->get_name(), bl);
2315 mds->locker->issue_client_lease(dn, in, mdr, now, bl);
2316 } else
2317 reply->head.is_dentry = 0;
2318
2319 // inode
2320 if (in) {
2321 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2322 dout(20) << "set_trace_dist added in " << *in << dendl;
2323 reply->head.is_target = 1;
2324 } else
2325 reply->head.is_target = 0;
2326
2327 reply->set_trace(bl);
2328 }
2329
2330 void Server::handle_client_request(const cref_t<MClientRequest> &req)
2331 {
2332 dout(4) << "handle_client_request " << *req << dendl;
2333
2334 if (mds->logger)
2335 mds->logger->inc(l_mds_request);
2336 if (logger)
2337 logger->inc(l_mdss_handle_client_request);
2338
2339 if (!mdcache->is_open()) {
2340 dout(5) << "waiting for root" << dendl;
2341 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2342 return;
2343 }
2344
2345 bool sessionclosed_isok = replay_unsafe_with_closed_session;
2346 // active session?
2347 Session *session = 0;
2348 if (req->get_source().is_client()) {
2349 session = mds->get_session(req);
2350 if (!session) {
2351 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
2352 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
2353 session->is_closing() ||
2354 session->is_killing()) {
2355 dout(5) << "session closed|closing|killing, dropping" << dendl;
2356 session = NULL;
2357 }
2358 if (!session) {
2359 if (req->is_queued_for_replay())
2360 mds->queue_one_replay();
2361 return;
2362 }
2363 }
2364
2365 // old mdsmap?
2366 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2367 // send it? hrm, this isn't ideal; they may get a lot of copies if
2368 // they have a high request rate.
2369 }
2370
2371 // completed request?
2372 bool has_completed = false;
2373 if (req->is_replay() || req->get_retry_attempt()) {
2374 ceph_assert(session);
2375 inodeno_t created;
2376 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2377 has_completed = true;
2378 if (!session->is_open())
2379 return;
2380 // Don't send a traceless reply if the completed request has created a
2381 // new inode. Treat the request as a lookup request instead.
2382 if (req->is_replay() ||
2383 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2384 req->get_op() != CEPH_MDS_OP_OPEN &&
2385 req->get_op() != CEPH_MDS_OP_CREATE)) {
2386 dout(5) << "already completed " << req->get_reqid() << dendl;
2387 auto reply = make_message<MClientReply>(*req, 0);
2388 if (created != inodeno_t()) {
2389 bufferlist extra;
2390 encode(created, extra);
2391 reply->set_extra_bl(extra);
2392 }
2393 mds->send_message_client(reply, session);
2394
2395 if (req->is_queued_for_replay())
2396 mds->queue_one_replay();
2397
2398 return;
2399 }
2400 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2401 req->get_op() != CEPH_MDS_OP_CREATE) {
2402 dout(10) << " completed request which created new inode " << created
2403 << ", convert it to lookup request" << dendl;
2404 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2405 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2406 }
2407 }
2408 }
2409
2410 // trim completed_request list
2411 if (req->get_oldest_client_tid() > 0) {
2412 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
2413 ceph_assert(session);
2414 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2415 // The session's 'completed_requests' was dirtied; mark it to be
2416 // potentially flushed at segment expiry.
2417 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2418
2419 if (session->get_num_trim_requests_warnings() > 0 &&
2420 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2421 session->reset_num_trim_requests_warnings();
2422 } else {
2423 if (session->get_num_completed_requests() >=
2424 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
2425 session->inc_num_trim_requests_warnings();
2426 CachedStackStringStream css;
2427 *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2428 << req->get_oldest_client_tid() << "), "
2429 << session->get_num_completed_requests()
2430 << " completed requests recorded in session\n";
2431 mds->clog->warn() << css->strv();
2432 dout(20) << __func__ << " " << css->strv() << dendl;
2433 }
2434 }
2435 }
2436
2437 // register + dispatch
2438 MDRequestRef mdr = mdcache->request_start(req);
2439 if (!mdr.get())
2440 return;
2441
2442 if (session) {
2443 mdr->session = session;
2444 session->requests.push_back(&mdr->item_session_request);
2445 }
2446
2447 if (has_completed)
2448 mdr->has_completed = true;
2449
2450 // process embedded cap releases?
2451 // (only if NOT replay!)
2452 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2453 client_t client = req->get_source().num();
2454 for (const auto &r : req->releases) {
2455 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2456 }
2457 req->releases.clear();
2458 }
2459
2460 dispatch_client_request(mdr);
2461 return;
2462 }
2463
2464 void Server::handle_osd_map()
2465 {
2466 /* Note that we check the OSDMAP_FULL flag directly rather than
2467 * using osdmap_full_flag(), because we want to know "is the flag set"
2468 * rather than "does the flag apply to us?" */
2469 mds->objecter->with_osdmap([this](const OSDMap& o) {
2470 auto pi = o.get_pg_pool(mds->get_metadata_pool());
2471 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2472 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2473 << o.get_epoch() << dendl;
2474 });
2475 }
2476
2477 void Server::dispatch_client_request(MDRequestRef& mdr)
2478 {
2479 // we shouldn't be waiting on anyone.
2480 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());
2481
2482 if (mdr->killed) {
2483 dout(10) << "request " << *mdr << " was killed" << dendl;
2484 // if the mdr is a "batch_op" and it has followers, pick a follower as
2485 // the new "head of the batch ops" and go on processing the new one.
2486 if (mdr->is_batch_head()) {
2487 int mask = mdr->client_request->head.args.getattr.mask;
2488 auto it = mdr->batch_op_map->find(mask);
2489 auto new_batch_head = it->second->find_new_head();
2490 if (!new_batch_head) {
2491 mdr->batch_op_map->erase(it);
2492 return;
2493 }
2494 mdr = std::move(new_batch_head);
2495 } else {
2496 return;
2497 }
2498 } else if (mdr->aborted) {
2499 mdr->aborted = false;
2500 mdcache->request_kill(mdr);
2501 return;
2502 }
2503
2504 const cref_t<MClientRequest> &req = mdr->client_request;
2505
2506 if (logger) logger->inc(l_mdss_dispatch_client_request);
2507
2508 dout(7) << "dispatch_client_request " << *req << dendl;
2509
2510 if (req->may_write() && mdcache->is_readonly()) {
2511 dout(10) << " read-only FS" << dendl;
2512 respond_to_request(mdr, -CEPHFS_EROFS);
2513 return;
2514 }
2515 if (mdr->has_more() && mdr->more()->peer_error) {
2516 dout(10) << " got error from peers" << dendl;
2517 respond_to_request(mdr, mdr->more()->peer_error);
2518 return;
2519 }
2520
2521 if (is_full) {
2522 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
2523 if (!cur) {
2524 // the request is already responded to
2525 return;
2526 }
2527 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2528 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2530 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2531 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2532 req->get_op() == CEPH_MDS_OP_CREATE ||
2533 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2534 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2535 ((req->get_op() == CEPH_MDS_OP_LINK ||
2536 req->get_op() == CEPH_MDS_OP_RENAME) &&
2537 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
2538 ) {
2539
2540 if (check_access(mdr, cur, MAY_FULL)) {
2541 dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2542 } else {
2543 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2544 respond_to_request(mdr, -CEPHFS_ENOSPC);
2545 return;
2546 }
2547 } else {
2548 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2549 }
2550 }
2551
2552 switch (req->get_op()) {
2553 case CEPH_MDS_OP_LOOKUPHASH:
2554 case CEPH_MDS_OP_LOOKUPINO:
2555 handle_client_lookup_ino(mdr, false, false);
2556 break;
2557 case CEPH_MDS_OP_LOOKUPPARENT:
2558 handle_client_lookup_ino(mdr, true, false);
2559 break;
2560 case CEPH_MDS_OP_LOOKUPNAME:
2561 handle_client_lookup_ino(mdr, false, true);
2562 break;
2563
2564 // inodes ops.
2565 case CEPH_MDS_OP_LOOKUP:
2566 handle_client_getattr(mdr, true);
2567 break;
2568
2569 case CEPH_MDS_OP_LOOKUPSNAP:
2570 // lookupsnap does not reference a CDentry; treat it as a getattr
2571 case CEPH_MDS_OP_GETATTR:
2572 handle_client_getattr(mdr, false);
2573 break;
2574 case CEPH_MDS_OP_GETVXATTR:
2575 handle_client_getvxattr(mdr);
2576 break;
2577
2578 case CEPH_MDS_OP_SETATTR:
2579 handle_client_setattr(mdr);
2580 break;
2581 case CEPH_MDS_OP_SETLAYOUT:
2582 handle_client_setlayout(mdr);
2583 break;
2584 case CEPH_MDS_OP_SETDIRLAYOUT:
2585 handle_client_setdirlayout(mdr);
2586 break;
2587 case CEPH_MDS_OP_SETXATTR:
2588 handle_client_setxattr(mdr);
2589 break;
2590 case CEPH_MDS_OP_RMXATTR:
2591 handle_client_removexattr(mdr);
2592 break;
2593
2594 case CEPH_MDS_OP_READDIR:
2595 handle_client_readdir(mdr);
2596 break;
2597
2598 case CEPH_MDS_OP_SETFILELOCK:
2599 handle_client_file_setlock(mdr);
2600 break;
2601
2602 case CEPH_MDS_OP_GETFILELOCK:
2603 handle_client_file_readlock(mdr);
2604 break;
2605
2606 // funky.
2607 case CEPH_MDS_OP_CREATE:
2608 if (mdr->has_completed)
2609 handle_client_open(mdr); // already created.. just open
2610 else
2611 handle_client_openc(mdr);
2612 break;
2613
2614 case CEPH_MDS_OP_OPEN:
2615 handle_client_open(mdr);
2616 break;
2617
2618 // namespace.
2619 // no prior locks.
2620 case CEPH_MDS_OP_MKNOD:
2621 handle_client_mknod(mdr);
2622 break;
2623 case CEPH_MDS_OP_LINK:
2624 handle_client_link(mdr);
2625 break;
2626 case CEPH_MDS_OP_UNLINK:
2627 case CEPH_MDS_OP_RMDIR:
2628 handle_client_unlink(mdr);
2629 break;
2630 case CEPH_MDS_OP_RENAME:
2631 handle_client_rename(mdr);
2632 break;
2633 case CEPH_MDS_OP_MKDIR:
2634 handle_client_mkdir(mdr);
2635 break;
2636 case CEPH_MDS_OP_SYMLINK:
2637 handle_client_symlink(mdr);
2638 break;
2639
2640
2641 // snaps
2642 case CEPH_MDS_OP_LSSNAP:
2643 handle_client_lssnap(mdr);
2644 break;
2645 case CEPH_MDS_OP_MKSNAP:
2646 handle_client_mksnap(mdr);
2647 break;
2648 case CEPH_MDS_OP_RMSNAP:
2649 handle_client_rmsnap(mdr);
2650 break;
2651 case CEPH_MDS_OP_RENAMESNAP:
2652 handle_client_renamesnap(mdr);
2653 break;
2654
2655 default:
2656 dout(1) << " unknown client op " << req->get_op() << dendl;
2657 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
2658 }
2659 }
2660
2661
2662 // ---------------------------------------
2663 // PEER REQUESTS
2664
2665 void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
2666 {
2667 dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2668 mds_rank_t from = mds_rank_t(m->get_source().num());
2669
2670 if (logger) logger->inc(l_mdss_handle_peer_request);
2671
2672 // reply?
2673 if (m->is_reply())
2674 return handle_peer_request_reply(m);
2675
2676 // the purpose of rename notify is enforcing causal message ordering: making sure
2677 // bystanders have received all messages from the rename srcdn's auth MDS.
2678 if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
2679 auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
2680 mds->send_message(reply, m->get_connection());
2681 return;
2682 }
2683
2684 CDentry *straydn = NULL;
2685 if (m->straybl.length() > 0) {
2686 mdcache->decode_replica_stray(straydn, nullptr, m->straybl, from);
2687 ceph_assert(straydn);
2688 m->straybl.clear();
2689 }
2690
2691 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2692 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2693 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2694 return;
2695 }
2696
2697 // am i a new peer?
2698 MDRequestRef mdr;
2699 if (mdcache->have_request(m->get_reqid())) {
2700 // existing?
2701 mdr = mdcache->request_get(m->get_reqid());
2702
2703 // is my request newer?
2704 if (mdr->attempt > m->get_attempt()) {
2705 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2706 << ", dropping " << *m << dendl;
2707 return;
2708 }
2709
2710 if (mdr->attempt < m->get_attempt()) {
2711 // mine is old, close it out
2712 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2713 << ", closing out" << dendl;
2714 mdcache->request_finish(mdr);
2715 mdr.reset();
2716 } else if (mdr->peer_to_mds != from) {
2717 dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
2718 return;
2719 }
2720
2721 // may get these while mdr->peer_request is non-null
2722 if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
2723 mds->locker->drop_locks(mdr.get());
2724 return;
2725 }
2726 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2727 if (m->is_abort()) {
2728 mdr->aborted = true;
2729 if (mdr->peer_request) {
2730 // only abort on-going xlock, wrlock and auth pin
2731 ceph_assert(!mdr->peer_did_prepare());
2732 } else {
2733 mdcache->request_finish(mdr);
2734 }
2735 } else {
2736 if (m->inode_export.length() > 0)
2737 mdr->more()->inode_import = m->inode_export;
2738 // finish off request.
2739 mdcache->request_finish(mdr);
2740 }
2741 return;
2742 }
2743 }
2744 if (!mdr.get()) {
2745 // new?
2746 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2747 dout(10) << "missing peer request for " << m->get_reqid()
2748 << " OP_FINISH, must have lost race with a forward" << dendl;
2749 return;
2750 }
2751 mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
2752 mdr->set_op_stamp(m->op_stamp);
2753 }
2754 ceph_assert(mdr->peer_request == 0); // only one at a time, please!
2755
2756 if (straydn) {
2757 mdr->pin(straydn);
2758 mdr->straydn = straydn;
2759 }
2760
2761 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2762 mdr->locks.empty()) {
2763 dout(3) << "not active yet, waiting" << dendl;
2764 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2765 return;
2766 }
2767
2768 mdr->reset_peer_request(m);
2769
2770 dispatch_peer_request(mdr);
2771 }
2772
2773 void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
2774 {
2775 mds_rank_t from = mds_rank_t(m->get_source().num());
2776
2777 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2778 metareqid_t r = m->get_reqid();
2779 if (!mdcache->have_uncommitted_leader(r, from)) {
2780 dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
2781 << from << " reqid " << r << dendl;
2782 return;
2783 }
2784 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2785 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2786 return;
2787 }
2788
2789 if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
2790 metareqid_t r = m->get_reqid();
2791 mdcache->committed_leader_peer(r, from);
2792 return;
2793 }
2794
2795 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2796 if (m->get_attempt() != mdr->attempt) {
2797 dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
2798 << m->get_attempt() << dendl;
2799 return;
2800 }
2801
2802 switch (m->get_op()) {
2803 case MMDSPeerRequest::OP_XLOCKACK:
2804 {
2805 // identify lock, leader request
2806 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2807 m->get_object_info());
2808 mdr->more()->peers.insert(from);
2809 lock->decode_locked_state(m->get_lock_data());
2810 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2811 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
2812 mdr->finish_locking(lock);
2813 lock->get_xlock(mdr, mdr->get_client());
2814
2815 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2816 mdr->more()->waiting_on_peer.erase(from);
2817 ceph_assert(mdr->more()->waiting_on_peer.empty());
2818 mdcache->dispatch_request(mdr);
2819 }
2820 break;
2821
2822 case MMDSPeerRequest::OP_WRLOCKACK:
2823 {
2824 // identify lock, leader request
2825 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2826 m->get_object_info());
2827 mdr->more()->peers.insert(from);
2828 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2829 auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2830 ceph_assert(it->is_remote_wrlock());
2831 ceph_assert(it->wrlock_target == from);
2832
2833 mdr->finish_locking(lock);
2834
2835 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2836 mdr->more()->waiting_on_peer.erase(from);
2837 ceph_assert(mdr->more()->waiting_on_peer.empty());
2838 mdcache->dispatch_request(mdr);
2839 }
2840 break;
2841
2842 case MMDSPeerRequest::OP_AUTHPINACK:
2843 handle_peer_auth_pin_ack(mdr, m);
2844 break;
2845
2846 case MMDSPeerRequest::OP_LINKPREPACK:
2847 handle_peer_link_prep_ack(mdr, m);
2848 break;
2849
2850 case MMDSPeerRequest::OP_RMDIRPREPACK:
2851 handle_peer_rmdir_prep_ack(mdr, m);
2852 break;
2853
2854 case MMDSPeerRequest::OP_RENAMEPREPACK:
2855 handle_peer_rename_prep_ack(mdr, m);
2856 break;
2857
2858 case MMDSPeerRequest::OP_RENAMENOTIFYACK:
2859 handle_peer_rename_notify_ack(mdr, m);
2860 break;
2861
2862 default:
2863 ceph_abort();
2864 }
2865 }
2866
2867 void Server::dispatch_peer_request(MDRequestRef& mdr)
2868 {
2869 dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;
2870
2871 if (mdr->aborted) {
2872 dout(7) << " abort flag set, finishing" << dendl;
2873 mdcache->request_finish(mdr);
2874 return;
2875 }
2876
2877 if (logger) logger->inc(l_mdss_dispatch_peer_request);
2878
2879 int op = mdr->peer_request->get_op();
2880 switch (op) {
2881 case MMDSPeerRequest::OP_XLOCK:
2882 case MMDSPeerRequest::OP_WRLOCK:
2883 {
2884 // identify object
2885 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2886 mdr->peer_request->get_object_info());
2887
2888 if (!lock) {
2889 dout(10) << "don't have object, dropping" << dendl;
2890 ceph_abort(); // can this happen if we auth pinned properly?
2891 }
2892 if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2893 dout(10) << "not auth for remote xlock attempt, dropping on "
2894 << *lock << " on " << *lock->get_parent() << dendl;
2895 } else {
2896 // use acquire_locks so that we get auth_pinning.
2897 MutationImpl::LockOpVec lov;
2898 for (const auto& p : mdr->locks) {
2899 if (p.is_xlock())
2900 lov.add_xlock(p.lock);
2901 else if (p.is_wrlock())
2902 lov.add_wrlock(p.lock);
2903 }
2904
2905 int replycode = 0;
2906 switch (op) {
2907 case MMDSPeerRequest::OP_XLOCK:
2908 lov.add_xlock(lock);
2909 replycode = MMDSPeerRequest::OP_XLOCKACK;
2910 break;
2911 case MMDSPeerRequest::OP_WRLOCK:
2912 lov.add_wrlock(lock);
2913 replycode = MMDSPeerRequest::OP_WRLOCKACK;
2914 break;
2915 }
2916
2917 if (!mds->locker->acquire_locks(mdr, lov))
2918 return;
2919
2920 // ack
2921 auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
2922 r->set_lock_type(lock->get_type());
2923 lock->get_parent()->set_object_info(r->get_object_info());
2924 if (replycode == MMDSPeerRequest::OP_XLOCKACK)
2925 lock->encode_locked_state(r->get_lock_data());
2926 mds->send_message(r, mdr->peer_request->get_connection());
2927 }
2928
2929 // done.
2930 mdr->reset_peer_request();
2931 }
2932 break;
2933
2934 case MMDSPeerRequest::OP_UNXLOCK:
2935 case MMDSPeerRequest::OP_UNWRLOCK:
2936 {
2937 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2938 mdr->peer_request->get_object_info());
2939 ceph_assert(lock);
2940 auto it = mdr->locks.find(lock);
2941 ceph_assert(it != mdr->locks.end());
2942 bool need_issue = false;
2943 switch (op) {
2944 case MMDSPeerRequest::OP_UNXLOCK:
2945 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
2946 break;
2947 case MMDSPeerRequest::OP_UNWRLOCK:
2948 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
2949 break;
2950 }
2951 if (need_issue)
2952 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2953
2954 // done. no ack necessary.
2955 mdr->reset_peer_request();
2956 }
2957 break;
2958
2959 case MMDSPeerRequest::OP_AUTHPIN:
2960 handle_peer_auth_pin(mdr);
2961 break;
2962
2963 case MMDSPeerRequest::OP_LINKPREP:
2964 case MMDSPeerRequest::OP_UNLINKPREP:
2965 handle_peer_link_prep(mdr);
2966 break;
2967
2968 case MMDSPeerRequest::OP_RMDIRPREP:
2969 handle_peer_rmdir_prep(mdr);
2970 break;
2971
2972 case MMDSPeerRequest::OP_RENAMEPREP:
2973 handle_peer_rename_prep(mdr);
2974 break;
2975
2976 default:
2977 ceph_abort();
2978 }
2979 }
2980
2981 void Server::handle_peer_auth_pin(MDRequestRef& mdr)
2982 {
2983 dout(10) << "handle_peer_auth_pin " << *mdr << dendl;
2984
2985 // build list of objects
2986 list<MDSCacheObject*> objects;
2987 CInode *auth_pin_freeze = NULL;
2988 bool nonblocking = mdr->peer_request->is_nonblocking();
2989 bool fail = false, wouldblock = false, readonly = false;
2990 ref_t<MMDSPeerRequest> reply;
2991
2992 if (mdcache->is_readonly()) {
2993 dout(10) << " read-only FS" << dendl;
2994 readonly = true;
2995 fail = true;
2996 }
2997
2998 if (!fail) {
2999 for (const auto &oi : mdr->peer_request->get_authpins()) {
3000 MDSCacheObject *object = mdcache->get_object(oi);
3001 if (!object) {
3002 dout(10) << " don't have " << oi << dendl;
3003 fail = true;
3004 break;
3005 }
3006
3007 objects.push_back(object);
3008 if (oi == mdr->peer_request->get_authpin_freeze())
3009 auth_pin_freeze = static_cast<CInode*>(object);
3010 }
3011 }
3012
3013 // can we auth pin them?
3014 if (!fail) {
3015 for (const auto& obj : objects) {
3016 if (!obj->is_auth()) {
3017 dout(10) << " not auth for " << *obj << dendl;
3018 fail = true;
3019 break;
3020 }
3021 if (mdr->is_auth_pinned(obj))
3022 continue;
3023 if (!mdr->can_auth_pin(obj)) {
3024 if (nonblocking) {
3025 dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
3026 fail = true;
3027 wouldblock = true;
3028 break;
3029 }
3030 // wait
3031 dout(10) << " waiting for authpinnable on " << *obj << dendl;
3032 obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3033 mdr->drop_local_auth_pins();
3034
3035 mds->locker->notify_freeze_waiter(obj);
3036 goto blocked;
3037 }
3038 }
3039 }
3040
3041 if (!fail) {
3042 /* a previously frozen authpin is on the wrong inode; undo it */
3043 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
3044 mdr->more()->rename_inode != auth_pin_freeze)
3045 mdr->unfreeze_auth_pin(true);
3046
3047 /* handle_peer_rename_prep() calls freeze_inode() to wait for all other operations
3048 * on the source inode to complete. This happens after all locks for the rename
3049 * operation are acquired. But to acquire locks, we need to auth pin the locks'
3050 * parent objects first. So there is an ABBA deadlock if someone auth pins the
3051 * source inode after locks are acquired and before Server::handle_peer_rename_prep()
3052 * is called. The solution is to freeze the inode and prevent other MDRequests from getting new
3053 * auth pins.
3054 */
3055 if (auth_pin_freeze) {
3056 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
3057 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
3058 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
3059 mds->mdlog->flush();
3060 goto blocked;
3061 }
3062 }
3063 }
3064
3065 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3066
3067 if (fail) {
3068 mdr->drop_local_auth_pins(); // just in case
3069 if (readonly)
3070 reply->mark_error_rofs();
3071 if (wouldblock)
3072 reply->mark_error_wouldblock();
3073 } else {
3074 // auth pin!
3075 for (const auto& obj : objects) {
3076 dout(10) << "auth_pinning " << *obj << dendl;
3077 mdr->auth_pin(obj);
3078 }
3079 // return list of my auth_pins (if any)
3080 for (const auto &p : mdr->object_states) {
3081 if (!p.second.auth_pinned)
3082 continue;
3083 MDSCacheObjectInfo info;
3084 p.first->set_object_info(info);
3085 reply->get_authpins().push_back(info);
3086 if (p.first == (MDSCacheObject*)auth_pin_freeze)
3087 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
3088 }
3089 }
3090
3091 mds->send_message_mds(reply, mdr->peer_to_mds);
3092
3093 // clean up this request
3094 mdr->reset_peer_request();
3095 return;
3096
3097 blocked:
3098 if (mdr->peer_request->should_notify_blocking()) {
3099 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3100 reply->mark_req_blocked();
3101 mds->send_message_mds(reply, mdr->peer_to_mds);
3102 mdr->peer_request->clear_notify_blocking();
3103 }
3104 return;
3105 }
3106
3107 void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
3108 {
3109 dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
3110 mds_rank_t from = mds_rank_t(ack->get_source().num());
3111
3112 if (ack->is_req_blocked()) {
3113 mdr->disable_lock_cache();
3114 // peer auth pin is blocked, drop locks to avoid deadlock
3115 mds->locker->drop_locks(mdr.get(), nullptr);
3116 return;
3117 }
3118
3119 // added auth pins?
3120 set<MDSCacheObject*> pinned;
3121 for (const auto &oi : ack->get_authpins()) {
3122 MDSCacheObject *object = mdcache->get_object(oi);
3123 ceph_assert(object); // we pinned it
3124 dout(10) << " remote has pinned " << *object << dendl;
3125 mdr->set_remote_auth_pinned(object, from);
3126 if (oi == ack->get_authpin_freeze())
3127 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
3128 pinned.insert(object);
3129 }
3130
3131 // removed frozen auth pin ?
3132 if (mdr->more()->is_remote_frozen_authpin &&
3133 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
3134 auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
3135 ceph_assert(stat_p);
3136 if (stat_p->remote_auth_pinned == from) {
3137 mdr->more()->is_remote_frozen_authpin = false;
3138 }
3139 }
3140
3141 // removed auth pins?
3142 for (auto& p : mdr->object_states) {
3143 if (p.second.remote_auth_pinned == MDS_RANK_NONE)
3144 continue;
3145 MDSCacheObject* object = p.first;
3146 if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
3147 dout(10) << " remote has unpinned " << *object << dendl;
3148 mdr->_clear_remote_auth_pinned(p.second);
3149 }
3150 }
3151
3152 // note peer
3153 mdr->more()->peers.insert(from);
3154
3155 // clear from waiting list
3156 auto ret = mdr->more()->waiting_on_peer.erase(from);
3157 ceph_assert(ret);
3158
3159 if (ack->is_error_rofs()) {
3160 mdr->more()->peer_error = -CEPHFS_EROFS;
3161 } else if (ack->is_error_wouldblock()) {
3162 mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
3163 }
3164
3165 // go again?
3166 if (mdr->more()->waiting_on_peer.empty())
3167 mdcache->dispatch_request(mdr);
3168 else
3169 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
3170 }
3171
3172
3173 // ---------------------------------------
3174 // HELPERS
3175
3176
3177 /**
3178 * check whether we are permitted to complete a request
3179 *
3180 * Check whether we have permission to perform the operation specified
3181 * by mask on the given inode, based on the capability in the mdr's
3182 * session.
3183 */
3184 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
3185 {
3186 if (mdr->session) {
3187 int r = mdr->session->check_access(
3188 in, mask,
3189 mdr->client_request->get_caller_uid(),
3190 mdr->client_request->get_caller_gid(),
3191 &mdr->client_request->get_caller_gid_list(),
3192 mdr->client_request->head.args.setattr.uid,
3193 mdr->client_request->head.args.setattr.gid);
3194 if (r < 0) {
3195 respond_to_request(mdr, r);
3196 return false;
3197 }
3198 }
3199 return true;
3200 }
3201
3202 /**
3203 * check whether a dir fragment has reached its maximum size
3204 *
3205 */
3206 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
3207 {
3208 const auto size = dir->get_frag_size();
3209 const auto max = bal_fragment_size_max;
3210 if (size >= max) {
3211 dout(10) << "fragment " << *dir << " size exceeds " << max << " (CEPHFS_ENOSPC)" << dendl;
3212 respond_to_request(mdr, -CEPHFS_ENOSPC);
3213 return false;
3214 } else {
3215 dout(20) << "fragment " << *dir << " size " << size << " < " << max << dendl;
3216 }
3217
3218 return true;
3219 }
3220
3221 /**
3222 * check whether the number of entries in a dir has reached the maximum
3223 *
3224 */
3225 bool Server::check_dir_max_entries(MDRequestRef &mdr, CDir *in)
3226 {
3227 const uint64_t size = in->inode->get_projected_inode()->dirstat.nfiles +
3228 in->inode->get_projected_inode()->dirstat.nsubdirs;
3229 if (dir_max_entries && size >= dir_max_entries) {
3230 dout(10) << "entries per dir " << *in << " size exceeds " << dir_max_entries << " (ENOSPC)" << dendl;
3231 respond_to_request(mdr, -ENOSPC);
3232 return false;
3233 }
3234 return true;
3235 }
3236
3237
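// Locate or create the null stray dentry that the given inode will be linked
// under once it is unlinked/replaced. Returns nullptr if the request was
// requeued (stray dir frozen) or has already been answered (stray dir full).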
3238 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3239 {
3240 string straydname;
3241 in->name_stray_dentry(straydname);
3242
3243 CDentry *straydn = mdr->straydn;
3244 if (straydn) {
3245 ceph_assert(straydn->get_name() == straydname);
3246 return straydn;
3247 }
3248 CDir *straydir = mdcache->get_stray_dir(in);
3249
3250 if (!mdr->client_request->is_replay() &&
3251 !check_fragment_space(mdr, straydir))
3252 return nullptr;
3253
3254 straydn = straydir->lookup(straydname);
3255 if (!straydn) {
3256 if (straydir->is_frozen_dir()) {
3257 dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
3258 mds->locker->drop_locks(mdr.get());
3259 mdr->drop_local_auth_pins();
3260 straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3261 return nullptr;
3262 }
3263 straydn = straydir->add_null_dentry(straydname);
3264 straydn->mark_new();
3265 } else {
3266 ceph_assert(straydn->get_projected_linkage()->is_null());
3267 }
3268
3269 straydn->state_set(CDentry::STATE_STRAY);
3270 mdr->straydn = straydn;
3271 mdr->pin(straydn);
3272
3273 return straydn;
3274 }
3275
3276 /** prepare_new_inode
3277 *
3278 * create a new inode. set c/m/atime. hit dir pop.
3279 */
3280 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3281 const file_layout_t *layout)
3282 {
3283 CInode *in = new CInode(mdcache);
3284 auto _inode = in->_get_inode();
3285
3286 // Server::prepare_force_open_sessions() can re-open a session that is in
3287 // the closing state. In that corner case the session's prealloc_inos are
3288 // being freed. To simplify the code, we disallow using/refilling the
3289 // session's prealloc_inos unless the session is fully open.
3290 bool allow_prealloc_inos = mdr->session->is_open();
3291
3292 // assign ino
3293 if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(useino))) {
3294 mds->sessionmap.mark_projected(mdr->session);
3295 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3296 << " (" << mdr->session->info.prealloc_inos.size() << " left)"
3297 << dendl;
3298 } else {
3299 mdr->alloc_ino =
3300 _inode->ino = mds->inotable->project_alloc_id(useino);
3301 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3302 }
3303
3304 if (useino && useino != _inode->ino) {
3305 dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
3306 mds->clog->error() << mdr->client_request->get_source()
3307 << " specified ino " << useino
3308 << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
3309 //ceph_abort(); // just for now.
3310 }
3311
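// Refill the session's pool of preallocated inos once the projected count
// drops below half of mds_client_prealloc_inos.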
3312 if (allow_prealloc_inos &&
3313 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3314 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3315 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3316 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3317 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3318 mds->sessionmap.mark_projected(mdr->session);
3319 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3320 }
3321
3322 _inode->version = 1;
3323 _inode->xattr_version = 1;
3324 _inode->nlink = 1; // FIXME
3325
3326 _inode->mode = mode;
3327
3328 // FIPS zeroization audit 20191117: this memset is not security related.
3329 memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
3330 if (_inode->is_dir()) {
3331 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3332 } else if (layout) {
3333 _inode->layout = *layout;
3334 } else {
3335 _inode->layout = mdcache->default_file_layout;
3336 }
3337
3338 _inode->truncate_size = -1ull; // not truncated, yet!
3339 _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3340
3341 CInode *diri = dir->get_inode();
3342 auto pip = diri->get_projected_inode();
3343
3344 dout(10) << oct << " dir mode 0" << pip->mode << " new mode 0" << mode << dec << dendl;
3345
3346 if (pip->mode & S_ISGID) {
3347 dout(10) << " dir is setgid" << dendl;
3348 _inode->gid = pip->gid;
3349 if (S_ISDIR(mode)) {
3350 dout(10) << " new dir also setgid" << dendl;
3351 _inode->mode |= S_ISGID;
3352 }
3353 } else {
3354 _inode->gid = mdr->client_request->get_caller_gid();
3355 }
3356
3357 _inode->uid = mdr->client_request->get_caller_uid();
3358
3359 _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
3360 mdr->get_op_stamp();
3361
3362 _inode->change_attr = 0;
3363
3364 const cref_t<MClientRequest> &req = mdr->client_request;
3365 if (req->get_data().length()) {
3366 auto p = req->get_data().cbegin();
3367
3368 // xattrs on new inode?
3369 auto _xattrs = CInode::allocate_xattr_map();
3370 decode_noshare(*_xattrs, p);
3371 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
3372 if (_xattrs->count("encryption.ctx")) {
3373 _inode->fscrypt = true;
3374 }
3375 in->reset_xattrs(std::move(_xattrs));
3376 }
3377
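// If inline data is disabled in the MDSMap, or the client connection lacks
// the MDS_INLINE_DATA feature, mark the new inode's inline data as absent.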
3378 if (!mds->mdsmap->get_inline_data_enabled() ||
3379 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3380 _inode->inline_data.version = CEPH_INLINE_NONE;
3381
3382 mdcache->add_inode(in); // add
3383 dout(10) << "prepare_new_inode " << *in << dendl;
3384 return in;
3385 }
3386
3387 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3388 {
3389 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3390 << " inotablev " << mds->inotable->get_projected_version()
3391 << dendl;
3392 blob->set_ino_alloc(mdr->alloc_ino,
3393 mdr->used_prealloc_ino,
3394 mdr->prealloc_inos,
3395 mdr->client_request->get_source(),
3396 mds->sessionmap.get_projected(),
3397 mds->inotable->get_projected_version());
3398 }
3399
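// Counterpart to journal_allocated_inos(): make the projected ino allocations
// permanent by moving prealloc_inos out of the session's pending set into its
// usable sets and applying them to the inotable.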
3400 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3401 {
3402 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3403 << " / " << mdr->prealloc_inos
3404 << " / " << mdr->used_prealloc_ino << dendl;
3405
3406 if (mdr->alloc_ino) {
3407 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3408 }
3409 if (mdr->prealloc_inos.size()) {
3410 ceph_assert(session);
3411 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3412 session->free_prealloc_inos.insert(mdr->prealloc_inos);
3413 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3414 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
3415 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3416 }
3417 if (mdr->used_prealloc_ino) {
3418 ceph_assert(session);
3419 session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
3420 mds->sessionmap.mark_dirty(session);
3421 }
3422 }
3423
3424 struct C_MDS_TryOpenInode : public ServerContext {
3425 MDRequestRef mdr;
3426 inodeno_t ino;
3427 C_MDS_TryOpenInode(Server *s, MDRequestRef& r, inodeno_t i) :
3428 ServerContext(s), mdr(r), ino(i) {}
3429 void finish(int r) override {
3430 server->_try_open_ino(mdr, r, ino);
3431 }
3432 };
3433
3434 void Server::_try_open_ino(MDRequestRef& mdr, int r, inodeno_t ino)
3435 {
3436 dout(10) << "_try_open_ino " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3437
3438 // `r` is a rank if >=0, else an error code
3439 if (r >= 0) {
3440 mds_rank_t dest_rank(r);
3441 if (dest_rank == mds->get_nodeid())
3442 dispatch_client_request(mdr);
3443 else
3444 mdcache->request_forward(mdr, dest_rank);
3445 return;
3446 }
3447
3448 // give up
3449 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
3450 r = -CEPHFS_ESTALE;
3451 respond_to_request(mdr, r);
3452 }
3453
3454 class C_MDS_TryFindInode : public ServerContext {
3455 MDRequestRef mdr;
3456 MDCache *mdcache;
3457 inodeno_t ino;
3458 public:
3459 C_MDS_TryFindInode(Server *s, MDRequestRef& r, MDCache *m, inodeno_t i) :
3460 ServerContext(s), mdr(r), mdcache(m), ino(i) {}
3461 void finish(int r) override {
3462 if (r == -CEPHFS_ESTALE) { // :( find_ino_peers failed
3463 /*
3464 * There is one case where the MDS crashes before the openfiletable
3465 * journal could be flushed, and the replacing MDS may then fail to
3466 * load some already-opened CInodes back into the MDCache. If the
3467 * clients retry those requests after reconnecting, the MDS will
3468 * fail to find the ino in all active peers and would return
3469 * -ESTALE.
3470 *
3471 * As a workaround users can run `ls -R ${mountpoint}` to list
3472 * all the sub-files and sub-directories under the mountpoint.
3473 *
3474 * Instead of returning -ESTALE right away in that situation, we
3475 * try to open the ino (unless the inode is already being purged)
3476 * and then retry the request.
3477 */
3478 CInode *in = mdcache->get_inode(ino);
3479 if (in && in->state_test(CInode::STATE_PURGING))
3480 server->respond_to_request(mdr, r);
3481 else
3482 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_TryOpenInode(server, mdr, ino));
3483 } else {
3484 server->dispatch_client_request(mdr);
3485 }
3486 }
3487 };
3488
3489 /* If this returns null, the request has been handled
3490 * as appropriate: forwarded on, or the client's been replied to */
3491 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
3492 bool want_auth,
3493 bool no_want_auth)
3494 {
3495 const filepath& refpath = mdr->get_filepath();
3496 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3497
3498 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3499 return mdr->in[0];
3500
3501 // traverse
3502 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3503 int flags = 0;
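// Paths whose last component is a snapshot want the auth MDS (unless
// no_want_auth); other paths take path/snap rdlocks during traversal and want
// auth only when the caller asked for it or forward_all_requests_to_auth is set.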
3504 if (refpath.is_last_snap()) {
3505 if (!no_want_auth)
3506 want_auth = true;
3507 } else {
3508 if (!no_want_auth && forward_all_requests_to_auth)
3509 want_auth = true;
3510 flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
3511 }
3512 if (want_auth)
3513 flags |= MDS_TRAVERSE_WANT_AUTH;
3514 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
3515 if (r > 0)
3516 return nullptr; // delayed
3517 if (r < 0) { // error
3518 if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
3519 if (mdr->client_request &&
3520 mdr->client_request->get_dentry_wanted())
3521 mdr->tracedn = mdr->dn[0].back();
3522 respond_to_request(mdr, r);
3523 } else if (r == -CEPHFS_ESTALE) {
3524 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3525 inodeno_t ino = refpath.get_ino();
3526 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3527 } else {
3528 dout(10) << "FAIL on error " << r << dendl;
3529 respond_to_request(mdr, r);
3530 }
3531 return nullptr;
3532 }
3533 CInode *ref = mdr->in[0];
3534 dout(10) << "ref is " << *ref << dendl;
3535
3536 if (want_auth) {
3537 // auth_pin?
3538 // do NOT proceed if freezing, as cap release may defer in that case, and
3539 // we could deadlock when we try to lock @ref.
3540 // if we're already auth_pinned, continue; the release has already been processed.
3541 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3542 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3543 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3544 ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
3545 if (mdr->is_any_remote_auth_pin())
3546 mds->locker->notify_freeze_waiter(ref);
3547 return nullptr;
3548 }
3549 mdr->auth_pin(ref);
3550 }
3551
3552 // set and pin ref
3553 mdr->pin(ref);
3554 return ref;
3555 }
3556
3557
3558 /** rdlock_path_xlock_dentry
3559 * traverse path to the directory that could/would contain dentry.
3560 * make sure i am auth for that dentry, forward as necessary.
3561 * create null dentry in place (or use existing if okexist).
3562 * get rdlocks on traversed dentries, xlock on new dentry.
3563 */
3564 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
3565 bool create, bool okexist, bool want_layout)
3566 {
3567 const filepath& refpath = mdr->get_filepath();
3568 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3569
3570 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3571 return mdr->dn[0].back();
3572
3573 // figure parent dir vs dname
3574 if (refpath.depth() == 0) {
3575 dout(7) << "invalid path (zero length)" << dendl;
3576 respond_to_request(mdr, -CEPHFS_EINVAL);
3577 return nullptr;
3578 }
3579
3580 if (refpath.is_last_snap()) {
3581 respond_to_request(mdr, -CEPHFS_EROFS);
3582 return nullptr;
3583 }
3584
3585 if (refpath.is_last_dot_or_dotdot()) {
3586 dout(7) << "invalid path (last dot or dot_dot)" << dendl;
3587 if (create)
3588 respond_to_request(mdr, -CEPHFS_EEXIST);
3589 else
3590 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
3591 return nullptr;
3592 }
3593
3594 // traverse to parent dir
3595 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3596 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
3597 MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
3598 MDS_TRAVERSE_WANT_AUTH;
3599 if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
3600 flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
3601 if (create)
3602 flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
3603 if (want_layout)
3604 flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
3605 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3606 if (r > 0)
3607 return nullptr; // delayed
3608 if (r < 0) {
3609 if (r == -CEPHFS_ESTALE) {
3610 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3611 inodeno_t ino = refpath.get_ino();
3612 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3613 return nullptr;
3614 }
3615 respond_to_request(mdr, r);
3616 return nullptr;
3617 }
3618
3619 CDentry *dn = mdr->dn[0].back();
3620 CDir *dir = dn->get_dir();
3621 CInode *diri = dir->get_inode();
3622
3623 if (!mdr->reqid.name.is_mds()) {
3624 if (diri->is_system() && !diri->is_root()) {
3625 respond_to_request(mdr, -CEPHFS_EROFS);
3626 return nullptr;
3627 }
3628 }
3629
3630 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3631 respond_to_request(mdr, -CEPHFS_ENOENT);
3632 return nullptr;
3633 }
3634
3635 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3636 if (dnl->is_null()) {
3637 if (!create && okexist) {
3638 respond_to_request(mdr, -CEPHFS_ENOENT);
3639 return nullptr;
3640 }
3641
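// The dentry we are about to create becomes live starting at the next global
// snapshot sequence number, so earlier snapshots do not see it.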
3642 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3643 dn->first = std::max(dn->first, next_snap);
3644 } else {
3645 if (!okexist) {
3646 respond_to_request(mdr, -CEPHFS_EEXIST);
3647 return nullptr;
3648 }
3649 mdr->in[0] = dnl->get_inode();
3650 }
3651
3652 return dn;
3653 }
3654
3655 /** rdlock_two_paths_xlock_destdn
3656 * traverse two paths and lock the two paths in proper order.
3657 * The order of taking locks is:
3658 * 1. Lock directory inodes or dentries according to which trees they
3659 * are under. Lock objects under fs root before objects under mdsdir.
3660 * 2. Lock directory inodes or dentries according to their depth, in
3661 * ascending order.
3662 * 3. Lock directory inodes or dentries according to inode numbers or
3663 * dentries' parent inode numbers, in ascending order.
3664 * 4. Lock dentries in the same directory in order of their keys.
3665 * 5. Lock non-directory inodes according to inode numbers, in ascending
3666 * order.
3667 */
3668 std::pair<CDentry*, CDentry*>
3669 Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
3670 {
3671
3672 const filepath& refpath = mdr->get_filepath();
3673 const filepath& refpath2 = mdr->get_filepath2();
3674
3675 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
3676
3677 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3678 return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
3679
3680 if (refpath.depth() != 1 || refpath2.depth() != 1) {
3681 respond_to_request(mdr, -CEPHFS_EINVAL);
3682 return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
3683 }
3684
3685 if (refpath.is_last_snap() || refpath2.is_last_snap()) {
3686 respond_to_request(mdr, -CEPHFS_EROFS);
3687 return std::make_pair(nullptr, nullptr);
3688 }
3689
3690 // traverse to parent dir
3691 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3692 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
3693 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3694 if (r != 0) {
3695 if (r == -CEPHFS_ESTALE) {
3696 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
3697 inodeno_t ino = refpath.get_ino();
3698 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3699 } else if (r < 0) {
3700 respond_to_request(mdr, r);
3701 }
3702 return std::make_pair(nullptr, nullptr);
3703 }
3704
3705 flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
3706 r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
3707 if (r != 0) {
3708 if (r == -CEPHFS_ESTALE) {
3709 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
3710 inodeno_t ino = refpath2.get_ino();
3711 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
3712 } else if (r < 0) {
3713 respond_to_request(mdr, r);
3714 }
3715 return std::make_pair(nullptr, nullptr);
3716 }
3717
3718 CDentry *srcdn = mdr->dn[1].back();
3719 CDir *srcdir = srcdn->get_dir();
3720 CDentry *destdn = mdr->dn[0].back();
3721 CDir *destdir = destdn->get_dir();
3722
3723 if (!mdr->reqid.name.is_mds()) {
3724 if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
3725 (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
3726 respond_to_request(mdr, -CEPHFS_EROFS);
3727 return std::make_pair(nullptr, nullptr);
3728 }
3729 }
3730
3731 if (!destdir->get_inode()->is_base() &&
3732 destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
3733 respond_to_request(mdr, -CEPHFS_ENOENT);
3734 return std::make_pair(nullptr, nullptr);
3735 }
3736
3737 MutationImpl::LockOpVec lov;
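// Build the lock vector following the ordering rules above: when both dentries
// share a parent directory inode, wrlock it once and order the two dentry
// xlocks by dentry name (rule 4); otherwise order the per-directory lock
// groups using compare_paths() and the directory inode numbers (rules 1-3).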
3738 if (srcdir->get_inode() == destdir->get_inode()) {
3739 lov.add_wrlock(&destdir->inode->filelock);
3740 lov.add_wrlock(&destdir->inode->nestlock);
3741 if (xlock_srcdn && srcdir != destdir) {
3742 mds_rank_t srcdir_auth = srcdir->authority().first;
3743 if (srcdir_auth != mds->get_nodeid()) {
3744 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3745 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3746 }
3747 }
3748
3749 if (srcdn->get_name() > destdn->get_name())
3750 lov.add_xlock(&destdn->lock);
3751
3752 if (xlock_srcdn)
3753 lov.add_xlock(&srcdn->lock);
3754 else
3755 lov.add_rdlock(&srcdn->lock);
3756
3757 if (srcdn->get_name() < destdn->get_name())
3758 lov.add_xlock(&destdn->lock);
3759 } else {
3760 int cmp = mdr->compare_paths();
3761 bool lock_destdir_first =
3762 (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
3763
3764 if (lock_destdir_first) {
3765 lov.add_wrlock(&destdir->inode->filelock);
3766 lov.add_wrlock(&destdir->inode->nestlock);
3767 lov.add_xlock(&destdn->lock);
3768 }
3769
3770 if (xlock_srcdn) {
3771 mds_rank_t srcdir_auth = srcdir->authority().first;
3772 if (srcdir_auth == mds->get_nodeid()) {
3773 lov.add_wrlock(&srcdir->inode->filelock);
3774 lov.add_wrlock(&srcdir->inode->nestlock);
3775 } else {
3776 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3777 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3778 }
3779 lov.add_xlock(&srcdn->lock);
3780 } else {
3781 lov.add_rdlock(&srcdn->lock);
3782 }
3783
3784 if (!lock_destdir_first) {
3785 lov.add_wrlock(&destdir->inode->filelock);
3786 lov.add_wrlock(&destdir->inode->nestlock);
3787 lov.add_xlock(&destdn->lock);
3788 }
3789 }
3790
3791 CInode *auth_pin_freeze = nullptr;
3792 // XXX any better way to do this?
3793 if (xlock_srcdn && !srcdn->is_auth()) {
3794 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
3795 auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
3796 }
3797 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
3798 return std::make_pair(nullptr, nullptr);
3799
3800 if (srcdn->get_projected_linkage()->is_null()) {
3801 respond_to_request(mdr, -CEPHFS_ENOENT);
3802 return std::make_pair(nullptr, nullptr);
3803 }
3804
3805 if (destdn->get_projected_linkage()->is_null()) {
3806 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3807 destdn->first = std::max(destdn->first, next_snap);
3808 }
3809
3810 mdr->locking_state |= MutationImpl::PATH_LOCKED;
3811
3812 return std::make_pair(destdn, srcdn);
3813 }
3814
3815 /**
3816 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3817 *
3818 * @param diri base inode
3819 * @param fg the exact frag we want
3820 * @param mdr request
3821 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3822 */
3823 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3824 {
3825 CDir *dir = diri->get_dirfrag(fg);
3826
3827 if (dir) {
3828 // am i auth for the dirfrag?
3829 if (!dir->is_auth()) {
3830 mds_rank_t auth = dir->authority().first;
3831 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3832 << ", fw to mds." << auth << dendl;
3833 mdcache->request_forward(mdr, auth);
3834 return nullptr;
3835 }
3836 } else {
3837 // not open and inode not mine?
3838 if (!diri->is_auth()) {
3839 mds_rank_t inauth = diri->authority().first;
3840 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3841 mdcache->request_forward(mdr, inauth);
3842 return nullptr;
3843 }
3844
3845 // not open and inode frozen?
3846 if (diri->is_frozen()) {
3847 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3848 ceph_assert(diri->get_parent_dir());
3849 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3850 return nullptr;
3851 }
3852
3853 // invent?
3854 dir = diri->get_or_open_dirfrag(mdcache, fg);
3855 }
3856
3857 return dir;
3858 }
3859
3860
3861 // ===============================================================================
3862 // STAT
3863
3864 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3865 {
3866 const cref_t<MClientRequest> &req = mdr->client_request;
3867
3868 if (req->get_filepath().depth() == 0 && is_lookup) {
3869 // refpath can't be empty for lookup but it can for
3870 // getattr (we do getattr with empty refpath for mount of '/')
3871 respond_to_request(mdr, -CEPHFS_EINVAL);
3872 return;
3873 }
3874
3875 bool want_auth = false;
3876 int mask = req->head.args.getattr.mask;
3877 if (mask & CEPH_STAT_RSTAT)
3878 want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
3879
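// Batch identical getattr/lookup requests: if another request with the same
// mask is already in flight on this dentry/inode, queue behind it and let the
// batch head reply for everyone; otherwise this request becomes the new head.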
3880 if (!mdr->is_batch_head() && mdr->can_batch()) {
3881 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
3882 int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
3883 (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
3884 &mdr->dn[0], &mdr->in[0]);
3885 if (r > 0)
3886 return; // delayed
3887
3888 if (r < 0) {
3889 // fall-thru. let rdlock_path_pin_ref() check again.
3890 } else if (is_lookup) {
3891 CDentry* dn = mdr->dn[0].back();
3892 mdr->pin(dn);
3893 auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3894 if (em.second) {
3895 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3896 } else {
3897 dout(20) << __func__ << ": LOOKUP op, waiting for previous getattr ops with the same mask to respond. " << *mdr << dendl;
3898 em.first->second->add_request(mdr);
3899 return;
3900 }
3901 } else {
3902 CInode *in = mdr->in[0];
3903 mdr->pin(in);
3904 auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3905 if (em.second) {
3906 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3907 } else {
3908 dout(20) << __func__ << ": GETATTR op, waiting for previous getattr ops with the same mask to respond. " << *mdr << dendl;
3909 em.first->second->add_request(mdr);
3910 return;
3911 }
3912 }
3913 }
3914
3915 CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
3916 if (!ref)
3917 return;
3918
3919 mdr->getattr_caps = mask;
3920
3921 /*
3922 * if client currently holds the EXCL cap on a field, do not rdlock
3923 * it; client's stat() will result in valid info if _either_ EXCL
3924 * cap is held or MDS rdlocks and reads the value here.
3925 *
3926 * handling this case here is easier than weakening rdlock
3927 * semantics... that would cause problems elsewhere.
3928 */
3929 client_t client = mdr->get_client();
3930 int issued = 0;
3931 Capability *cap = ref->get_client_cap(client);
3932 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3933 mdr->snapid <= cap->client_follows))
3934 issued = cap->issued();
3935
3936 // FIXME
3937 MutationImpl::LockOpVec lov;
3938 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3939 lov.add_rdlock(&ref->linklock);
3940 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3941 lov.add_rdlock(&ref->authlock);
3942 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3943 lov.add_rdlock(&ref->xattrlock);
3944 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3945 // Don't wait on unstable filelock if client is allowed to read file size.
3946 // This can reduce the response time of getattr in the case that multiple
3947 // clients do stat(2) and there are writers.
3948 // The downside of this optimization is that mds may not issue Fs caps along
3949 // with getattr reply. Client may need to send more getattr requests.
3950 if (mdr->is_rdlocked(&ref->filelock)) {
3951 lov.add_rdlock(&ref->filelock);
3952 } else if (ref->filelock.is_stable() ||
3953 ref->filelock.get_num_wrlocks() > 0 ||
3954 !ref->filelock.can_read(mdr->get_client())) {
3955 lov.add_rdlock(&ref->filelock);
3956 mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
3957 }
3958 }
3959
3960 if (!mds->locker->acquire_locks(mdr, lov))
3961 return;
3962
3963 if (!check_access(mdr, ref, MAY_READ))
3964 return;
3965
3966 utime_t now = ceph_clock_now();
3967 mdr->set_mds_stamp(now);
3968
3969 // note which caps are requested, so we return at least a snapshot
3970 // value for them. (currently this matters for xattrs and inline data)
3971 mdr->getattr_caps = mask;
3972
3973 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
3974
3975 // reply
3976 dout(10) << "reply to stat on " << *req << dendl;
3977 mdr->tracei = ref;
3978 if (is_lookup)
3979 mdr->tracedn = mdr->dn[0].back();
3980 respond_to_request(mdr, 0);
3981 }
3982
3983 struct C_MDS_LookupIno2 : public ServerContext {
3984 MDRequestRef mdr;
3985 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3986 void finish(int r) override {
3987 server->_lookup_ino_2(mdr, r);
3988 }
3989 };
3990
3991 /*
3992 * filepath: ino
3993 */
3994 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3995 bool want_parent, bool want_dentry)
3996 {
3997 const cref_t<MClientRequest> &req = mdr->client_request;
3998
3999 if ((uint64_t)req->head.args.lookupino.snapid > 0)
4000 return _lookup_snap_ino(mdr);
4001
4002 inodeno_t ino = req->get_filepath().get_ino();
4003 auto _ino = ino.val;
4004
4005 /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
4006 * I do not have an explanation for how that happened organically but this
4007 * check will ensure that the client can no longer do that.
4008 *
4009 * [1] https://tracker.ceph.com/issues/49922
4010 */
4011 if (MDS_IS_PRIVATE_INO(_ino)) {
4012 respond_to_request(mdr, -CEPHFS_ESTALE);
4013 return;
4014 }
4015
4016 CInode *in = mdcache->get_inode(ino);
4017 if (in && in->state_test(CInode::STATE_PURGING)) {
4018 respond_to_request(mdr, -CEPHFS_ESTALE);
4019 return;
4020 }
4021 if (!in) {
4022 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
4023 return;
4024 }
4025
4026 // check for nothing (not read or write); this still applies the
4027 // path check.
4028 if (!check_access(mdr, in, 0))
4029 return;
4030
4031 CDentry *dn = in->get_projected_parent_dn();
4032 CInode *diri = dn ? dn->get_dir()->inode : NULL;
4033
4034 MutationImpl::LockOpVec lov;
4035 if (dn && (want_parent || want_dentry)) {
4036 mdr->pin(dn);
4037 lov.add_rdlock(&dn->lock);
4038 }
4039
4040 unsigned mask = req->head.args.lookupino.mask;
4041 if (mask) {
4042 Capability *cap = in->get_client_cap(mdr->get_client());
4043 int issued = 0;
4044 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4045 issued = cap->issued();
4046 // FIXME
4047 // permission bits, ACL/security xattrs
4048 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4049 lov.add_rdlock(&in->authlock);
4050 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4051 lov.add_rdlock(&in->xattrlock);
4052
4053 mdr->getattr_caps = mask;
4054 }
4055
4056 if (!lov.empty()) {
4057 if (!mds->locker->acquire_locks(mdr, lov))
4058 return;
4059
4060 if (diri != NULL) {
4061 // need read access to directory inode
4062 if (!check_access(mdr, diri, MAY_READ))
4063 return;
4064 }
4065 }
4066
4067 if (want_parent) {
4068 if (in->is_base()) {
4069 respond_to_request(mdr, -CEPHFS_EINVAL);
4070 return;
4071 }
4072 if (!diri || diri->is_stray()) {
4073 respond_to_request(mdr, -CEPHFS_ESTALE);
4074 return;
4075 }
4076 dout(10) << "reply to lookup_parent " << *in << dendl;
4077 mdr->tracei = diri;
4078 respond_to_request(mdr, 0);
4079 } else {
4080 if (want_dentry) {
4081 inodeno_t dirino = req->get_filepath2().get_ino();
4082 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
4083 respond_to_request(mdr, -CEPHFS_ENOENT);
4084 return;
4085 }
4086 dout(10) << "reply to lookup_name " << *in << dendl;
4087 } else
4088 dout(10) << "reply to lookup_ino " << *in << dendl;
4089
4090 mdr->tracei = in;
4091 if (want_dentry)
4092 mdr->tracedn = dn;
4093 respond_to_request(mdr, 0);
4094 }
4095 }
4096
4097 void Server::_lookup_snap_ino(MDRequestRef& mdr)
4098 {
4099 const cref_t<MClientRequest> &req = mdr->client_request;
4100
4101 vinodeno_t vino;
4102 vino.ino = req->get_filepath().get_ino();
4103 vino.snapid = (__u64)req->head.args.lookupino.snapid;
4104 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
4105 __u32 hash = req->head.args.lookupino.hash;
4106
4107 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
4108
4109 CInode *in = mdcache->lookup_snap_inode(vino);
4110 if (!in) {
4111 in = mdcache->get_inode(vino.ino);
4112 if (in) {
4113 if (in->state_test(CInode::STATE_PURGING) ||
4114 !in->has_snap_data(vino.snapid)) {
4115 if (in->is_dir() || !parent_ino) {
4116 respond_to_request(mdr, -CEPHFS_ESTALE);
4117 return;
4118 }
4119 in = NULL;
4120 }
4121 }
4122 }
4123
4124 if (in) {
4125 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
4126 mdr->snapid = vino.snapid;
4127 mdr->tracei = in;
4128 respond_to_request(mdr, 0);
4129 return;
4130 }
4131
4132 CInode *diri = NULL;
4133 if (parent_ino) {
4134 diri = mdcache->get_inode(parent_ino);
4135 if (!diri) {
4136 mdcache->open_ino(parent_ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
4137 return;
4138 }
4139
4140 if (!diri->is_dir()) {
4141 respond_to_request(mdr, -CEPHFS_EINVAL);
4142 return;
4143 }
4144
4145 MutationImpl::LockOpVec lov;
4146 lov.add_rdlock(&diri->dirfragtreelock);
4147 if (!mds->locker->acquire_locks(mdr, lov))
4148 return;
4149
4150 frag_t frag = diri->dirfragtree[hash];
4151 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
4152 if (!dir)
4153 return;
4154
4155 if (!dir->is_complete()) {
4156 if (dir->is_frozen()) {
4157 mds->locker->drop_locks(mdr.get());
4158 mdr->drop_local_auth_pins();
4159 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4160 return;
4161 }
4162 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4163 return;
4164 }
4165
4166 respond_to_request(mdr, -CEPHFS_ESTALE);
4167 } else {
4168 mdcache->open_ino(vino.ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
4169 }
4170 }
4171
4172 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4173 {
4174 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4175 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4176
4177 // `r` is a rank if >=0, else an error code
4178 if (r >= 0) {
4179 mds_rank_t dest_rank(r);
4180 if (dest_rank == mds->get_nodeid())
4181 dispatch_client_request(mdr);
4182 else
4183 mdcache->request_forward(mdr, dest_rank);
4184 return;
4185 }
4186
4187 // give up
4188 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
4189 r = -CEPHFS_ESTALE;
4190 respond_to_request(mdr, r);
4191 }
4192
4193
4194 /* This function takes responsibility for the passed mdr*/
4195 void Server::handle_client_open(MDRequestRef& mdr)
4196 {
4197 const cref_t<MClientRequest> &req = mdr->client_request;
4198 dout(7) << "open on " << req->get_filepath() << dendl;
4199
4200 int flags = req->head.args.open.flags;
4201 int cmode = ceph_flags_to_mode(flags);
4202 if (cmode < 0) {
4203 respond_to_request(mdr, -CEPHFS_EINVAL);
4204 return;
4205 }
4206
4207 bool need_auth = !file_mode_is_readonly(cmode) ||
4208 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
4209
4210 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4211 dout(7) << "read-only FS" << dendl;
4212 respond_to_request(mdr, -CEPHFS_EROFS);
4213 return;
4214 }
4215
4216 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
4217 if (!cur)
4218 return;
4219
4220 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
4221 ceph_assert(!need_auth);
4222 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4223 CInode *cur = rdlock_path_pin_ref(mdr, true);
4224 if (!cur)
4225 return;
4226 }
4227
4228 if (!cur->is_file()) {
4229 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4230 cmode = CEPH_FILE_MODE_PIN;
4231 // the inode is a symlink and the client wants to follow it; ignore the O_TRUNC flag.
4232 if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
4233 flags &= ~CEPH_O_TRUNC;
4234 }
4235
4236 dout(10) << "open flags = " << flags
4237 << ", filemode = " << cmode
4238 << ", need_auth = " << need_auth
4239 << dendl;
4240
4241 // regular file?
4242 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4243 dout(7) << "not a file or dir " << *cur << dendl;
4244 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
4245 return;
4246 }*/
4247 if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
4248 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
4249 respond_to_request(mdr, -CEPHFS_EINVAL);
4250 return;
4251 }
4252
4253 if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
4254 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
4255 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4256 respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
4257 return;
4258 }
4259
4260 if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
4261 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4262 dout(7) << "old client cannot open inline data file " << *cur << dendl;
4263 respond_to_request(mdr, -CEPHFS_EPERM);
4264 return;
4265 }
4266
4267 // snapped data is read only
4268 if (mdr->snapid != CEPH_NOSNAP &&
4269 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4270 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
4271 respond_to_request(mdr, -CEPHFS_EROFS);
4272 return;
4273 }
4274
4275 MutationImpl::LockOpVec lov;
4276
4277 unsigned mask = req->head.args.open.mask;
4278 if (mask) {
4279 Capability *cap = cur->get_client_cap(mdr->get_client());
4280 int issued = 0;
4281 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4282 issued = cap->issued();
4283 // permission bits, ACL/security xattrs
4284 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4285 lov.add_rdlock(&cur->authlock);
4286 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4287 lov.add_rdlock(&cur->xattrlock);
4288
4289 mdr->getattr_caps = mask;
4290 }
4291
4292 // O_TRUNC
4293 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
4294 ceph_assert(cur->is_auth());
4295
4296 lov.add_xlock(&cur->filelock);
4297 if (!mds->locker->acquire_locks(mdr, lov))
4298 return;
4299
4300 if (!check_access(mdr, cur, MAY_WRITE))
4301 return;
4302
4303 // wait for pending truncate?
4304 const auto& pi = cur->get_projected_inode();
4305 if (pi->is_truncating()) {
4306 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4307 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4308 mds->locker->drop_locks(mdr.get());
4309 mdr->drop_local_auth_pins();
4310 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4311 return;
4312 }
4313
4314 do_open_truncate(mdr, cmode);
4315 return;
4316 }
4317
4318 // sync filelock if snapped.
4319 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4320 // and that data itself is flushed so that we can read the snapped data off disk.
4321 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
4322 lov.add_rdlock(&cur->filelock);
4323 }
4324
4325 if (!mds->locker->acquire_locks(mdr, lov))
4326 return;
4327
4328 mask = MAY_READ;
4329 if (cmode & CEPH_FILE_MODE_WR)
4330 mask |= MAY_WRITE;
4331 if (!check_access(mdr, cur, mask))
4332 return;
4333
4334 utime_t now = ceph_clock_now();
4335 mdr->set_mds_stamp(now);
4336
4337 if (cur->is_file() || cur->is_dir()) {
4338 if (mdr->snapid == CEPH_NOSNAP) {
4339 // register new cap
4340 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
4341 if (cap)
4342 dout(12) << "open issued caps " << ccap_string(cap->pending())
4343 << " for " << req->get_source()
4344 << " on " << *cur << dendl;
4345 } else {
4346 int caps = ceph_caps_for_mode(cmode);
4347 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4348 << " for " << req->get_source()
4349 << " snapid " << mdr->snapid
4350 << " on " << *cur << dendl;
4351 mdr->snap_caps = caps;
4352 }
4353 }
4354
4355 // increase max_size?
4356 if (cmode & CEPH_FILE_MODE_WR)
4357 mds->locker->check_inode_max_size(cur);
4358
4359 // make sure this inode gets into the journal
4360 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
4361 mdcache->open_file_table.should_log_open(cur)) {
4362 EOpen *le = new EOpen(mds->mdlog);
4363 mdlog->start_entry(le);
4364 le->add_clean_inode(cur);
4365 mdlog->submit_entry(le);
4366 }
4367
4368 // hit pop
4369 if (cmode & CEPH_FILE_MODE_WR)
4370 mds->balancer->hit_inode(cur, META_POP_IWR);
4371 else
4372 mds->balancer->hit_inode(cur, META_POP_IRD,
4373 mdr->client_request->get_source().num());
4374
4375 CDentry *dn = 0;
4376 if (req->get_dentry_wanted()) {
4377 ceph_assert(mdr->dn[0].size());
4378 dn = mdr->dn[0].back();
4379 }
4380
4381 mdr->tracei = cur;
4382 mdr->tracedn = dn;
4383 respond_to_request(mdr, 0);
4384 }
4385
4386 class C_MDS_openc_finish : public ServerLogContext {
4387 CDentry *dn;
4388 CInode *newi;
4389 public:
4390 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4391 ServerLogContext(s, r), dn(d), newi(ni) {}
4392 void finish(int r) override {
4393 ceph_assert(r == 0);
4394
4395 dn->pop_projected_linkage();
4396
4397 // dirty inode, dn, dir
4398 newi->mark_dirty(mdr->ls);
4399 newi->mark_dirty_parent(mdr->ls, true);
4400
4401 mdr->apply();
4402
4403 get_mds()->locker->share_inode_max_size(newi);
4404
4405 MDRequestRef null_ref;
4406 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4407
4408 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4409
4410 server->respond_to_request(mdr, 0);
4411
4412 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4413 }
4414 };
4415
4416 /* This function takes responsibility for the passed mdr*/
4417 void Server::handle_client_openc(MDRequestRef& mdr)
4418 {
4419 const cref_t<MClientRequest> &req = mdr->client_request;
4420 client_t client = mdr->get_client();
4421
4422 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4423
4424 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4425 if (cmode < 0) {
4426 respond_to_request(mdr, -CEPHFS_EINVAL);
4427 return;
4428 }
4429
4430 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4431 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
4432 if (!dn)
4433 return;
4434
4435 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4436 if (!excl && !dnl->is_null()) {
4437 // it existed.
4438 mds->locker->xlock_downgrade(&dn->lock, mdr.get());
4439
4440 MutationImpl::LockOpVec lov;
4441 lov.add_rdlock(&dnl->get_inode()->snaplock);
4442 if (!mds->locker->acquire_locks(mdr, lov))
4443 return;
4444
4445 handle_client_open(mdr);
4446 return;
4447 }
4448
4449 ceph_assert(dnl->is_null());
4450
4451 if (req->get_alternate_name().size() > alternate_name_max) {
4452 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
4453 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
4454 return;
4455 }
4456 dn->set_alternate_name(req->get_alternate_name());
4457
4458 // set layout
4459 file_layout_t layout;
4460 if (mdr->dir_layout != file_layout_t())
4461 layout = mdr->dir_layout;
4462 else
4463 layout = mdcache->default_file_layout;
4464
4465 // What kind of client caps are required to complete this operation
4466 uint64_t access = MAY_WRITE;
4467
4468 const auto default_layout = layout;
4469
4470 // fill in any special params from client
4471 if (req->head.args.open.stripe_unit)
4472 layout.stripe_unit = req->head.args.open.stripe_unit;
4473 if (req->head.args.open.stripe_count)
4474 layout.stripe_count = req->head.args.open.stripe_count;
4475 if (req->head.args.open.object_size)
4476 layout.object_size = req->head.args.open.object_size;
4477 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4478 (__s32)req->head.args.open.pool >= 0) {
4479 layout.pool_id = req->head.args.open.pool;
4480
4481 // make sure we have as new a map as the client
4482 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4483 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4484 return;
4485 }
4486 }
4487
4488 // If client doesn't have capability to modify layout pools, then
4489 // only permit this request if the requested pool matches what the
4490 // file would have inherited anyway from its parent.
4491 if (default_layout != layout) {
4492 access |= MAY_SET_VXATTR;
4493 }
4494
4495 if (!layout.is_valid()) {
4496 dout(10) << " invalid initial file layout" << dendl;
4497 respond_to_request(mdr, -CEPHFS_EINVAL);
4498 return;
4499 }
4500 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4501 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4502 respond_to_request(mdr, -CEPHFS_EINVAL);
4503 return;
4504 }
4505
4506 // created null dn.
4507 CDir *dir = dn->get_dir();
4508 CInode *diri = dir->get_inode();
4509 if (!check_access(mdr, diri, access))
4510 return;
4511 if (!check_fragment_space(mdr, dir))
4512 return;
4513 if (!check_dir_max_entries(mdr, dir))
4514 return;
4515
4516 if (mdr->dn[0].size() == 1)
4517 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
4518
4519 // create inode.
4520 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4521 req->head.args.open.mode | S_IFREG, &layout);
4522 ceph_assert(newi);
4523
4524 // it's a file.
4525 dn->push_projected_linkage(newi);
4526
4527 auto _inode = newi->_get_inode();
4528 _inode->version = dn->pre_dirty();
4529 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4530 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
4531 _inode->update_backtrace();
4532 _inode->rstat.rfiles = 1;
4533 _inode->accounted_rstat = _inode->rstat;
4534
4535 SnapRealm *realm = diri->find_snaprealm();
4536 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4537 ceph_assert(follows >= realm->get_newest_seq());
4538
4539 ceph_assert(dn->first == follows+1);
4540 newi->first = dn->first;
4541
4542 // do the open
4543 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
4544 newi->authlock.set_state(LOCK_EXCL);
4545 newi->xattrlock.set_state(LOCK_EXCL);
4546
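// For a writable open, initialize the client's writable range (its initial
// max_size) to one stripe unit and mark both the inode and the cap as
// client-writeable.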
4547 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4548 _inode->client_ranges[client].range.first = 0;
4549 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
4550 _inode->client_ranges[client].follows = follows;
4551 newi->mark_clientwriteable();
4552 cap->mark_clientwriteable();
4553 }
4554
4555 // prepare finisher
4556 mdr->ls = mdlog->get_current_segment();
4557 EUpdate *le = new EUpdate(mdlog, "openc");
4558 mdlog->start_entry(le);
4559 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4560 journal_allocated_inos(mdr, &le->metablob);
4561 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4562 le->metablob.add_primary_dentry(dn, newi, true, true, true);
4563
4564 // make sure this inode gets into the journal
4565 le->metablob.add_opened_ino(newi->ino());
4566
4567 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
4568
4569 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4570 openc_response_t ocresp;
4571
4572 dout(10) << "adding created_ino and delegated_inos" << dendl;
4573 ocresp.created_ino = _inode->ino;
4574
4575 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4576 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4577 unsigned frac = 100 / delegate_inos_pct;
4578 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4579 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4580 }
4581
4582 encode(ocresp, mdr->reply_extra_bl);
4583 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4584 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4585 // add the file created flag onto the reply if create_flags features is supported
4586 encode(newi->ino(), mdr->reply_extra_bl);
4587 }
4588
4589 journal_and_reply(mdr, newi, dn, le, fin);
4590
4591 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4592 // have overshot the split size (multiple opencs in flight), so here is
4593 // an early chance to split the dir if this openc makes it oversized.
4594 mds->balancer->maybe_fragment(dir, false);
4595 }
4596
4597
4598
4599 void Server::handle_client_readdir(MDRequestRef& mdr)
4600 {
4601 const cref_t<MClientRequest> &req = mdr->client_request;
4602 Session *session = mds->get_session(req);
4603 client_t client = req->get_source().num();
4604 MutationImpl::LockOpVec lov;
4605 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
4606 if (!diri) return;
4607
4608 // it's a directory, right?
4609 if (!diri->is_dir()) {
4610 // not a dir
4611 dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
4612 respond_to_request(mdr, -CEPHFS_ENOTDIR);
4613 return;
4614 }
4615
4616 auto num_caps = session->get_num_caps();
4617 auto session_cap_acquisition = session->get_cap_acquisition();
4618
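// Throttle readdir when this session already holds close to max_caps_per_client
// caps and has been acquiring caps quickly; retry the request after a delay
// instead of issuing even more caps right now.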
4619 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
4620 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
4621 << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
4622 if (logger)
4623 logger->inc(l_mdss_cap_acquisition_throttle);
4624
4625 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
4626 return;
4627 }
4628
4629 lov.add_rdlock(&diri->filelock);
4630 lov.add_rdlock(&diri->dirfragtreelock);
4631
4632 if (!mds->locker->acquire_locks(mdr, lov))
4633 return;
4634
4635 if (!check_access(mdr, diri, MAY_READ))
4636 return;
4637
4638 // which frag?
4639 frag_t fg = (__u32)req->head.args.readdir.frag;
4640 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4641 string offset_str = req->get_path2();
4642
4643 __u32 offset_hash = 0;
4644 if (!offset_str.empty())
4645 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4646 else
4647 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4648
4649 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4650 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4651
4652 // does the frag exist?
4653 if (diri->dirfragtree[fg.value()] != fg) {
4654 frag_t newfg;
4655 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4656 if (fg.contains((unsigned)offset_hash)) {
4657 newfg = diri->dirfragtree[offset_hash];
4658 } else {
4659 // client actually wants next frag
4660 newfg = diri->dirfragtree[fg.value()];
4661 }
4662 } else {
4663 offset_str.clear();
4664 newfg = diri->dirfragtree[fg.value()];
4665 }
4666 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4667 fg = newfg;
4668 }
4669
4670 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4671 if (!dir) return;
4672
4673 // ok!
4674 dout(10) << "handle_client_readdir on " << *dir << dendl;
4675 ceph_assert(dir->is_auth());
4676
4677 if (!dir->is_complete()) {
4678 if (dir->is_frozen()) {
4679 dout(7) << "dir is frozen " << *dir << dendl;
4680 mds->locker->drop_locks(mdr.get());
4681 mdr->drop_local_auth_pins();
4682 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4683 return;
4684 }
4685 // fetch
4686 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4687 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4688 return;
4689 }
4690
4691 #ifdef MDS_VERIFY_FRAGSTAT
4692 dir->verify_fragstat();
4693 #endif
4694
4695 utime_t now = ceph_clock_now();
4696 mdr->set_mds_stamp(now);
4697
4698 snapid_t snapid = mdr->snapid;
4699 dout(10) << "snapid " << snapid << dendl;
4700
4701 SnapRealm *realm = diri->find_snaprealm();
4702
4703 unsigned max = req->head.args.readdir.max_entries;
4704 if (!max)
4705 max = dir->get_num_any(); // whatever, something big.
4706 unsigned max_bytes = req->head.args.readdir.max_bytes;
4707 if (!max_bytes)
4708 // make sure at least one item can be encoded
4709 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4710
4711 // start final blob
4712 bufferlist dirbl;
4713 DirStat ds;
4714 ds.frag = dir->get_frag();
4715 ds.auth = dir->get_dir_auth().first;
4716 if (dir->is_auth() && !forward_all_requests_to_auth)
4717 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4718
4719 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4720
4721 // count bytes available.
4722 // this isn't perfect, but we should capture the main variable/unbounded size items!
4723 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4724 int bytes_left = max_bytes - front_bytes;
4725 bytes_left -= realm->get_snap_trace().length();
4726
4727 // build dir contents
4728 bufferlist dnbl;
4729 __u32 numfiles = 0;
4730 bool start = !offset_hash && offset_str.empty();
4731 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4732 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4733 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4734 bool end = (it == dir->end());
4735 for (; !end && numfiles < max; end = (it == dir->end())) {
4736 CDentry *dn = it->second;
4737 ++it;
4738
4739 if (dn->state_test(CDentry::STATE_PURGING))
4740 continue;
4741
4742 bool dnp = dn->use_projected(client, mdr);
4743 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4744
4745 if (dnl->is_null())
4746 continue;
4747
4748 if (dn->last < snapid || dn->first > snapid) {
4749 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4750 continue;
4751 }
4752
4753 if (!start) {
4754 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4755 if (!(offset_key < dn->key()))
4756 continue;
4757 }
4758
4759 CInode *in = dnl->get_inode();
4760
4761 if (in && in->ino() == CEPH_INO_CEPH)
4762 continue;
4763
4764 // remote link?
4765 // better for the MDS to do the work, if we think the client will stat any of these files.
4766 if (dnl->is_remote() && !in) {
4767 in = mdcache->get_inode(dnl->get_remote_ino());
4768 if (in) {
4769 dn->link_remote(dnl, in);
4770 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4771 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4772 continue;
4773 } else {
4774 // touch everything i _do_ have
4775 for (auto &p : *dir) {
4776 if (!p.second->get_linkage()->is_null())
4777 mdcache->lru.lru_touch(p.second);
4778 }
4779
4780 // already issued caps and leases, reply immediately.
4781 if (dnbl.length() > 0) {
4782 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4783 dout(10) << " open remote dentry after caps were issued, stopping at "
4784 << dnbl.length() << " < " << bytes_left << dendl;
4785 break;
4786 }
4787
4788 mds->locker->drop_locks(mdr.get());
4789 mdr->drop_local_auth_pins();
4790 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4791 return;
4792 }
4793 }
4794 ceph_assert(in);
4795
4796 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4797 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4798 break;
4799 }
4800
4801 unsigned start_len = dnbl.length();
4802
4803 // dentry
4804 dout(12) << "including dn " << *dn << dendl;
4805 encode(dn->get_name(), dnbl);
4806 mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
4807
4808 // inode
4809 dout(12) << "including inode " << *in << dendl;
4810 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4811 if (r < 0) {
4812 // chop off dn->name, lease
4813 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4814 bufferlist keep;
4815 keep.substr_of(dnbl, 0, start_len);
4816 dnbl.swap(keep);
4817 break;
4818 }
4819 ceph_assert(r >= 0);
4820 numfiles++;
4821
4822 // touch dn
4823 mdcache->lru.lru_touch(dn);
4824 }
4825
4826 session->touch_readdir_cap(numfiles);
4827
4828 __u16 flags = 0;
4829 if (end) {
4830 flags = CEPH_READDIR_FRAG_END;
4831 if (start)
4832 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4833 }
4834 // clients only understand the END and COMPLETE flags?
4835 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4836 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4837 }
4838
4839 // finish final blob
4840 encode(numfiles, dirbl);
4841 encode(flags, dirbl);
4842 dirbl.claim_append(dnbl);
4843
4844 // yay, reply
4845 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4846 << " bytes=" << dirbl.length()
4847 << " start=" << (int)start
4848 << " end=" << (int)end
4849 << dendl;
4850 mdr->reply_extra_bl = dirbl;
4851
4852 // bump popularity. NOTE: this doesn't quite capture it.
4853 mds->balancer->hit_dir(dir, META_POP_READDIR, -1, numfiles);
4854
4855 // reply
4856 mdr->tracei = diri;
4857 respond_to_request(mdr, 0);
4858 }
4859
4860
4861
4862 // ===============================================================================
4863 // INODE UPDATES
4864
4865
4866 /*
4867 * finisher for basic inode updates
4868 */
4869 class C_MDS_inode_update_finish : public ServerLogContext {
4870 CInode *in;
4871 bool truncating_smaller, changed_ranges, adjust_realm;
4872 public:
4873 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
4874 bool sm=false, bool cr=false, bool ar=false) :
4875 ServerLogContext(s, r), in(i),
4876 truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
4877 void finish(int r) override {
4878 ceph_assert(r == 0);
4879
4880 int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);
4881
4882 // apply
4883 mdr->apply();
4884
4885 MDSRank *mds = get_mds();
4886
4887 // notify any clients
4888 if (truncating_smaller && in->get_inode()->is_truncating()) {
4889 mds->locker->issue_truncate(in);
4890 mds->mdcache->truncate_inode(in, mdr->ls);
4891 }
4892
4893 if (adjust_realm) {
4894 mds->mdcache->send_snap_update(in, 0, snap_op);
4895 mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
4896 }
4897
4898 get_mds()->balancer->hit_inode(in, META_POP_IWR);
4899
4900 server->respond_to_request(mdr, 0);
4901
4902 if (changed_ranges)
4903 get_mds()->locker->share_inode_max_size(in);
4904 }
4905 };
4906
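 // handle_client_file_setlock: apply an advisory (flock/fcntl) lock change.
 // Illustrative summary of the flow below: xlock the inode's flocklock, build
 // a ceph_filelock from the request, then either remove the lock (UNLOCK) or
 // try add_lock(); if that fails and the client asked to wait, the request is
 // parked on WAIT_FLOCK and retried once the lock state changes.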
4907 void Server::handle_client_file_setlock(MDRequestRef& mdr)
4908 {
4909 const cref_t<MClientRequest> &req = mdr->client_request;
4910 MutationImpl::LockOpVec lov;
4911
4912 // get the inode to operate on, and set up any locks needed for that
4913 CInode *cur = rdlock_path_pin_ref(mdr, true);
4914 if (!cur)
4915 return;
4916
4917 lov.add_xlock(&cur->flocklock);
4918 /* acquire_locks will return true if it gets the locks. If it fails,
4919 it will redeliver this request at a later date, so drop the request.
4920 */
4921 if (!mds->locker->acquire_locks(mdr, lov)) {
4922 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4923 return;
4924 }
4925
4926 // copy the lock change into a ceph_filelock so we can store/apply it
4927 ceph_filelock set_lock;
4928 set_lock.start = req->head.args.filelock_change.start;
4929 set_lock.length = req->head.args.filelock_change.length;
4930 set_lock.client = req->get_orig_source().num();
4931 set_lock.owner = req->head.args.filelock_change.owner;
4932 set_lock.pid = req->head.args.filelock_change.pid;
4933 set_lock.type = req->head.args.filelock_change.type;
4934 bool will_wait = req->head.args.filelock_change.wait;
4935
4936 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4937
4938 ceph_lock_state_t *lock_state = NULL;
4939 bool interrupt = false;
4940
4941 // get the appropriate lock state
4942 switch (req->head.args.filelock_change.rule) {
4943 case CEPH_LOCK_FLOCK_INTR:
4944 interrupt = true;
4945 // fall-thru
4946 case CEPH_LOCK_FLOCK:
4947 lock_state = cur->get_flock_lock_state();
4948 break;
4949
4950 case CEPH_LOCK_FCNTL_INTR:
4951 interrupt = true;
4952 // fall-thru
4953 case CEPH_LOCK_FCNTL:
4954 lock_state = cur->get_fcntl_lock_state();
4955 break;
4956
4957 default:
4958 dout(10) << "got unknown lock type " << set_lock.type
4959 << ", dropping request!" << dendl;
4960 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
4961 return;
4962 }
4963
4964 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4965 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4966 list<ceph_filelock> activated_locks;
4967 MDSContext::vec waiters;
4968 if (lock_state->is_waiting(set_lock)) {
4969 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4970 lock_state->remove_waiting(set_lock);
4971 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4972 } else if (!interrupt) {
4973 dout(10) << " unlock attempt on " << set_lock << dendl;
4974 lock_state->remove_lock(set_lock, activated_locks);
4975 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4976 }
4977 mds->queue_waiters(waiters);
4978
4979 respond_to_request(mdr, 0);
4980 } else {
4981 dout(10) << " lock attempt on " << set_lock << dendl;
4982 bool deadlock = false;
4983 if (mdr->more()->flock_was_waiting &&
4984 !lock_state->is_waiting(set_lock)) {
4985 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
4986 respond_to_request(mdr, -CEPHFS_EINTR);
4987 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4988 dout(10) << " it failed on this attempt" << dendl;
4989 // couldn't set lock right now
4990 if (deadlock) {
4991 respond_to_request(mdr, -CEPHFS_EDEADLK);
4992 } else if (!will_wait) {
4993 respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
4994 } else {
4995 dout(10) << " added to waiting list" << dendl;
4996 ceph_assert(lock_state->is_waiting(set_lock));
4997 mdr->more()->flock_was_waiting = true;
4998 mds->locker->drop_locks(mdr.get());
4999 mdr->drop_local_auth_pins();
5000 mdr->mark_event("failed to add lock, waiting");
5001 mdr->mark_nowarn();
5002 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
5003 }
5004 } else
5005 respond_to_request(mdr, 0);
5006 }
5007 dout(10) << " state after lock change: " << *lock_state << dendl;
5008 }
5009
5010 void Server::handle_client_file_readlock(MDRequestRef& mdr)
5011 {
5012 const cref_t<MClientRequest> &req = mdr->client_request;
5013 MutationImpl::LockOpVec lov;
5014
5015 // get the inode to operate on, and set up any locks needed for that
5016 CInode *cur = rdlock_path_pin_ref(mdr, true);
5017 if (!cur)
5018 return;
5019
5020 /* acquire_locks will return true if it gets the locks. If it fails,
5021 it will redeliver this request at a later date, so drop the request.
5022 */
5023 lov.add_rdlock(&cur->flocklock);
5024 if (!mds->locker->acquire_locks(mdr, lov)) {
5025 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
5026 return;
5027 }
5028
5029 // copy the lock change into a ceph_filelock so we can store/apply it
5030 ceph_filelock checking_lock;
5031 checking_lock.start = req->head.args.filelock_change.start;
5032 checking_lock.length = req->head.args.filelock_change.length;
5033 checking_lock.client = req->get_orig_source().num();
5034 checking_lock.owner = req->head.args.filelock_change.owner;
5035 checking_lock.pid = req->head.args.filelock_change.pid;
5036 checking_lock.type = req->head.args.filelock_change.type;
5037
5038 // get the appropriate lock state
5039 ceph_lock_state_t *lock_state = NULL;
5040 switch (req->head.args.filelock_change.rule) {
5041 case CEPH_LOCK_FLOCK:
5042 lock_state = cur->get_flock_lock_state();
5043 break;
5044
5045 case CEPH_LOCK_FCNTL:
5046 lock_state = cur->get_fcntl_lock_state();
5047 break;
5048
5049 default:
5050 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
5051 respond_to_request(mdr, -CEPHFS_EINVAL);
5052 return;
5053 }
5054 lock_state->look_for_lock(checking_lock);
5055
5056 bufferlist lock_bl;
5057 encode(checking_lock, lock_bl);
5058
5059 mdr->reply_extra_bl = lock_bl;
5060 respond_to_request(mdr, 0);
5061 }
5062
5063 void Server::handle_client_setattr(MDRequestRef& mdr)
5064 {
5065 const cref_t<MClientRequest> &req = mdr->client_request;
5066 MutationImpl::LockOpVec lov;
5067 CInode *cur = rdlock_path_pin_ref(mdr, true);
5068 if (!cur) return;
5069
5070 if (mdr->snapid != CEPH_NOSNAP) {
5071 respond_to_request(mdr, -CEPHFS_EROFS);
5072 return;
5073 }
5074 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
5075 respond_to_request(mdr, -CEPHFS_EPERM);
5076 return;
5077 }
5078
5079 __u32 mask = req->head.args.setattr.mask;
5080 __u32 access_mask = MAY_WRITE;
5081
5082 // xlock inode
5083 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
5084 lov.add_xlock(&cur->authlock);
5085 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
5086 lov.add_xlock(&cur->filelock);
5087 if (mask & CEPH_SETATTR_CTIME)
5088 lov.add_wrlock(&cur->versionlock);
5089
5090 if (!mds->locker->acquire_locks(mdr, lov))
5091 return;
5092
5093 if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
5094 access_mask |= MAY_CHOWN;
5095
5096 if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
5097 access_mask |= MAY_CHGRP;
5098
5099 if (!check_access(mdr, cur, access_mask))
5100 return;
5101
5102 // trunc from bigger -> smaller?
5103 const auto& pip = cur->get_projected_inode();
5104
5105 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
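   // Illustrative example: if the projected size is 4 MiB but the client's
   // request carries old_size = 8 MiB, we treat 8 MiB as the pre-truncate
   // size, so a setattr to 1 MiB below is handled as a shrink from 8 MiB.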
5106
5107 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
5108 if (is_full && req->head.args.setattr.size > old_size) {
5109 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
5110 respond_to_request(mdr, -CEPHFS_ENOSPC);
5111 return;
5112 }
5113
5114 bool truncating_smaller = false;
5115 if (mask & CEPH_SETATTR_SIZE) {
5116 truncating_smaller = req->head.args.setattr.size < old_size;
5117 if (truncating_smaller && pip->is_truncating()) {
5118 dout(10) << " waiting for pending truncate from " << pip->truncate_from
5119 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
5120 mds->locker->drop_locks(mdr.get());
5121 mdr->drop_local_auth_pins();
5122 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
5123 return;
5124 }
5125 }
5126
5127 bool changed_ranges = false;
5128
5129 // project update
5130 mdr->ls = mdlog->get_current_segment();
5131 EUpdate *le = new EUpdate(mdlog, "setattr");
5132 mdlog->start_entry(le);
5133
5134 auto pi = cur->project_inode(mdr);
5135
5136 if (mask & CEPH_SETATTR_UID)
5137 pi.inode->uid = req->head.args.setattr.uid;
5138 if (mask & CEPH_SETATTR_GID)
5139 pi.inode->gid = req->head.args.setattr.gid;
5140
5141 if (mask & CEPH_SETATTR_MODE)
5142 pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
5143 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
5144 S_ISREG(pi.inode->mode) &&
5145 (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
5146 pi.inode->mode &= ~(S_ISUID|S_ISGID);
5147 }
5148
5149 if (mask & CEPH_SETATTR_MTIME)
5150 pi.inode->mtime = req->head.args.setattr.mtime;
5151 if (mask & CEPH_SETATTR_ATIME)
5152 pi.inode->atime = req->head.args.setattr.atime;
5153 if (mask & CEPH_SETATTR_BTIME)
5154 pi.inode->btime = req->head.args.setattr.btime;
5155 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
5156 pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
5157 if (mask & CEPH_SETATTR_SIZE) {
5158 if (truncating_smaller) {
5159 pi.inode->truncate(old_size, req->head.args.setattr.size);
5160 le->metablob.add_truncate_start(cur->ino());
5161 } else {
5162 pi.inode->size = req->head.args.setattr.size;
5163 pi.inode->rstat.rbytes = pi.inode->size;
5164 }
5165 pi.inode->mtime = mdr->get_op_stamp();
5166
5167 // adjust client's max_size?
5168 if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
5169 dout(10) << " client_ranges " << cur->get_previous_projected_inode()->client_ranges
5170 << " -> " << pi.inode->client_ranges << dendl;
5171 changed_ranges = true;
5172 }
5173 }
5174
5175 pi.inode->version = cur->pre_dirty();
5176 pi.inode->ctime = mdr->get_op_stamp();
5177 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5178 pi.inode->rstat.rctime = mdr->get_op_stamp();
5179 pi.inode->change_attr++;
5180
5181 // log + wait
5182 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5183 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5184 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5185
5186 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5187 truncating_smaller, changed_ranges));
5188
5189 // flush immediately if there are readers/writers waiting
5190 if (mdr->is_xlocked(&cur->filelock) &&
5191 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5192 mds->mdlog->flush();
5193 }
5194
5195 /* Takes responsibility for mdr */
5196 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5197 {
5198 CInode *in = mdr->in[0];
5199 client_t client = mdr->get_client();
5200 ceph_assert(in);
5201
5202 dout(10) << "do_open_truncate " << *in << dendl;
5203
5204 SnapRealm *realm = in->find_snaprealm();
5205 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
5206
5207 mdr->ls = mdlog->get_current_segment();
5208 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5209 mdlog->start_entry(le);
5210
5211 // prepare
5212 auto pi = in->project_inode(mdr);
5213 pi.inode->version = in->pre_dirty();
5214 pi.inode->mtime = pi.inode->ctime = mdr->get_op_stamp();
5215 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5216 pi.inode->rstat.rctime = mdr->get_op_stamp();
5217 pi.inode->change_attr++;
5218
5219 uint64_t old_size = std::max<uint64_t>(pi.inode->size, mdr->client_request->head.args.open.old_size);
5220 if (old_size > 0) {
5221 pi.inode->truncate(old_size, 0);
5222 le->metablob.add_truncate_start(in->ino());
5223 }
5224
5225 bool changed_ranges = false;
5226 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
5227 pi.inode->client_ranges[client].range.first = 0;
5228 pi.inode->client_ranges[client].range.last = pi.inode->get_layout_size_increment();
5229 pi.inode->client_ranges[client].follows = realm->get_newest_seq();
5230 changed_ranges = true;
5231 in->mark_clientwriteable();
5232 cap->mark_clientwriteable();
5233 }
5234
5235 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5236
5237 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5238 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5239
5240 // make sure ino gets into the journal
5241 le->metablob.add_opened_ino(in->ino());
5242
5243 mdr->o_trunc = true;
5244
5245 CDentry *dn = 0;
5246 if (mdr->client_request->get_dentry_wanted()) {
5247 ceph_assert(mdr->dn[0].size());
5248 dn = mdr->dn[0].back();
5249 }
5250
5251 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5252 changed_ranges));
5253 // Although the `open` part can give an early reply, the truncation won't
5254 // happen until our EUpdate is persistent. To give the client a prompt
5255 // response we must also flush that event.
5256 mdlog->flush();
5257 }
5258
5259
5260 /* This function cleans up the passed mdr */
5261 void Server::handle_client_setlayout(MDRequestRef& mdr)
5262 {
5263 const cref_t<MClientRequest> &req = mdr->client_request;
5264 CInode *cur = rdlock_path_pin_ref(mdr, true);
5265 if (!cur) return;
5266
5267 if (mdr->snapid != CEPH_NOSNAP) {
5268 respond_to_request(mdr, -CEPHFS_EROFS);
5269 return;
5270 }
5271 if (!cur->is_file()) {
5272 respond_to_request(mdr, -CEPHFS_EINVAL);
5273 return;
5274 }
5275 if (cur->get_projected_inode()->size ||
5276 cur->get_projected_inode()->truncate_seq > 1) {
5277 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5278 return;
5279 }
5280
5281 // validate layout
5282 file_layout_t layout = cur->get_projected_inode()->layout;
5283 // save existing layout for later
5284 const auto old_layout = layout;
5285
5286 int access = MAY_WRITE;
5287
5288 if (req->head.args.setlayout.layout.fl_object_size > 0)
5289 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5290 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5291 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5292 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5293 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5294 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5295 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5296
5297 // make sure we have as new a map as the client
5298 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5299 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5300 return;
5301 }
5302 }
5303
5304 // Don't permit layout modifications without 'p' caps
5305 if (layout != old_layout) {
5306 access |= MAY_SET_VXATTR;
5307 }
5308
5309 if (!layout.is_valid()) {
5310 dout(10) << "bad layout" << dendl;
5311 respond_to_request(mdr, -CEPHFS_EINVAL);
5312 return;
5313 }
5314 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5315 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5316 respond_to_request(mdr, -CEPHFS_EINVAL);
5317 return;
5318 }
5319
5320 MutationImpl::LockOpVec lov;
5321 lov.add_xlock(&cur->filelock);
5322 if (!mds->locker->acquire_locks(mdr, lov))
5323 return;
5324
5325 if (!check_access(mdr, cur, access))
5326 return;
5327
5328 // project update
5329 auto pi = cur->project_inode(mdr);
5330 pi.inode->layout = layout;
5331 // add the old pool to the inode
5332 pi.inode->add_old_pool(old_layout.pool_id);
5333 pi.inode->version = cur->pre_dirty();
5334 pi.inode->ctime = mdr->get_op_stamp();
5335 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5336 pi.inode->rstat.rctime = mdr->get_op_stamp();
5337 pi.inode->change_attr++;
5338
5339 // log + wait
5340 mdr->ls = mdlog->get_current_segment();
5341 EUpdate *le = new EUpdate(mdlog, "setlayout");
5342 mdlog->start_entry(le);
5343 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5344 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5345 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5346
5347 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5348 }
5349
5350 bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
5351 {
5352 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5353 return true;
5354
5355 MutationImpl::LockOpVec lov;
5356 lov.add_xlock(&in->policylock);
5357 if (xlock_snaplock)
5358 lov.add_xlock(&in->snaplock);
5359 else
5360 lov.add_rdlock(&in->snaplock);
5361 if (!mds->locker->acquire_locks(mdr, lov))
5362 return false;
5363
5364 if (want_layout && in->get_projected_inode()->has_layout()) {
5365 mdr->dir_layout = in->get_projected_inode()->layout;
5366 want_layout = false;
5367 }
5368 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5369 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5370 return false;
5371 }
5372
5373 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5374 return true;
5375 }
5376
5377 CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5378 {
5379 CInode *in = mdcache->get_inode(ino);
5380 if (!in || in->state_test(CInode::STATE_PURGING)) {
5381 respond_to_request(mdr, -CEPHFS_ESTALE);
5382 return nullptr;
5383 }
5384 if (!in->is_auth()) {
5385 mdcache->request_forward(mdr, in->authority().first);
5386 return nullptr;
5387 }
5388
5389 return in;
5390 }
5391
5392 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5393 {
5394 const cref_t<MClientRequest> &req = mdr->client_request;
5395
5396 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5397 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5398 if (!cur)
5399 return;
5400
5401 if (!cur->is_dir()) {
5402 respond_to_request(mdr, -CEPHFS_ENOTDIR);
5403 return;
5404 }
5405
5406 if (!xlock_policylock(mdr, cur, true))
5407 return;
5408
5409 // validate layout
5410 const auto& old_pi = cur->get_projected_inode();
5411 file_layout_t layout;
5412 if (old_pi->has_layout())
5413 layout = old_pi->layout;
5414 else if (mdr->dir_layout != file_layout_t())
5415 layout = mdr->dir_layout;
5416 else
5417 layout = mdcache->default_file_layout;
5418
5419 // Level of access required to complete
5420 int access = MAY_WRITE;
5421
5422 const auto old_layout = layout;
5423
5424 if (req->head.args.setlayout.layout.fl_object_size > 0)
5425 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5426 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5427 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5428 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5429 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5430 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5431 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5432 // make sure we have as new a map as the client
5433 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5434 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5435 return;
5436 }
5437 }
5438
5439 if (layout != old_layout) {
5440 access |= MAY_SET_VXATTR;
5441 }
5442
5443 if (!layout.is_valid()) {
5444 dout(10) << "bad layout" << dendl;
5445 respond_to_request(mdr, -CEPHFS_EINVAL);
5446 return;
5447 }
5448 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5449 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5450 respond_to_request(mdr, -CEPHFS_EINVAL);
5451 return;
5452 }
5453
5454 if (!check_access(mdr, cur, access))
5455 return;
5456
5457 auto pi = cur->project_inode(mdr);
5458 pi.inode->layout = layout;
5459 pi.inode->version = cur->pre_dirty();
5460
5461 // log + wait
5462 mdr->ls = mdlog->get_current_segment();
5463 EUpdate *le = new EUpdate(mdlog, "setlayout");
5464 mdlog->start_entry(le);
5465 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5466 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5467 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5468
5469 mdr->no_early_reply = true;
5470 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5471 }
5472
5473 // XATTRS
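 // Illustrative "layout.json" value for the parser below (field names match
 // the decode_json() calls; object_size, stripe_unit and stripe_count are
 // mandatory, the pool fields are optional; values and pool name are examples
 // only):
 //   {"object_size": 4194304, "stripe_unit": 4194304, "stripe_count": 1,
 //    "pool_name": "cephfs_data"}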
5474 int Server::parse_layout_vxattr_json(
5475 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
5476 {
5477 auto parse_pool = [&](std::string pool_name, int64_t pool_id) -> int64_t {
5478 if (pool_name != "") {
5479 int64_t _pool_id = osdmap.lookup_pg_pool_name(pool_name);
5480 if (_pool_id < 0) {
5481 dout(10) << __func__ << ": unknown pool name:" << pool_name << dendl;
5482 return -CEPHFS_EINVAL;
5483 }
5484 return _pool_id;
5485 } else if (pool_id >= 0) {
5486 const auto pools = osdmap.get_pools();
5487 if (pools.find(pool_id) == pools.end()) {
5488 dout(10) << __func__ << ": unknown pool id:" << pool_id << dendl;
5489 return -CEPHFS_EINVAL;
5490 }
5491 return pool_id;
5492 } else {
5493 return -CEPHFS_EINVAL;
5494 }
5495 };
5496
5497 try {
5498 if (name == "layout.json") {
5499 JSONParser json_parser;
5500 if (json_parser.parse(value.c_str(), value.length()) and json_parser.is_object()) {
5501 std::string field;
5502 try {
5503 field = "object_size";
5504 JSONDecoder::decode_json("object_size", layout->object_size, &json_parser, true);
5505
5506 field = "stripe_unit";
5507 JSONDecoder::decode_json("stripe_unit", layout->stripe_unit, &json_parser, true);
5508
5509 field = "stripe_count";
5510 JSONDecoder::decode_json("stripe_count", layout->stripe_count, &json_parser, true);
5511
5512 field = "pool_namespace";
5513 JSONDecoder::decode_json("pool_namespace", layout->pool_ns, &json_parser, false);
5514
5515 field = "pool_id";
5516 int64_t pool_id = 0;
5517 JSONDecoder::decode_json("pool_id", pool_id, &json_parser, false);
5518
5519 field = "pool_name";
5520 std::string pool_name;
5521 JSONDecoder::decode_json("pool_name", pool_name, &json_parser, false);
5522
5523 pool_id = parse_pool(pool_name, pool_id);
5524 if (pool_id < 0) {
5525 return (int)pool_id;
5526 }
5527 layout->pool_id = pool_id;
5528 } catch (JSONDecoder::err&) {
5529 dout(10) << __func__ << ": json is missing a mandatory field named "
5530 << field << dendl;
5531 return -CEPHFS_EINVAL;
5532 }
5533 } else {
5534 dout(10) << __func__ << ": bad json" << dendl;
5535 return -CEPHFS_EINVAL;
5536 }
5537 } else {
5538 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5539 return -CEPHFS_ENODATA; // no such attribute
5540 }
5541 } catch (boost::bad_lexical_cast const&) {
5542 dout(10) << __func__ << ": bad vxattr value:" << value
5543 << ", unable to parse for xattr:" << name << dendl;
5544 return -CEPHFS_EINVAL;
5545 }
5546 return 0;
5547 }
5548
5549 // parse old style layout string
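 // e.g. (illustrative; see keys_and_values for the exact grammar) a value like
 //   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"
 // is split into key=value pairs and each pair is re-parsed as "layout.<key>".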
5550 int Server::parse_layout_vxattr_string(
5551 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
5552 {
5553 try {
5554 if (name == "layout") {
5555 string::iterator begin = value.begin();
5556 string::iterator end = value.end();
5557 keys_and_values<string::iterator> p; // create instance of parser
5558 std::map<string, string> m; // map to receive results
5559 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5560 return -CEPHFS_EINVAL;
5561 }
5562 string left(begin, end);
5563 dout(10) << __func__ << ": parsed " << m << " left '" << left << "'" << dendl;
5564 if (begin != end)
5565 return -CEPHFS_EINVAL;
5566 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5567 // Skip validation on each attr; we do it once at the end (to avoid
5568 // rejecting intermediate states if the overall result is ok)
5569 int r = parse_layout_vxattr_string(string("layout.") + q->first, q->second,
5570 osdmap, layout);
5571 if (r < 0)
5572 return r;
5573 }
5574 } else if (name == "layout.object_size") {
5575 layout->object_size = boost::lexical_cast<unsigned>(value);
5576 } else if (name == "layout.stripe_unit") {
5577 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5578 } else if (name == "layout.stripe_count") {
5579 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5580 } else if (name == "layout.pool") {
5581 try {
5582 layout->pool_id = boost::lexical_cast<unsigned>(value);
5583 } catch (boost::bad_lexical_cast const&) {
5584 int64_t pool = osdmap.lookup_pg_pool_name(value);
5585 if (pool < 0) {
5586 dout(10) << __func__ << ": unknown pool " << value << dendl;
5587 return -CEPHFS_ENOENT;
5588 }
5589 layout->pool_id = pool;
5590 }
5591 } else if (name == "layout.pool_id") {
5592 layout->pool_id = boost::lexical_cast<int64_t>(value);
5593 } else if (name == "layout.pool_name") {
5594 layout->pool_id = osdmap.lookup_pg_pool_name(value);
5595 if (layout->pool_id < 0) {
5596 dout(10) << __func__ << ": unknown pool " << value << dendl;
5597 return -CEPHFS_EINVAL;
5598 }
5599 } else if (name == "layout.pool_namespace") {
5600 layout->pool_ns = value;
5601 } else {
5602 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5603 return -CEPHFS_ENODATA; // no such attribute
5604 }
5605 } catch (boost::bad_lexical_cast const&) {
5606 dout(10) << __func__ << ": bad vxattr value, unable to parse int for "
5607 << name << dendl;
5608 return -CEPHFS_EINVAL;
5609 }
5610 return 0;
5611 }
5612
5613 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5614 file_layout_t *layout, bool validate)
5615 {
5616 dout(20) << __func__ << ": name:" << name << " value:'" << value << "'" << dendl;
5617
5618 int r;
5619 if (name == "layout.json") {
5620 r = parse_layout_vxattr_json(name, value, osdmap, layout);
5621 } else {
5622 r = parse_layout_vxattr_string(name, value, osdmap, layout);
5623 }
5624 if (r < 0) {
5625 return r;
5626 }
5627
5628 if (validate && !layout->is_valid()) {
5629 dout(10) << __func__ << ": bad layout" << dendl;
5630 return -CEPHFS_EINVAL;
5631 }
5632 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5633 dout(10) << __func__ << ": invalid data pool " << layout->pool_id << dendl;
5634 return -CEPHFS_EINVAL;
5635 }
5636 return 0;
5637 }
5638
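 // Illustrative combined-form value (parsed below into the quota.max_bytes /
 // quota.max_files cases; numbers are examples only):
 //   "max_bytes=10737418240 max_files=10000"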
5639 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5640 {
5641 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5642 try {
5643 if (name == "quota") {
5644 string::iterator begin = value.begin();
5645 string::iterator end = value.end();
5646 if (begin == end) {
5647 // keep quota unchanged. (for create_quota_realm())
5648 return 0;
5649 }
5650 keys_and_values<string::iterator> p; // create instance of parser
5651 std::map<string, string> m; // map to receive results
5652 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5653 return -CEPHFS_EINVAL;
5654 }
5655 string left(begin, end);
5656 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5657 if (begin != end)
5658 return -CEPHFS_EINVAL;
5659 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5660 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5661 if (r < 0)
5662 return r;
5663 }
5664 } else if (name == "quota.max_bytes") {
5665 int64_t q = boost::lexical_cast<int64_t>(value);
5666 if (q < 0)
5667 return -CEPHFS_EINVAL;
5668 quota->max_bytes = q;
5669 } else if (name == "quota.max_files") {
5670 int64_t q = boost::lexical_cast<int64_t>(value);
5671 if (q < 0)
5672 return -CEPHFS_EINVAL;
5673 quota->max_files = q;
5674 } else {
5675 dout(10) << " unknown quota vxattr " << name << dendl;
5676 return -CEPHFS_EINVAL;
5677 }
5678 } catch (boost::bad_lexical_cast const&) {
5679 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5680 return -CEPHFS_EINVAL;
5681 }
5682
5683 if (!quota->is_valid()) {
5684 dout(10) << "bad quota" << dendl;
5685 return -CEPHFS_EINVAL;
5686 }
5687 return 0;
5688 }
5689
5690 void Server::create_quota_realm(CInode *in)
5691 {
5692 dout(10) << __func__ << " " << *in << dendl;
5693
5694 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
5695 req->set_filepath(filepath(in->ino()));
5696 req->set_string2("ceph.quota");
5697 // empty vxattr value
5698 req->set_tid(mds->issue_tid());
5699
5700 mds->send_message_mds(req, in->authority().first);
5701 }
5702
5703 /*
5704 * Verify that the file layout attribute carried by the client
5705 * is well-formatted.
5706 * Return 0 on success; otherwise this function takes
5707 * responsibility for the passed mdr.
5708 */
5709 int Server::check_layout_vxattr(MDRequestRef& mdr,
5710 string name,
5711 string value,
5712 file_layout_t *layout)
5713 {
5714 const cref_t<MClientRequest> &req = mdr->client_request;
5715 epoch_t epoch;
5716 int r;
5717
5718 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5719 r = parse_layout_vxattr(name, value, osdmap, layout);
5720 epoch = osdmap.get_epoch();
5721 });
5722
5723 if (r == -CEPHFS_ENOENT) {
5724
5725 // we don't have the specified pool; make sure our map
5726 // is at least as new as the client's.
5727 epoch_t req_epoch = req->get_osdmap_epoch();
5728
5729 if (req_epoch > epoch) {
5730
5731 // well, our map is older; wait for the osdmap to catch up.
5732 auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5733
5734 mds->objecter->wait_for_map(req_epoch, lambdafy(fin));
5735 return r;
5736 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5737
5738 // For compatibility with clients running old code, we still need to get the
5739 // latest map. One day, when COMPACT_VERSION of MClientRequest >= 3,
5740 // we can remove this code.
5741 mdr->waited_for_osdmap = true;
5742 mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5743 mds, new C_MDS_RetryRequest(mdcache, mdr))));
5744 return r;
5745 }
5746 }
5747
5748 if (r < 0) {
5749
5750 if (r == -CEPHFS_ENOENT)
5751 r = -CEPHFS_EINVAL;
5752
5753 respond_to_request(mdr, r);
5754 return r;
5755 }
5756
5757 // all is well
5758 return 0;
5759 }
5760
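 // Illustrative client usage (hypothetical mount path and values):
 //   setfattr -n ceph.dir.layout.stripe_count -v 2 /mnt/cephfs/dir
 //   setfattr -n ceph.quota.max_files -v 10000 /mnt/cephfs/dir
 // Such requests arrive here with name = the vxattr and value = the payload.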
5761 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
5762 {
5763 const cref_t<MClientRequest> &req = mdr->client_request;
5764 string name(req->get_path2());
5765 bufferlist bl = req->get_data();
5766 string value (bl.c_str(), bl.length());
5767 dout(10) << "handle_set_vxattr " << name
5768 << " val " << value.length()
5769 << " bytes on " << *cur
5770 << dendl;
5771
5772 CInode::mempool_inode *pip = nullptr;
5773 string rest;
5774
5775 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5776 return;
5777 }
5778
5779 bool adjust_realm = false;
5780 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5781 if (!cur->is_dir()) {
5782 respond_to_request(mdr, -CEPHFS_EINVAL);
5783 return;
5784 }
5785
5786 if (!xlock_policylock(mdr, cur, true))
5787 return;
5788
5789 file_layout_t layout;
5790 if (cur->get_projected_inode()->has_layout())
5791 layout = cur->get_projected_inode()->layout;
5792 else if (mdr->dir_layout != file_layout_t())
5793 layout = mdr->dir_layout;
5794 else
5795 layout = mdcache->default_file_layout;
5796
5797 rest = name.substr(name.find("layout"));
5798 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5799 return;
5800
5801 auto pi = cur->project_inode(mdr);
5802 pi.inode->layout = layout;
5803 mdr->no_early_reply = true;
5804 pip = pi.inode.get();
5805 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5806 if (!cur->is_file()) {
5807 respond_to_request(mdr, -CEPHFS_EINVAL);
5808 return;
5809 }
5810 if (cur->get_projected_inode()->size ||
5811 cur->get_projected_inode()->truncate_seq > 1) {
5812 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5813 return;
5814 }
5815 file_layout_t layout = cur->get_projected_inode()->layout;
5816 rest = name.substr(name.find("layout"));
5817 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5818 return;
5819
5820 MutationImpl::LockOpVec lov;
5821 lov.add_xlock(&cur->filelock);
5822 if (!mds->locker->acquire_locks(mdr, lov))
5823 return;
5824
5825 auto pi = cur->project_inode(mdr);
5826 int64_t old_pool = pi.inode->layout.pool_id;
5827 pi.inode->add_old_pool(old_pool);
5828 pi.inode->layout = layout;
5829 pip = pi.inode.get();
5830 } else if (name.compare(0, 10, "ceph.quota") == 0) {
5831 if (!cur->is_dir()) {
5832 respond_to_request(mdr, -CEPHFS_EINVAL);
5833 return;
5834 }
5835
5836 quota_info_t quota = cur->get_projected_inode()->quota;
5837
5838 rest = name.substr(name.find("quota"));
5839 int r = parse_quota_vxattr(rest, value, &quota);
5840 if (r < 0) {
5841 respond_to_request(mdr, r);
5842 return;
5843 }
5844
5845 if (quota.is_enable() && !cur->get_projected_srnode())
5846 adjust_realm = true;
5847
5848 if (!xlock_policylock(mdr, cur, false, adjust_realm))
5849 return;
5850
5851 if (cur->get_projected_inode()->quota == quota) {
5852 respond_to_request(mdr, 0);
5853 return;
5854 }
5855
5856 auto pi = cur->project_inode(mdr, false, adjust_realm);
5857 pi.inode->quota = quota;
5858
5859 if (adjust_realm)
5860 pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
5861
5862 mdr->no_early_reply = true;
5863 pip = pi.inode.get();
5864
5865 client_t exclude_ct = mdr->get_client();
5866 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
5867 } else if (name == "ceph.dir.subvolume"sv) {
5868 if (!cur->is_dir()) {
5869 respond_to_request(mdr, -CEPHFS_EINVAL);
5870 return;
5871 }
5872
5873 bool val;
5874 try {
5875 val = boost::lexical_cast<bool>(value);
5876 } catch (boost::bad_lexical_cast const&) {
5877 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
5878 respond_to_request(mdr, -CEPHFS_EINVAL);
5879 return;
5880 }
5881
5882 /* Verify it's not already a subvolume, using a lighter-weight
5883 * rdlock.
5884 */
5885 if (!mdr->more()->rdonly_checks) {
5886 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
5887 MutationImpl::LockOpVec lov;
5888 lov.add_rdlock(&cur->snaplock);
5889 if (!mds->locker->acquire_locks(mdr, lov))
5890 return;
5891 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5892 }
5893 const auto srnode = cur->get_projected_srnode();
5894 if (val == (srnode && srnode->is_subvolume())) {
5895 dout(20) << "already marked subvolume" << dendl;
5896 respond_to_request(mdr, 0);
5897 return;
5898 }
5899 mdr->more()->rdonly_checks = true;
5900 }
5901
5902 if ((mdr->locking_state & MutationImpl::ALL_LOCKED) && !mdr->is_xlocked(&cur->snaplock)) {
5903 /* drop the rdlock and acquire xlocks */
5904 dout(20) << "dropping rdlocks" << dendl;
5905 mds->locker->drop_locks(mdr.get());
5906 if (!xlock_policylock(mdr, cur, false, true))
5907 return;
5908 }
5909
5910 /* repeat the rdonly checks in case things changed between the rdlock and the xlock */
5911 SnapRealm *realm = cur->find_snaprealm();
5912 if (val) {
5913 inodeno_t subvol_ino = realm->get_subvolume_ino();
5914 // can't create subvolume inside another subvolume
5915 if (subvol_ino && subvol_ino != cur->ino()) {
5916 respond_to_request(mdr, -CEPHFS_EINVAL);
5917 return;
5918 }
5919 }
5920
5921 const auto srnode = cur->get_projected_srnode();
5922 if (val == (srnode && srnode->is_subvolume())) {
5923 respond_to_request(mdr, 0);
5924 return;
5925 }
5926
5927 auto pi = cur->project_inode(mdr, false, true);
5928 if (!srnode)
5929 pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
5930 if (val)
5931 pi.snapnode->mark_subvolume();
5932 else
5933 pi.snapnode->clear_subvolume();
5934
5935 mdr->no_early_reply = true;
5936 pip = pi.inode.get();
5937 adjust_realm = true;
5938 } else if (name == "ceph.dir.pin"sv) {
5939 if (!cur->is_dir() || cur->is_root()) {
5940 respond_to_request(mdr, -CEPHFS_EINVAL);
5941 return;
5942 }
5943
5944 mds_rank_t rank;
5945 try {
5946 rank = boost::lexical_cast<mds_rank_t>(value);
5947 if (rank < 0) rank = MDS_RANK_NONE;
5948 else if (rank >= MAX_MDS) {
5949 respond_to_request(mdr, -CEPHFS_EDOM);
5950 return;
5951 }
5952 } catch (boost::bad_lexical_cast const&) {
5953 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5954 respond_to_request(mdr, -CEPHFS_EINVAL);
5955 return;
5956 }
5957
5958 if (!xlock_policylock(mdr, cur))
5959 return;
5960
5961 auto pi = cur->project_inode(mdr);
5962 cur->set_export_pin(rank);
5963 pip = pi.inode.get();
5964 } else if (name == "ceph.dir.pin.random"sv) {
5965 if (!cur->is_dir() || cur->is_root()) {
5966 respond_to_request(mdr, -CEPHFS_EINVAL);
5967 return;
5968 }
5969
5970 double val;
5971 try {
5972 val = boost::lexical_cast<double>(value);
5973 } catch (boost::bad_lexical_cast const&) {
5974 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
5975 respond_to_request(mdr, -CEPHFS_EINVAL);
5976 return;
5977 }
5978
5979 if (val < 0.0 || 1.0 < val) {
5980 respond_to_request(mdr, -CEPHFS_EDOM);
5981 return;
5982 } else if (mdcache->export_ephemeral_random_max < val) {
5983 respond_to_request(mdr, -CEPHFS_EINVAL);
5984 return;
5985 }
5986
5987 if (!xlock_policylock(mdr, cur))
5988 return;
5989
5990 auto pi = cur->project_inode(mdr);
5991 cur->setxattr_ephemeral_rand(val);
5992 pip = pi.inode.get();
5993 } else if (name == "ceph.dir.pin.distributed"sv) {
5994 if (!cur->is_dir() || cur->is_root()) {
5995 respond_to_request(mdr, -CEPHFS_EINVAL);
5996 return;
5997 }
5998
5999 bool val;
6000 try {
6001 val = boost::lexical_cast<bool>(value);
6002 } catch (boost::bad_lexical_cast const&) {
6003 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
6004 respond_to_request(mdr, -CEPHFS_EINVAL);
6005 return;
6006 }
6007
6008 if (!xlock_policylock(mdr, cur))
6009 return;
6010
6011 auto pi = cur->project_inode(mdr);
6012 cur->setxattr_ephemeral_dist(val);
6013 pip = pi.inode.get();
6014 } else {
6015 dout(10) << " unknown vxattr " << name << dendl;
6016 respond_to_request(mdr, -CEPHFS_EINVAL);
6017 return;
6018 }
6019
6020 pip->change_attr++;
6021 pip->ctime = mdr->get_op_stamp();
6022 if (mdr->get_op_stamp() > pip->rstat.rctime)
6023 pip->rstat.rctime = mdr->get_op_stamp();
6024 pip->version = cur->pre_dirty();
6025 if (cur->is_file())
6026 pip->update_backtrace();
6027
6028 // log + wait
6029 mdr->ls = mdlog->get_current_segment();
6030 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
6031 mdlog->start_entry(le);
6032 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6033 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6034 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6035
6036 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
6037 false, false, adjust_realm));
6038 return;
6039 }
6040
6041 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
6042 {
6043 const cref_t<MClientRequest> &req = mdr->client_request;
6044 string name(req->get_path2());
6045
6046 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
6047
6048 if (name == "ceph.dir.layout") {
6049 if (!cur->is_dir()) {
6050 respond_to_request(mdr, -CEPHFS_ENODATA);
6051 return;
6052 }
6053 if (cur->is_root()) {
6054 dout(10) << "can't remove layout policy on the root directory" << dendl;
6055 respond_to_request(mdr, -CEPHFS_EINVAL);
6056 return;
6057 }
6058
6059 if (!cur->get_projected_inode()->has_layout()) {
6060 respond_to_request(mdr, -CEPHFS_ENODATA);
6061 return;
6062 }
6063
6064 MutationImpl::LockOpVec lov;
6065 lov.add_xlock(&cur->policylock);
6066 if (!mds->locker->acquire_locks(mdr, lov))
6067 return;
6068
6069 auto pi = cur->project_inode(mdr);
6070 pi.inode->clear_layout();
6071 pi.inode->version = cur->pre_dirty();
6072
6073 // log + wait
6074 mdr->ls = mdlog->get_current_segment();
6075 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
6076 mdlog->start_entry(le);
6077 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6078 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6079 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6080
6081 mdr->no_early_reply = true;
6082 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6083 return;
6084 } else if (name == "ceph.dir.layout.pool_namespace"
6085 || name == "ceph.file.layout.pool_namespace") {
6086 // Namespace is the only layout field that has a meaningful
6087 // null/none value (an empty string means the default layout). This is
6088 // equivalent to a setxattr with an empty string: pass through the empty
6089 // payload of the rmxattr request to do this.
6090 handle_set_vxattr(mdr, cur);
6091 return;
6092 }
6093
6094 respond_to_request(mdr, -CEPHFS_ENODATA);
6095 }
6096
6097 const Server::XattrHandler Server::xattr_handlers[] = {
6098 {
6099 xattr_name: Server::DEFAULT_HANDLER,
6100 description: "default xattr handler",
6101 validate: &Server::default_xattr_validate,
6102 setxattr: &Server::default_setxattr_handler,
6103 removexattr: &Server::default_removexattr_handler,
6104 },
6105 {
6106 xattr_name: "ceph.mirror.info",
6107 description: "mirror info xattr handler",
6108 validate: &Server::mirror_info_xattr_validate,
6109 setxattr: &Server::mirror_info_setxattr_handler,
6110 removexattr: &Server::mirror_info_removexattr_handler
6111 },
6112 };
6113
6114 const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
6115 const XattrHandler *default_xattr_handler = nullptr;
6116
6117 for (auto &handler : xattr_handlers) {
6118 if (handler.xattr_name == Server::DEFAULT_HANDLER) {
6119 ceph_assert(default_xattr_handler == nullptr);
6120 default_xattr_handler = &handler;
6121 }
6122 if (handler.xattr_name == xattr_name) {
6123 dout(20) << "handler=" << handler.description << dendl;
6124 return &handler;
6125 }
6126 }
6127
6128 ceph_assert(default_xattr_handler != nullptr);
6129 dout(20) << "handler=" << default_xattr_handler->description << dendl;
6130 return default_xattr_handler;
6131 }
6132
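 // xattr_validate summary (descriptive only; the code below is authoritative):
 // SETXATTR with CEPH_XATTR_CREATE fails with CEPHFS_EEXIST if the xattr is
 // already present, CEPH_XATTR_REPLACE fails with CEPHFS_ENODATA if it is
 // absent, and RMXATTR fails with CEPHFS_ENODATA if there is nothing to remove.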
6133 int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6134 const std::string &xattr_name, int op, int flags) {
6135 if (op == CEPH_MDS_OP_SETXATTR) {
6136 if (xattrs) {
6137 if ((flags & CEPH_XATTR_CREATE) && xattrs->count(mempool::mds_co::string(xattr_name))) {
6138 dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
6139 return -CEPHFS_EEXIST;
6140 }
6141 }
6142 if ((flags & CEPH_XATTR_REPLACE) && !(xattrs && xattrs->count(mempool::mds_co::string(xattr_name)))) {
6143 dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
6144 return -CEPHFS_ENODATA;
6145 }
6146
6147 return 0;
6148 }
6149
6150 if (op == CEPH_MDS_OP_RMXATTR) {
6151 if (!xattrs || xattrs->count(mempool::mds_co::string(xattr_name)) == 0) {
6152 dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
6153 return -CEPHFS_ENODATA;
6154 }
6155
6156 return 0;
6157 }
6158
6159 derr << ": unhandled validation for: " << xattr_name << dendl;
6160 return -CEPHFS_EINVAL;
6161 }
6162
6163 void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
6164 const bufferlist &xattr_value) {
6165 size_t len = xattr_value.length();
6166 bufferptr b = buffer::create(len);
6167 if (len) {
6168 xattr_value.begin().copy(len, b.c_str());
6169 }
6170 auto em = xattrs->emplace(std::piecewise_construct,
6171 std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
6172 std::forward_as_tuple(b));
6173 if (!em.second) {
6174 em.first->second = b;
6175 }
6176 }
6177
6178 void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
6179 xattrs->erase(mempool::mds_co::string(xattr_name));
6180 }
6181
6182 int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6183 XattrOp *xattr_op) {
6184 return xattr_validate(cur, xattrs, xattr_op->xattr_name, xattr_op->op, xattr_op->flags);
6185 }
6186
6187 void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6188 const XattrOp &xattr_op) {
6189 xattr_set(xattrs, xattr_op.xattr_name, xattr_op.xattr_value);
6190 }
6191
6192 void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6193 const XattrOp &xattr_op) {
6194 xattr_rm(xattrs, xattr_op.xattr_name);
6195 }
6196
6197 // mirror info xattr handlers
6198 const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX = "^cluster_id=([a-f0-9]{8}-" \
6199 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6200 "[a-f0-9]{4}-[a-f0-9]{12})" \
6201 " fs_id=(\\d+)$";
6202 const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
6203 const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
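 // Illustrative value matching MIRROR_INFO_REGEX above (ids are examples only):
 //   "cluster_id=01234567-89ab-cdef-0123-456789abcdef fs_id=7"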
6204 int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
6205 std::string &cluster_id, std::string &fs_id) {
6206 dout(20) << "parsing name=" << name << ", value=" << value << dendl;
6207
6208 static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
6209 std::smatch match;
6210
6211 std::regex_search(value, match, regex);
6212 if (match.size() != 3) {
6213 derr << "mirror info parse error" << dendl;
6214 return -CEPHFS_EINVAL;
6215 }
6216
6217 cluster_id = match[1];
6218 fs_id = match[2];
6219 dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
6220 return 0;
6221 }
6222
6223 int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6224 XattrOp *xattr_op) {
6225 if (!cur->is_root()) {
6226 return -CEPHFS_EINVAL;
6227 }
6228
6229 int v1 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, xattr_op->op, xattr_op->flags);
6230 int v2 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, xattr_op->op, xattr_op->flags);
6231 if (v1 != v2) {
6232 derr << "inconsistent mirror info state (" << v1 << "," << v2 << ")" << dendl;
6233 return -CEPHFS_EINVAL;
6234 }
6235
6236 if (v1 < 0) {
6237 return v1;
6238 }
6239
6240 if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
6241 return 0;
6242 }
6243
6244 std::string cluster_id;
6245 std::string fs_id;
6246 int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
6247 cluster_id, fs_id);
6248 if (r < 0) {
6249 return r;
6250 }
6251
6252 xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
6253 return 0;
6254 }
6255
6256 void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6257 const XattrOp &xattr_op) {
6258 auto mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
6259
6260 bufferlist bl;
6261 bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
6262 xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
6263
6264 bl.clear();
6265 bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
6266 xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
6267 }
6268
6269 void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6270 const XattrOp &xattr_op) {
6271 xattr_rm(xattrs, Server::MirrorXattrInfo::CLUSTER_ID);
6272 xattr_rm(xattrs, Server::MirrorXattrInfo::FS_ID);
6273 }
6274
6275 void Server::handle_client_setxattr(MDRequestRef& mdr)
6276 {
6277 const cref_t<MClientRequest> &req = mdr->client_request;
6278 string name(req->get_path2());
6279
6280 // is a ceph virtual xattr?
6281 if (is_ceph_vxattr(name)) {
6282 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6283 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6284 if (!cur)
6285 return;
6286
6287 handle_set_vxattr(mdr, cur);
6288 return;
6289 }
6290
6291 if (!is_allowed_ceph_xattr(name)) {
6292 respond_to_request(mdr, -CEPHFS_EINVAL);
6293 return;
6294 }
6295
6296 CInode *cur = rdlock_path_pin_ref(mdr, true);
6297 if (!cur)
6298 return;
6299
6300 if (mdr->snapid != CEPH_NOSNAP) {
6301 respond_to_request(mdr, -CEPHFS_EROFS);
6302 return;
6303 }
6304
6305 int flags = req->head.args.setxattr.flags;
6306
6307 MutationImpl::LockOpVec lov;
6308 lov.add_xlock(&cur->xattrlock);
6309 if (!mds->locker->acquire_locks(mdr, lov))
6310 return;
6311
6312 if (!check_access(mdr, cur, MAY_WRITE))
6313 return;
6314
6315 size_t len = req->get_data().length();
6316 size_t inc = len + name.length();
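   // Illustrative arithmetic: a 20-byte name with a 100-byte value contributes
   // inc = 120 bytes to the running total compared against
   // mds_max_xattr_pairs_size below.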
6317
6318 auto handler = Server::get_xattr_or_default_handler(name);
6319 const auto& pxattrs = cur->get_projected_xattrs();
6320 if (pxattrs) {
6321 // check xattrs kv pairs size
6322 size_t cur_xattrs_size = 0;
6323 for (const auto& p : *pxattrs) {
6324 if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
6325 continue;
6326 }
6327 cur_xattrs_size += p.first.length() + p.second.length();
6328 }
6329
6330 if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
6331 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6332 << cur_xattrs_size << ", inc " << inc << dendl;
6333 respond_to_request(mdr, -CEPHFS_ENOSPC);
6334 return;
6335 }
6336 }
6337
6338 XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
6339 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6340 if (r < 0) {
6341 respond_to_request(mdr, r);
6342 return;
6343 }
6344
6345 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
6346
6347 // project update
6348 auto pi = cur->project_inode(mdr, true);
6349 pi.inode->version = cur->pre_dirty();
6350 pi.inode->ctime = mdr->get_op_stamp();
6351 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6352 pi.inode->rstat.rctime = mdr->get_op_stamp();
6353 if (name == "encryption.ctx"sv)
6354 pi.inode->fscrypt = true;
6355 pi.inode->change_attr++;
6356 pi.inode->xattr_version++;
6357
6358 if ((flags & CEPH_XATTR_REMOVE)) {
6359 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6360 } else {
6361 std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
6362 }
6363
6364 // log + wait
6365 mdr->ls = mdlog->get_current_segment();
6366 EUpdate *le = new EUpdate(mdlog, "setxattr");
6367 mdlog->start_entry(le);
6368 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6369 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6370 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6371
6372 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6373 }
6374
6375 void Server::handle_client_removexattr(MDRequestRef& mdr)
6376 {
6377 const cref_t<MClientRequest> &req = mdr->client_request;
6378 std::string name(req->get_path2());
6379
6380 // is a ceph virtual xattr?
6381 if (is_ceph_vxattr(name)) {
6382 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6383 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6384 if (!cur)
6385 return;
6386
6387 handle_remove_vxattr(mdr, cur);
6388 return;
6389 }
6390
6391 if (!is_allowed_ceph_xattr(name)) {
6392 respond_to_request(mdr, -CEPHFS_EINVAL);
6393 return;
6394 }
6395
6396 CInode* cur = rdlock_path_pin_ref(mdr, true);
6397 if (!cur)
6398 return;
6399
6400 if (mdr->snapid != CEPH_NOSNAP) {
6401 respond_to_request(mdr, -CEPHFS_EROFS);
6402 return;
6403 }
6404
6405 MutationImpl::LockOpVec lov;
6406 lov.add_xlock(&cur->xattrlock);
6407 if (!mds->locker->acquire_locks(mdr, lov))
6408 return;
6409
6410
6411 auto handler = Server::get_xattr_or_default_handler(name);
6412 bufferlist bl;
6413 XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
6414
6415 const auto& pxattrs = cur->get_projected_xattrs();
6416 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6417 if (r < 0) {
6418 respond_to_request(mdr, r);
6419 return;
6420 }
6421
6422 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
6423
6424 // project update
6425 auto pi = cur->project_inode(mdr, true);
6426 pi.inode->version = cur->pre_dirty();
6427 pi.inode->ctime = mdr->get_op_stamp();
6428 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6429 pi.inode->rstat.rctime = mdr->get_op_stamp();
6430 pi.inode->change_attr++;
6431 pi.inode->xattr_version++;
6432 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6433
6434 // log + wait
6435 mdr->ls = mdlog->get_current_segment();
6436 EUpdate *le = new EUpdate(mdlog, "removexattr");
6437 mdlog->start_entry(le);
6438 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6439 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6440 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6441
6442 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6443 }
6444
6445 void Server::handle_client_getvxattr(MDRequestRef& mdr)
6446 {
6447 const auto& req = mdr->client_request;
6448 string xattr_name{req->get_path2()};
6449
6450 // is a ceph virtual xattr?
6451 if (!is_ceph_vxattr(xattr_name)) {
6452 respond_to_request(mdr, -CEPHFS_ENODATA);
6453 return;
6454 }
6455
6456 CInode *cur = rdlock_path_pin_ref(mdr, true, false);
6457 if (!cur) {
6458 return;
6459 }
6460
6461 if (is_ceph_dir_vxattr(xattr_name)) {
6462 if (!cur->is_dir()) {
6463 respond_to_request(mdr, -CEPHFS_ENODATA);
6464 return;
6465 }
6466 } else if (is_ceph_file_vxattr(xattr_name)) {
6467 if (cur->is_dir()) {
6468 respond_to_request(mdr, -CEPHFS_ENODATA);
6469 return;
6470 }
6471 }
6472
6473 CachedStackStringStream css;
6474 int r = 0;
6475 ceph::bufferlist bl;
6476 // handle these vxattrs
6477 if ((xattr_name.substr(0, 15) == "ceph.dir.layout"sv) ||
6478 (xattr_name.substr(0, 16) == "ceph.file.layout"sv)) {
6479 std::string layout_field;
6480
6481 struct layout_xattr_info_t {
6482 enum class InheritanceStatus : uint32_t {
6483 DEFAULT = 0,
6484 SET = 1,
6485 INHERITED = 2
6486 };
6487
6488 const file_layout_t layout;
6489 const InheritanceStatus status;
6490
6491 layout_xattr_info_t(const file_layout_t& l, InheritanceStatus inh)
6492 : layout(l), status(inh) { }
6493
6494 static std::string status_to_string(InheritanceStatus status) {
6495 switch (status) {
6496 case InheritanceStatus::DEFAULT: return "default"s;
6497 case InheritanceStatus::SET: return "set"s;
6498 case InheritanceStatus::INHERITED: return "inherited"s;
6499 default: return "unknown"s;
6500 }
6501 }
6502 };
6503
6504 auto is_default_layout = [&](const file_layout_t& layout) -> bool {
6505 return (layout == mdcache->default_file_layout);
6506 };
6507 auto get_inherited_layout = [&](CInode *cur) -> layout_xattr_info_t {
6508 auto orig_in = cur;
6509
6510 while (cur) {
6511 if (cur->get_projected_inode()->has_layout()) {
6512 auto& curr_layout = cur->get_projected_inode()->layout;
6513 if (is_default_layout(curr_layout)) {
6514 return {curr_layout, layout_xattr_info_t::InheritanceStatus::DEFAULT};
6515 }
6516 if (cur == orig_in) {
6517 // we've found a new layout at this inode
6518 return {curr_layout, layout_xattr_info_t::InheritanceStatus::SET};
6519 } else {
6520 return {curr_layout, layout_xattr_info_t::InheritanceStatus::INHERITED};
6521 }
6522 }
6523
6524 if (cur->is_root()) {
6525 break;
6526 }
6527
6528 cur = cur->get_projected_parent_dir()->get_inode();
6529 }
6530 mds->clog->error() << "no layout found at root dir!";
6531 ceph_abort("no layout found at root dir! something is really messed up with layouts!");
6532 };
6533
6534 if (xattr_name == "ceph.dir.layout.json"sv ||
6535 xattr_name == "ceph.file.layout.json"sv) {
6536 // fetch layout only for valid xattr_name
6537 const auto lxi = get_inherited_layout(cur);
6538
6539 *css << "{\"stripe_unit\": " << lxi.layout.stripe_unit
6540 << ", \"stripe_count\": " << lxi.layout.stripe_count
6541 << ", \"object_size\": " << lxi.layout.object_size
6542 << ", \"pool_name\": ";
6543 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6544 *css << "\"";
6545 if (o.have_pg_pool(lxi.layout.pool_id)) {
6546 *css << o.get_pool_name(lxi.layout.pool_id);
6547 }
6548 *css << "\"";
6549 });
6550 *css << ", \"pool_id\": " << (uint64_t)lxi.layout.pool_id;
6551 *css << ", \"pool_namespace\": \"" << lxi.layout.pool_ns << "\"";
6552 *css << ", \"inheritance\": \"@"
6553 << layout_xattr_info_t::status_to_string(lxi.status) << "\"}";
6554 } else if ((xattr_name == "ceph.dir.layout.pool_name"sv) ||
6555 (xattr_name == "ceph.file.layout.pool_name"sv)) {
6556 // fetch layout only for valid xattr_name
6557 const auto lxi = get_inherited_layout(cur);
6558 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6559 if (o.have_pg_pool(lxi.layout.pool_id)) {
6560 *css << o.get_pool_name(lxi.layout.pool_id);
6561 }
6562 });
6563 } else if ((xattr_name == "ceph.dir.layout.pool_id"sv) ||
6564 (xattr_name == "ceph.file.layout.pool_id"sv)) {
6565 // fetch layout only for valid xattr_name
6566 const auto lxi = get_inherited_layout(cur);
6567 *css << (uint64_t)lxi.layout.pool_id;
6568 } else {
6569 r = -CEPHFS_ENODATA; // no such attribute
6570 }
6571 } else if (xattr_name.substr(0, 12) == "ceph.dir.pin"sv) {
6572 if (xattr_name == "ceph.dir.pin"sv) {
6573 *css << cur->get_projected_inode()->export_pin;
6574 } else if (xattr_name == "ceph.dir.pin.random"sv) {
6575 *css << cur->get_projected_inode()->export_ephemeral_random_pin;
6576 } else if (xattr_name == "ceph.dir.pin.distributed"sv) {
6577 *css << cur->get_projected_inode()->export_ephemeral_distributed_pin;
6578 } else {
6579 // otherwise respond as invalid request
6580 // since we only handle ceph vxattrs here
6581 r = -CEPHFS_ENODATA; // no such attribute
6582 }
6583 } else {
6584 // otherwise respond as invalid request
6585 // since we only handle ceph vxattrs here
6586 r = -CEPHFS_ENODATA; // no such attribute
6587 }
6588
6589 if (r == 0) {
6590 ENCODE_START(1, 1, bl);
6591 encode(css->strv(), bl);
6592 ENCODE_FINISH(bl);
6593 mdr->reply_extra_bl = bl;
6594 }
6595
6596 respond_to_request(mdr, r);
6597 }
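     // Editorial example (not part of the original source; values are hypothetical):
     // clients typically read these virtual xattrs with getfattr, e.g.
     //   getfattr -n ceph.dir.layout.json /mnt/cephfs/some/dir
     // which, for the JSON variants handled above, returns something shaped like
     //   {"stripe_unit": 4194304, "stripe_count": 1, "object_size": 4194304,
     //    "pool_name": "cephfs_data", "pool_id": 2, "pool_namespace": "",
     //    "inheritance": "@inherited"}
     // while the ceph.dir.pin* names simply return the corresponding export pin values.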
6598
6599 // =================================================================
6600 // DIRECTORY and NAMESPACE OPS
6601
6602
6603 // ------------------------------------------------
6604
6605 // MKNOD
6606
6607 class C_MDS_mknod_finish : public ServerLogContext {
6608 CDentry *dn;
6609 CInode *newi;
6610 public:
6611 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
6612 ServerLogContext(s, r), dn(d), newi(ni) {}
6613 void finish(int r) override {
6614 ceph_assert(r == 0);
6615
6616 // link the inode
6617 dn->pop_projected_linkage();
6618
6619 // be a bit hacky with the inode version, here.. we decrement it
6620 // just to keep mark_dirty() happy. (we didn't bother projecting
6621 // a new version of the inode since it's just been created)
6622 newi->mark_dirty(mdr->ls);
6623 newi->mark_dirty_parent(mdr->ls, true);
6624
6625 // mkdir?
6626 if (newi->is_dir()) {
6627 CDir *dir = newi->get_dirfrag(frag_t());
6628 ceph_assert(dir);
6629 dir->mark_dirty(mdr->ls);
6630 dir->mark_new(mdr->ls);
6631 }
6632
6633 mdr->apply();
6634
6635 MDRequestRef null_ref;
6636 get_mds()->mdcache->send_dentry_link(dn, null_ref);
6637
6638 if (newi->is_file()) {
6639 get_mds()->locker->share_inode_max_size(newi);
6640 } else if (newi->is_dir()) {
6641 // We do this now so that the linkages on the new directory are stable.
6642 newi->maybe_ephemeral_rand();
6643 }
6644
6645 // hit pop
6646 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
6647
6648 // reply
6649 server->respond_to_request(mdr, 0);
6650 }
6651 };
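     // Editorial note (not part of the original source): despite the name, this
     // finisher is also reused by handle_client_mkdir() and handle_client_symlink()
     // below; all three paths link the projected dentry, mark the new inode dirty,
     // and reply to the client.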
6652
6653
6654 void Server::handle_client_mknod(MDRequestRef& mdr)
6655 {
6656 const cref_t<MClientRequest> &req = mdr->client_request;
6657 client_t client = mdr->get_client();
6658
6659 unsigned mode = req->head.args.mknod.mode;
6660 if ((mode & S_IFMT) == 0)
6661 mode |= S_IFREG;
6662
6663 mdr->disable_lock_cache();
6664 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
6665 if (!dn)
6666 return;
6667
6668 CDir *dir = dn->get_dir();
6669 CInode *diri = dir->get_inode();
6670 if (!check_access(mdr, diri, MAY_WRITE))
6671 return;
6672 if (!check_fragment_space(mdr, dir))
6673 return;
6674 if (!check_dir_max_entries(mdr, dir))
6675 return;
6676
6677 ceph_assert(dn->get_projected_linkage()->is_null());
6678 if (req->get_alternate_name().size() > alternate_name_max) {
6679 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6680 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6681 return;
6682 }
6683 dn->set_alternate_name(req->get_alternate_name());
6684
6685 // set layout
6686 file_layout_t layout;
6687 if (mdr->dir_layout != file_layout_t())
6688 layout = mdr->dir_layout;
6689 else
6690 layout = mdcache->default_file_layout;
6691
6692 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6693 ceph_assert(newi);
6694
6695 dn->push_projected_linkage(newi);
6696
6697 auto _inode = newi->_get_inode();
6698 _inode->version = dn->pre_dirty();
6699 _inode->rdev = req->head.args.mknod.rdev;
6700 _inode->rstat.rfiles = 1;
6701 _inode->accounted_rstat = _inode->rstat;
6702 if (layout.pool_id != mdcache->default_file_layout.pool_id)
6703 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
6704 _inode->update_backtrace();
6705
6706 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6707 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6708 ceph_assert(follows >= realm->get_newest_seq());
6709
6710 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6711 // want to write to it (e.g., if they are reexporting NFS)
6712 if (S_ISREG(_inode->mode)) {
6713 // issue a cap on the file
6714 int cmode = CEPH_FILE_MODE_RDWR;
6715 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6716 if (cap) {
6717 cap->set_wanted(0);
6718
6719 // put locks in excl mode
6720 newi->filelock.set_state(LOCK_EXCL);
6721 newi->authlock.set_state(LOCK_EXCL);
6722 newi->xattrlock.set_state(LOCK_EXCL);
6723
6724 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
6725 _inode->client_ranges[client].range.first = 0;
6726 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
6727 _inode->client_ranges[client].follows = follows;
6728 newi->mark_clientwriteable();
6729 cap->mark_clientwriteable();
6730 }
6731 }
6732
6733 ceph_assert(dn->first == follows + 1);
6734 newi->first = dn->first;
6735
6736 dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
6737
6738 // prepare finisher
6739 mdr->ls = mdlog->get_current_segment();
6740 EUpdate *le = new EUpdate(mdlog, "mknod");
6741 mdlog->start_entry(le);
6742 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6743 journal_allocated_inos(mdr, &le->metablob);
6744
6745 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6746 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6747 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6748
6749 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6750 mds->balancer->maybe_fragment(dn->get_dir(), false);
6751 }
6752
6753
6754
6755 // MKDIR
6756 /* This function takes responsibility for the passed mdr*/
6757 void Server::handle_client_mkdir(MDRequestRef& mdr)
6758 {
6759 const cref_t<MClientRequest> &req = mdr->client_request;
6760
6761 mdr->disable_lock_cache();
6762 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6763 if (!dn)
6764 return;
6765
6766 CDir *dir = dn->get_dir();
6767 CInode *diri = dir->get_inode();
6768
6769 // mkdir check access
6770 if (!check_access(mdr, diri, MAY_WRITE))
6771 return;
6772
6773 if (!check_fragment_space(mdr, dir))
6774 return;
6775 if (!check_dir_max_entries(mdr, dir))
6776 return;
6777
6778 ceph_assert(dn->get_projected_linkage()->is_null());
6779 if (req->get_alternate_name().size() > alternate_name_max) {
6780 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6781 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6782 return;
6783 }
6784 dn->set_alternate_name(req->get_alternate_name());
6785
6786 // new inode
6787 unsigned mode = req->head.args.mkdir.mode;
6788 mode &= ~S_IFMT;
6789 mode |= S_IFDIR;
6790 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6791 ceph_assert(newi);
6792
6793 // it's a directory.
6794 dn->push_projected_linkage(newi);
6795
6796 auto _inode = newi->_get_inode();
6797 _inode->version = dn->pre_dirty();
6798 _inode->rstat.rsubdirs = 1;
6799 _inode->accounted_rstat = _inode->rstat;
6800 _inode->update_backtrace();
6801
6802 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6803 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6804 ceph_assert(follows >= realm->get_newest_seq());
6805
6806 dout(12) << " follows " << follows << dendl;
6807 ceph_assert(dn->first == follows + 1);
6808 newi->first = dn->first;
6809
6810 // ...and that new dir is empty.
6811 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
6812 newdir->state_set(CDir::STATE_CREATING);
6813 newdir->mark_complete();
6814 newdir->_get_fnode()->version = newdir->pre_dirty();
6815
6816 // prepare finisher
6817 mdr->ls = mdlog->get_current_segment();
6818 EUpdate *le = new EUpdate(mdlog, "mkdir");
6819 mdlog->start_entry(le);
6820 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6821 journal_allocated_inos(mdr, &le->metablob);
6822 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6823 le->metablob.add_primary_dentry(dn, newi, true, true);
6824 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
6825
6826 // issue a cap on the directory
6827 int cmode = CEPH_FILE_MODE_RDWR;
6828 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6829 if (cap) {
6830 cap->set_wanted(0);
6831
6832 // put locks in excl mode
6833 newi->filelock.set_state(LOCK_EXCL);
6834 newi->authlock.set_state(LOCK_EXCL);
6835 newi->xattrlock.set_state(LOCK_EXCL);
6836 }
6837
6838 // make sure this inode gets into the journal
6839 le->metablob.add_opened_ino(newi->ino());
6840
6841 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6842
6843 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6844 // have overshot the split size (multiple mkdir in flight), so here is
6845 // an early chance to split the dir if this mkdir makes it oversized.
6846 mds->balancer->maybe_fragment(dir, false);
6847 }
6848
6849
6850 // SYMLINK
6851
6852 void Server::handle_client_symlink(MDRequestRef& mdr)
6853 {
6854 const auto& req = mdr->client_request;
6855
6856 mdr->disable_lock_cache();
6857 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6858 if (!dn)
6859 return;
6860
6861 CDir *dir = dn->get_dir();
6862 CInode *diri = dir->get_inode();
6863
6864 if (!check_access(mdr, diri, MAY_WRITE))
6865 return;
6866 if (!check_fragment_space(mdr, dir))
6867 return;
6868 if (!check_dir_max_entries(mdr, dir))
6869 return;
6870
6871 ceph_assert(dn->get_projected_linkage()->is_null());
6872 if (req->get_alternate_name().size() > alternate_name_max) {
6873 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6874 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
     return;  // bail out like the other create paths; otherwise we would reply twice
6875 }
6876 dn->set_alternate_name(req->get_alternate_name());
6877
6878 unsigned mode = S_IFLNK | 0777;
6879 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6880 ceph_assert(newi);
6881
6882 // it's a symlink
6883 dn->push_projected_linkage(newi);
6884
6885 newi->symlink = req->get_path2();
6886 auto _inode = newi->_get_inode();
6887 _inode->version = dn->pre_dirty();
6888 _inode->size = newi->symlink.length();
6889 _inode->rstat.rbytes = _inode->size;
6890 _inode->rstat.rfiles = 1;
6891 _inode->accounted_rstat = _inode->rstat;
6892 _inode->update_backtrace();
6893
6894 newi->first = dn->first;
6895
6896 // prepare finisher
6897 mdr->ls = mdlog->get_current_segment();
6898 EUpdate *le = new EUpdate(mdlog, "symlink");
6899 mdlog->start_entry(le);
6900 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6901 journal_allocated_inos(mdr, &le->metablob);
6902 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6903 le->metablob.add_primary_dentry(dn, newi, true, true);
6904
6905 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6906 mds->balancer->maybe_fragment(dir, false);
6907 }
6908
6909
6910
6911
6912
6913 // LINK
6914
6915 void Server::handle_client_link(MDRequestRef& mdr)
6916 {
6917 const cref_t<MClientRequest> &req = mdr->client_request;
6918
6919 dout(7) << "handle_client_link " << req->get_filepath()
6920 << " to " << req->get_filepath2()
6921 << dendl;
6922
6923 mdr->disable_lock_cache();
6924
6925 CDentry *destdn;
6926 CInode *targeti;
6927
6928 if (req->get_filepath2().depth() == 0) {
6929 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
6930 if (!targeti) {
6931 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
6932 inodeno_t ino = req->get_filepath2().get_ino();
6933 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
6934 return;
6935 }
6936 mdr->pin(targeti);
6937
6938 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
6939 CDentry *pdn = targeti->get_projected_parent_dn();
6940 if (!pdn) {
6941 dout(7) << "target has no parent dn, failing..." << dendl;
6942 respond_to_request(mdr, -CEPHFS_EINVAL);
6943 return;
6944 }
6945 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
6946 return;
6947 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
6948 }
6949
6950 destdn = rdlock_path_xlock_dentry(mdr, false);
6951 if (!destdn)
6952 return;
6953 } else {
6954 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
6955 destdn = ret.first;
6956 if (!destdn)
6957 return;
6958
6959 if (!destdn->get_projected_linkage()->is_null()) {
6960 respond_to_request(mdr, -CEPHFS_EEXIST);
6961 return;
6962 }
6963
6964 targeti = ret.second->get_projected_linkage()->get_inode();
6965 }
6966
6967 ceph_assert(destdn->get_projected_linkage()->is_null());
6968 if (req->get_alternate_name().size() > alternate_name_max) {
6969 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6970 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6971 return;
6972 }
6973 destdn->set_alternate_name(req->get_alternate_name());
6974
6975 if (targeti->is_dir()) {
6976 dout(7) << "target is a dir, failing..." << dendl;
6977 respond_to_request(mdr, -CEPHFS_EINVAL);
6978 return;
6979 }
6980
6981 CDir *dir = destdn->get_dir();
6982 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
6983 dout(7) << "target is " << *targeti << dendl;
6984
6985 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6986 MutationImpl::LockOpVec lov;
6987 lov.add_xlock(&targeti->snaplock);
6988 lov.add_xlock(&targeti->linklock);
6989
6990 if (!mds->locker->acquire_locks(mdr, lov))
6991 return;
6992
6993 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6994 }
6995
6996 if (targeti->get_projected_inode()->nlink == 0) {
6997 dout(7) << "target has no link, failing..." << dendl;
6998 respond_to_request(mdr, -CEPHFS_ENOENT);
6999 return;
7000 }
7001
7002 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7003 if (!check_access(mdr, targeti, MAY_WRITE))
7004 return;
7005
7006 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
7007 return;
7008
7009 if (!check_fragment_space(mdr, dir))
7010 return;
7011
7012 if (!check_dir_max_entries(mdr, dir))
7013 return;
7014 }
7015
7016 CInode* target_pin = targeti->get_projected_parent_dir()->inode;
7017 SnapRealm *target_realm = target_pin->find_snaprealm();
7018 if (target_pin != dir->inode &&
7019 target_realm->get_subvolume_ino() !=
7020 dir->inode->find_snaprealm()->get_subvolume_ino()) {
7021 dout(7) << "target is in different subvolume, failing..." << dendl;
7022 respond_to_request(mdr, -CEPHFS_EXDEV);
7023 return;
7024 }
7025
7026 // go!
7027 ceph_assert(g_conf()->mds_kill_link_at != 1);
7028
7029 // local or remote?
7030 if (targeti->is_auth())
7031 _link_local(mdr, destdn, targeti, target_realm);
7032 else
7033 _link_remote(mdr, true, destdn, targeti);
7034 mds->balancer->maybe_fragment(dir, false);
7035 }
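     // Editorial summary (not part of the original source): when the target inode is
     // not auth on this rank, _link_remote() runs a two-phase update with the target's
     // auth MDS: it sends MMDSPeerRequest::OP_LINKPREP (or OP_UNLINKPREP for unlink),
     // the peer journals an EPeerUpdate prepare and replies with OP_LINKPREPACK, and
     // only then does this rank journal its own EUpdate and later ask the peer to
     // commit (see handle_peer_link_prep() and _commit_peer_link() below).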
7036
7037
7038 class C_MDS_link_local_finish : public ServerLogContext {
7039 CDentry *dn;
7040 CInode *targeti;
7041 version_t dnpv;
7042 version_t tipv;
7043 bool adjust_realm;
7044 public:
7045 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
7046 version_t dnpv_, version_t tipv_, bool ar) :
7047 ServerLogContext(s, r), dn(d), targeti(ti),
7048 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
7049 void finish(int r) override {
7050 ceph_assert(r == 0);
7051 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
7052 }
7053 };
7054
7055
7056 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
7057 {
7058 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
7059
7060 mdr->ls = mdlog->get_current_segment();
7061
7062 // predirty NEW dentry
7063 version_t dnpv = dn->pre_dirty();
7064 version_t tipv = targeti->pre_dirty();
7065
7066 // project inode update
7067 auto pi = targeti->project_inode(mdr);
7068 pi.inode->nlink++;
7069 pi.inode->ctime = mdr->get_op_stamp();
7070 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7071 pi.inode->rstat.rctime = mdr->get_op_stamp();
7072 pi.inode->change_attr++;
7073 pi.inode->version = tipv;
7074
7075 bool adjust_realm = false;
7076 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
7077 sr_t *newsnap = targeti->project_snaprealm();
7078 targeti->mark_snaprealm_global(newsnap);
7079 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
7080 adjust_realm = true;
7081 }
7082
7083 // log + wait
7084 EUpdate *le = new EUpdate(mdlog, "link_local");
7085 mdlog->start_entry(le);
7086 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7087 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
7088 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
7089 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7090 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
7091
7092 // do this after predirty_*, to avoid funky extra dnl arg
7093 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7094
7095 journal_and_reply(mdr, targeti, dn, le,
7096 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
7097 }
7098
7099 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
7100 version_t dnpv, version_t tipv, bool adjust_realm)
7101 {
7102 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
7103
7104 // link and unlock the NEW dentry
7105 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7106 if (!dnl->get_inode())
7107 dn->link_remote(dnl, targeti);
7108 dn->mark_dirty(dnpv, mdr->ls);
7109
7110 // target inode
7111 mdr->apply();
7112
7113 MDRequestRef null_ref;
7114 mdcache->send_dentry_link(dn, null_ref);
7115
7116 if (adjust_realm) {
7117 int op = CEPH_SNAP_OP_SPLIT;
7118 mds->mdcache->send_snap_update(targeti, 0, op);
7119 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7120 }
7121
7122 // bump target popularity
7123 mds->balancer->hit_inode(targeti, META_POP_IWR);
7124 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7125
7126 // reply
7127 respond_to_request(mdr, 0);
7128 }
7129
7130
7131 // link / unlink remote
7132
7133 class C_MDS_link_remote_finish : public ServerLogContext {
7134 bool inc;
7135 CDentry *dn;
7136 CInode *targeti;
7137 version_t dpv;
7138 public:
7139 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
7140 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
7141 dpv(d->get_projected_version()) {}
7142 void finish(int r) override {
7143 ceph_assert(r == 0);
7144 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
7145 }
7146 };
7147
7148 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
7149 {
7150 dout(10) << "_link_remote "
7151 << (inc ? "link ":"unlink ")
7152 << *dn << " to " << *targeti << dendl;
7153
7154 // 1. send LinkPrepare to dest (journal nlink++ prepare)
7155 mds_rank_t linkauth = targeti->authority().first;
7156 if (mdr->more()->witnessed.count(linkauth) == 0) {
7157 if (mds->is_cluster_degraded() &&
7158 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
7159 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
7160 if (mdr->more()->waiting_on_peer.empty())
7161 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
7162 return;
7163 }
7164
7165 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
7166 int op;
7167 if (inc)
7168 op = MMDSPeerRequest::OP_LINKPREP;
7169 else
7170 op = MMDSPeerRequest::OP_UNLINKPREP;
7171 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, op);
7172 targeti->set_object_info(req->get_object_info());
7173 req->op_stamp = mdr->get_op_stamp();
7174 if (auto& desti_srnode = mdr->more()->desti_srnode)
7175 encode(*desti_srnode, req->desti_snapbl);
7176 mds->send_message_mds(req, linkauth);
7177
7178 ceph_assert(mdr->more()->waiting_on_peer.count(linkauth) == 0);
7179 mdr->more()->waiting_on_peer.insert(linkauth);
7180 return;
7181 }
7182 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
7183
7184 ceph_assert(g_conf()->mds_kill_link_at != 2);
7185
7186 if (auto& desti_srnode = mdr->more()->desti_srnode) {
7187 delete desti_srnode;
7188 desti_srnode = NULL;
7189 }
7190
7191 mdr->set_mds_stamp(ceph_clock_now());
7192
7193 // add to event
7194 mdr->ls = mdlog->get_current_segment();
7195 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
7196 mdlog->start_entry(le);
7197 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7198 if (!mdr->more()->witnessed.empty()) {
7199 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7200 le->reqid = mdr->reqid;
7201 le->had_peers = true;
7202 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7203 }
7204
7205 if (inc) {
7206 dn->pre_dirty();
7207 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
7208 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7209 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7210 } else {
7211 dn->pre_dirty();
7212 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
7213 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7214 le->metablob.add_null_dentry(dn, true);
7215 dn->push_projected_linkage();
7216 }
7217
7218 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
7219 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
7220 }
7221
7222 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
7223 CDentry *dn, CInode *targeti,
7224 version_t dpv)
7225 {
7226 dout(10) << "_link_remote_finish "
7227 << (inc ? "link ":"unlink ")
7228 << *dn << " to " << *targeti << dendl;
7229
7230 ceph_assert(g_conf()->mds_kill_link_at != 3);
7231
7232 if (!mdr->more()->witnessed.empty())
7233 mdcache->logged_leader_update(mdr->reqid);
7234
7235 if (inc) {
7236 // link the new dentry
7237 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7238 if (!dnl->get_inode())
7239 dn->link_remote(dnl, targeti);
7240 dn->mark_dirty(dpv, mdr->ls);
7241 } else {
7242 // unlink main dentry
7243 dn->get_dir()->unlink_inode(dn);
7244 dn->pop_projected_linkage();
7245 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
7246 }
7247
7248 mdr->apply();
7249
7250 MDRequestRef null_ref;
7251 if (inc)
7252 mdcache->send_dentry_link(dn, null_ref);
7253 else
7254 mdcache->send_dentry_unlink(dn, NULL, null_ref);
7255
7256 // bump target popularity
7257 mds->balancer->hit_inode(targeti, META_POP_IWR);
7258 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7259
7260 // reply
7261 respond_to_request(mdr, 0);
7262
7263 if (!inc)
7264 // removing a new dn?
7265 dn->get_dir()->try_remove_unlinked_dn(dn);
7266 }
7267
7268
7269 // remote linking/unlinking
7270
7271 class C_MDS_PeerLinkPrep : public ServerLogContext {
7272 CInode *targeti;
7273 bool adjust_realm;
7274 public:
7275 C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
7276 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
7277 void finish(int r) override {
7278 ceph_assert(r == 0);
7279 server->_logged_peer_link(mdr, targeti, adjust_realm);
7280 }
7281 };
7282
7283 class C_MDS_PeerLinkCommit : public ServerContext {
7284 MDRequestRef mdr;
7285 CInode *targeti;
7286 public:
7287 C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
7288 ServerContext(s), mdr(r), targeti(t) { }
7289 void finish(int r) override {
7290 server->_commit_peer_link(mdr, r, targeti);
7291 }
7292 };
7293
7294 void Server::handle_peer_link_prep(MDRequestRef& mdr)
7295 {
7296 dout(10) << "handle_peer_link_prep " << *mdr
7297 << " on " << mdr->peer_request->get_object_info()
7298 << dendl;
7299
7300 ceph_assert(g_conf()->mds_kill_link_at != 4);
7301
7302 CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
7303 ceph_assert(targeti);
7304 dout(10) << "targeti " << *targeti << dendl;
7305 CDentry *dn = targeti->get_parent_dn();
7306 CDentry::linkage_t *dnl = dn->get_linkage();
7307 ceph_assert(dnl->is_primary());
7308
7309 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7310
7311 mdr->auth_pin(targeti);
7312
7313 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
7314 ceph_assert(g_conf()->mds_kill_link_at != 5);
7315
7316 // journal it
7317 mdr->ls = mdlog->get_current_segment();
7318 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
7319 EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
7320 mdlog->start_entry(le);
7321
7322 auto pi = dnl->get_inode()->project_inode(mdr);
7323
7324 // update journaled target inode
7325 bool inc;
7326 bool adjust_realm = false;
7327 bool realm_projected = false;
7328 if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
7329 inc = true;
7330 pi.inode->nlink++;
7331
7332 CDentry *target_pdn = targeti->get_projected_parent_dn();
7333 SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
7334 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
7335 sr_t *newsnap = targeti->project_snaprealm();
7336 targeti->mark_snaprealm_global(newsnap);
7337 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
7338 adjust_realm = true;
7339 realm_projected = true;
7340 }
7341 } else {
7342 inc = false;
7343 pi.inode->nlink--;
7344 if (targeti->is_projected_snaprealm_global()) {
7345 ceph_assert(mdr->peer_request->desti_snapbl.length());
7346 auto p = mdr->peer_request->desti_snapbl.cbegin();
7347
7348 sr_t *newsnap = targeti->project_snaprealm();
7349 decode(*newsnap, p);
7350
7351 if (pi.inode->nlink == 0)
7352 ceph_assert(!newsnap->is_parent_global());
7353
7354 realm_projected = true;
7355 } else {
7356 ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
7357 }
7358 }
7359
7360 link_rollback rollback;
7361 rollback.reqid = mdr->reqid;
7362 rollback.ino = targeti->ino();
7363 rollback.old_ctime = targeti->get_inode()->ctime; // we hold versionlock xlock; no concurrent projections
7364 const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
7365 rollback.old_dir_mtime = pf->fragstat.mtime;
7366 rollback.old_dir_rctime = pf->rstat.rctime;
7367 rollback.was_inc = inc;
7368 if (realm_projected) {
7369 if (targeti->snaprealm) {
7370 encode(true, rollback.snapbl);
7371 targeti->encode_snap_blob(rollback.snapbl);
7372 } else {
7373 encode(false, rollback.snapbl);
7374 }
7375 }
7376 encode(rollback, le->rollback);
7377 mdr->more()->rollback_bl = le->rollback;
7378
7379 pi.inode->ctime = mdr->get_op_stamp();
7380 pi.inode->version = targeti->pre_dirty();
7381
7382 dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
7383
7384 // commit case
7385 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
7386 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
7387 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7388
7389 // set up commit waiter
7390 mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
7391
7392 mdr->more()->peer_update_journaled = true;
7393 submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
7394 mdr, __func__);
7395 mdlog->flush();
7396 }
7397
7398 void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
7399 {
7400 dout(10) << "_logged_peer_link " << *mdr
7401 << " " << *targeti << dendl;
7402
7403 ceph_assert(g_conf()->mds_kill_link_at != 6);
7404
7405 // update the target
7406 mdr->apply();
7407
7408 // hit pop
7409 mds->balancer->hit_inode(targeti, META_POP_IWR);
7410
7411 // done.
7412 mdr->reset_peer_request();
7413
7414 if (adjust_realm) {
7415 int op = CEPH_SNAP_OP_SPLIT;
7416 mds->mdcache->send_snap_update(targeti, 0, op);
7417 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7418 }
7419
7420 // ack
7421 if (!mdr->aborted) {
7422 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_LINKPREPACK);
7423 mds->send_message_mds(reply, mdr->peer_to_mds);
7424 } else {
7425 dout(10) << " abort flag set, finishing" << dendl;
7426 mdcache->request_finish(mdr);
7427 }
7428 }
7429
7430
7431 struct C_MDS_CommittedPeer : public ServerLogContext {
7432 C_MDS_CommittedPeer(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
7433 void finish(int r) override {
7434 server->_committed_peer(mdr);
7435 }
7436 };
7437
7438 void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
7439 {
7440 dout(10) << "_commit_peer_link " << *mdr
7441 << " r=" << r
7442 << " " << *targeti << dendl;
7443
7444 ceph_assert(g_conf()->mds_kill_link_at != 7);
7445
7446 if (r == 0) {
7447 // drop our pins, etc.
7448 mdr->cleanup();
7449
7450 // write a commit to the journal
7451 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid, mdr->peer_to_mds,
7452 EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
7453 mdlog->start_entry(le);
7454 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7455 mdlog->flush();
7456 } else {
7457 do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7458 }
7459 }
7460
7461 void Server::_committed_peer(MDRequestRef& mdr)
7462 {
7463 dout(10) << "_committed_peer " << *mdr << dendl;
7464
7465 ceph_assert(g_conf()->mds_kill_link_at != 8);
7466
7467 bool assert_exist = mdr->more()->peer_update_journaled;
7468 mdcache->finish_uncommitted_peer(mdr->reqid, assert_exist);
7469 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_COMMITTED);
7470 mds->send_message_mds(req, mdr->peer_to_mds);
7471 mdcache->request_finish(mdr);
7472 }
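     // Editorial note (not part of the original source): OP_COMMITTED tells the leader
     // that this peer has durably committed its half of the update, so the leader can
     // drop this peer from its uncommitted-leader bookkeeping for the request.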
7473
7474 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
7475 MutationRef mut;
7476 map<client_t,ref_t<MClientSnap>> splits;
7477 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
7478 map<client_t,ref_t<MClientSnap>>&& _splits) :
7479 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
7480 }
7481 void finish(int r) override {
7482 server->_link_rollback_finish(mut, mdr, splits);
7483 }
7484 };
7485
7486 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7487 {
7488 link_rollback rollback;
7489 auto p = rbl.cbegin();
7490 decode(rollback, p);
7491
7492 dout(10) << "do_link_rollback on " << rollback.reqid
7493 << (rollback.was_inc ? " inc":" dec")
7494 << " ino " << rollback.ino
7495 << dendl;
7496
7497 ceph_assert(g_conf()->mds_kill_link_at != 9);
7498
7499 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
7500 ceph_assert(mdr || mds->is_resolve());
7501
7502 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7503 mut->ls = mds->mdlog->get_current_segment();
7504
7505 CInode *in = mdcache->get_inode(rollback.ino);
7506 ceph_assert(in);
7507 dout(10) << " target is " << *in << dendl;
7508 ceph_assert(!in->is_projected()); // live peer request holds versionlock xlock.
7509
7510 auto pi = in->project_inode(mut);
7511 pi.inode->version = in->pre_dirty();
7512
7513 // parent dir rctime
7514 CDir *parent = in->get_projected_parent_dn()->get_dir();
7515 auto pf = parent->project_fnode(mut);
7516 pf->version = parent->pre_dirty();
7517 if (pf->fragstat.mtime == pi.inode->ctime) {
7518 pf->fragstat.mtime = rollback.old_dir_mtime;
7519 if (pf->rstat.rctime == pi.inode->ctime)
7520 pf->rstat.rctime = rollback.old_dir_rctime;
7521 mut->add_updated_lock(&parent->get_inode()->filelock);
7522 mut->add_updated_lock(&parent->get_inode()->nestlock);
7523 }
7524
7525 // inode
7526 pi.inode->ctime = rollback.old_ctime;
7527 if (rollback.was_inc)
7528 pi.inode->nlink--;
7529 else
7530 pi.inode->nlink++;
7531
7532 map<client_t,ref_t<MClientSnap>> splits;
7533 if (rollback.snapbl.length() && in->snaprealm) {
7534 bool hadrealm;
7535 auto p = rollback.snapbl.cbegin();
7536 decode(hadrealm, p);
7537 if (hadrealm) {
7538 if (!mds->is_resolve()) {
7539 sr_t *new_srnode = new sr_t();
7540 decode(*new_srnode, p);
7541 in->project_snaprealm(new_srnode);
7542 } else {
7543 decode(in->snaprealm->srnode, p);
7544 }
7545 } else {
7546 SnapRealm *realm = parent->get_inode()->find_snaprealm();
7547 if (!mds->is_resolve())
7548 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
7549 in->project_snaprealm(NULL);
7550 }
7551 }
7552
7553 // journal it
7554 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
7555 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
7556 mdlog->start_entry(le);
7557 le->commit.add_dir_context(parent);
7558 le->commit.add_dir(parent, true);
7559 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
7560
7561 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
7562 mdr, __func__);
7563 mdlog->flush();
7564 }
7565
7566 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
7567 map<client_t,ref_t<MClientSnap>>& splits)
7568 {
7569 dout(10) << "_link_rollback_finish" << dendl;
7570
7571 ceph_assert(g_conf()->mds_kill_link_at != 10);
7572
7573 mut->apply();
7574
7575 if (!mds->is_resolve())
7576 mdcache->send_snaps(splits);
7577
7578 if (mdr)
7579 mdcache->request_finish(mdr);
7580
7581 mdcache->finish_rollback(mut->reqid, mdr);
7582
7583 mut->cleanup();
7584 }
7585
7586
7587 void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
7588 {
7589 dout(10) << "handle_peer_link_prep_ack " << *mdr
7590 << " " << *m << dendl;
7591 mds_rank_t from = mds_rank_t(m->get_source().num());
7592
7593 ceph_assert(g_conf()->mds_kill_link_at != 11);
7594
7595 // note peer
7596 mdr->more()->peers.insert(from);
7597
7598 // witnessed!
7599 ceph_assert(mdr->more()->witnessed.count(from) == 0);
7600 mdr->more()->witnessed.insert(from);
7601 ceph_assert(!m->is_not_journaled());
7602 mdr->more()->has_journaled_peers = true;
7603
7604 // remove from waiting list
7605 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7606 mdr->more()->waiting_on_peer.erase(from);
7607
7608 ceph_assert(mdr->more()->waiting_on_peer.empty());
7609
7610 dispatch_client_request(mdr); // go again!
7611 }
7612
7613
7614
7615
7616
7617 // UNLINK
7618
7619 void Server::handle_client_unlink(MDRequestRef& mdr)
7620 {
7621 const cref_t<MClientRequest> &req = mdr->client_request;
7622 client_t client = mdr->get_client();
7623
7624 // rmdir or unlink?
7625 bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
7626
7627 if (rmdir)
7628 mdr->disable_lock_cache();
7629 CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
7630 if (!dn)
7631 return;
7632
7633 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
7634 ceph_assert(!dnl->is_null());
7635 CInode *in = dnl->get_inode();
7636
7637 if (rmdir) {
7638 dout(7) << "handle_client_rmdir on " << *dn << dendl;
7639 } else {
7640 dout(7) << "handle_client_unlink on " << *dn << dendl;
7641 }
7642 dout(7) << "dn links to " << *in << dendl;
7643
7644 // rmdir vs is_dir
7645 if (in->is_dir()) {
7646 if (rmdir) {
7647 // do empty directory checks
7648 if (_dir_is_nonempty_unlocked(mdr, in)) {
7649 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7650 return;
7651 }
7652 } else {
7653 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
7654 respond_to_request(mdr, -CEPHFS_EISDIR);
7655 return;
7656 }
7657 } else {
7658 if (rmdir) {
7659 // unlink
7660 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
7661 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7662 return;
7663 }
7664 }
7665
7666 CInode *diri = dn->get_dir()->get_inode();
7667 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7668 if (!check_access(mdr, diri, MAY_WRITE))
7669 return;
7670 }
7671
7672 // -- create stray dentry? --
7673 CDentry *straydn = NULL;
7674 if (dnl->is_primary()) {
7675 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
7676 if (!straydn)
7677 return;
7678 dout(10) << " straydn is " << *straydn << dendl;
7679 } else if (mdr->straydn) {
7680 mdr->unpin(mdr->straydn);
7681 mdr->straydn = NULL;
7682 }
7683
7684 // lock
7685 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7686 MutationImpl::LockOpVec lov;
7687
7688 lov.add_xlock(&in->linklock);
7689 lov.add_xlock(&in->snaplock);
7690 if (in->is_dir())
7691 lov.add_rdlock(&in->filelock); // to verify it's empty
7692
7693 if (straydn) {
7694 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7695 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7696 lov.add_xlock(&straydn->lock);
7697 }
7698
7699 if (!mds->locker->acquire_locks(mdr, lov))
7700 return;
7701
7702 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7703 }
7704
7705 if (in->is_dir() &&
7706 _dir_is_nonempty(mdr, in)) {
7707 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7708 return;
7709 }
7710
7711 if (straydn)
7712 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7713
7714 if (!mdr->more()->desti_srnode) {
7715 if (in->is_projected_snaprealm_global()) {
7716 sr_t *new_srnode = in->prepare_new_srnode(0);
7717 in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
7718 // dropping the last linkage or dropping the last remote linkage,
7719 // detach the inode from the global snaprealm
7720 auto nlink = in->get_projected_inode()->nlink;
7721 if (nlink == 1 ||
7722 (nlink == 2 && !dnl->is_primary() &&
7723 !in->get_projected_parent_dir()->inode->is_stray()))
7724 in->clear_snaprealm_global(new_srnode);
7725 mdr->more()->desti_srnode = new_srnode;
7726 } else if (dnl->is_primary()) {
7727 // prepare snaprealm blob for peer request
7728 SnapRealm *realm = in->find_snaprealm();
7729 snapid_t follows = realm->get_newest_seq();
7730 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
7731 sr_t *new_srnode = in->prepare_new_srnode(follows);
7732 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7733 mdr->more()->desti_srnode = new_srnode;
7734 }
7735 }
7736 }
7737
7738 // yay!
7739 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
7740 // subtree root auths need to be witnesses
7741 set<mds_rank_t> witnesses;
7742 in->list_replicas(witnesses);
7743 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7744
7745 for (set<mds_rank_t>::iterator p = witnesses.begin();
7746 p != witnesses.end();
7747 ++p) {
7748 if (mdr->more()->witnessed.count(*p)) {
7749 dout(10) << " already witnessed by mds." << *p << dendl;
7750 } else if (mdr->more()->waiting_on_peer.count(*p)) {
7751 dout(10) << " already waiting on witness mds." << *p << dendl;
7752 } else {
7753 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
7754 return;
7755 }
7756 }
7757 if (!mdr->more()->waiting_on_peer.empty())
7758 return; // we're waiting for a witness.
7759 }
7760
7761 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
7762 mds->locker->create_lock_cache(mdr, diri);
7763
7764 // ok!
7765 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
7766 _link_remote(mdr, false, dn, dnl->get_inode());
7767 else
7768 _unlink_local(mdr, dn, straydn);
7769 }
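     // Editorial note (not part of the original source): when the dentry being removed
     // is the primary link, the inode is relinked under a stray dentry rather than
     // destroyed immediately; after the update is journaled, notify_stray() (see
     // _unlink_local_finish()) lets the cache decide whether the stray can be purged
     // or should be reintegrated.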
7770
7771 class C_MDS_unlink_local_finish : public ServerLogContext {
7772 CDentry *dn;
7773 CDentry *straydn;
7774 version_t dnpv; // deleted dentry
7775 public:
7776 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
7777 ServerLogContext(s, r), dn(d), straydn(sd),
7778 dnpv(d->get_projected_version()) {}
7779 void finish(int r) override {
7780 ceph_assert(r == 0);
7781 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
7782 }
7783 };
7784
7785 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7786 {
7787 dout(10) << "_unlink_local " << *dn << dendl;
7788
7789 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7790 CInode *in = dnl->get_inode();
7791
7792
7793 // ok, let's do it.
7794 mdr->ls = mdlog->get_current_segment();
7795
7796 // prepare log entry
7797 EUpdate *le = new EUpdate(mdlog, "unlink_local");
7798 mdlog->start_entry(le);
7799 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7800 if (!mdr->more()->witnessed.empty()) {
7801 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7802 le->reqid = mdr->reqid;
7803 le->had_peers = true;
7804 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7805 }
7806
7807 if (straydn) {
7808 ceph_assert(dnl->is_primary());
7809 straydn->push_projected_linkage(in);
7810 }
7811
7812 // the unlinked dentry
7813 dn->pre_dirty();
7814
7815 auto pi = in->project_inode(mdr);
7816 {
7817 std::string t;
7818 dn->make_path_string(t, true);
7819 pi.inode->stray_prior_path = std::move(t);
7820 }
7821 pi.inode->version = in->pre_dirty();
7822 pi.inode->ctime = mdr->get_op_stamp();
7823 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7824 pi.inode->rstat.rctime = mdr->get_op_stamp();
7825 pi.inode->change_attr++;
7826 pi.inode->nlink--;
7827 if (pi.inode->nlink == 0)
7828 in->state_set(CInode::STATE_ORPHAN);
7829
7830 if (mdr->more()->desti_srnode) {
7831 auto& desti_srnode = mdr->more()->desti_srnode;
7832 in->project_snaprealm(desti_srnode);
7833 desti_srnode = NULL;
7834 }
7835
7836 if (straydn) {
7837 // will manually pop projected inode
7838
7839 // primary link. add stray dentry.
7840 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
7841 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7842
7843 pi.inode->update_backtrace();
7844 le->metablob.add_primary_dentry(straydn, in, true, true);
7845 } else {
7846 // remote link. update remote inode.
7847 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
7848 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
7849 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
7850 }
7851
7852 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7853 le->metablob.add_null_dentry(dn, true);
7854
7855 if (in->is_dir()) {
7856 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7857 le->metablob.renamed_dirino = in->ino();
7858 }
7859
7860 dn->push_projected_linkage();
7861
7862 if (straydn) {
7863 ceph_assert(in->first <= straydn->first);
7864 in->first = straydn->first;
7865 }
7866
7867 if (in->is_dir()) {
7868 ceph_assert(straydn);
7869 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7870 }
7871
7872 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
7873 }
7874
7875 void Server::_unlink_local_finish(MDRequestRef& mdr,
7876 CDentry *dn, CDentry *straydn,
7877 version_t dnpv)
7878 {
7879 dout(10) << "_unlink_local_finish " << *dn << dendl;
7880
7881 if (!mdr->more()->witnessed.empty())
7882 mdcache->logged_leader_update(mdr->reqid);
7883
7884 CInode *strayin = NULL;
7885 bool hadrealm = false;
7886 if (straydn) {
7887 // if there is newly created snaprealm, need to split old snaprealm's
7888 // inodes_with_caps. So pop snaprealm before linkage changes.
7889 strayin = dn->get_linkage()->get_inode();
7890 hadrealm = strayin->snaprealm ? true : false;
7891 strayin->early_pop_projected_snaprealm();
7892 }
7893
7894 // unlink main dentry
7895 dn->get_dir()->unlink_inode(dn);
7896 dn->pop_projected_linkage();
7897 dn->mark_dirty(dnpv, mdr->ls);
7898
7899 // relink as stray? (i.e. was primary link?)
7900 if (straydn) {
7901 dout(20) << " straydn is " << *straydn << dendl;
7902 straydn->pop_projected_linkage();
7903 mdcache->touch_dentry_bottom(straydn);
7904 }
7905
7906 mdr->apply();
7907
7908 mdcache->send_dentry_unlink(dn, straydn, mdr);
7909
7910 if (straydn) {
7911 // update subtree map?
7912 if (strayin->is_dir())
7913 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
7914
7915 if (strayin->snaprealm && !hadrealm)
7916 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
7917 }
7918
7919 // bump pop
7920 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7921
7922 // reply
7923 respond_to_request(mdr, 0);
7924
7925 // removing a new dn?
7926 dn->get_dir()->try_remove_unlinked_dn(dn);
7927
7928 // clean up ?
7929 // respond_to_request() drops locks. So stray reintegration can race with us.
7930 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7931 // Tip off the MDCache that this dentry is a stray that
7932 // might be eligible for purge.
7933 mdcache->notify_stray(straydn);
7934 }
7935 }
7936
7937 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
7938 {
7939 if (mds->is_cluster_degraded() &&
7940 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7941 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
7942 if (mdr->more()->waiting_on_peer.empty())
7943 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7944 return false;
7945 }
7946
7947 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
7948 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREP);
7949 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
7950 for (auto dn : trace)
7951 req->srcdnpath.push_dentry(dn->get_name());
7952 mdcache->encode_replica_stray(straydn, who, req->straybl);
7953 if (mdr->more()->desti_srnode)
7954 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
7955
7956 req->op_stamp = mdr->get_op_stamp();
7957 mds->send_message_mds(req, who);
7958
7959 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
7960 mdr->more()->waiting_on_peer.insert(who);
7961 return true;
7962 }
7963
7964 struct C_MDS_PeerRmdirPrep : public ServerLogContext {
7965 CDentry *dn, *straydn;
7966 C_MDS_PeerRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
7967 : ServerLogContext(s, r), dn(d), straydn(st) {}
7968 void finish(int r) override {
7969 server->_logged_peer_rmdir(mdr, dn, straydn);
7970 }
7971 };
7972
7973 struct C_MDS_PeerRmdirCommit : public ServerContext {
7974 MDRequestRef mdr;
7975 CDentry *straydn;
7976 C_MDS_PeerRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
7977 : ServerContext(s), mdr(r), straydn(sd) { }
7978 void finish(int r) override {
7979 server->_commit_peer_rmdir(mdr, r, straydn);
7980 }
7981 };
7982
7983 void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
7984 {
7985 dout(10) << "handle_peer_rmdir_prep " << *mdr
7986 << " " << mdr->peer_request->srcdnpath
7987 << " to " << mdr->peer_request->destdnpath
7988 << dendl;
7989
7990 vector<CDentry*> trace;
7991 filepath srcpath(mdr->peer_request->srcdnpath);
7992 dout(10) << " src " << srcpath << dendl;
7993 CInode *in;
7994 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
7995 int r = mdcache->path_traverse(mdr, cf, srcpath,
7996 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
7997 &trace, &in);
7998 if (r > 0) return;
7999 if (r == -CEPHFS_ESTALE) {
8000 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
8001 mdr->peer_to_mds, true);
8002 return;
8003 }
8004 ceph_assert(r == 0);
8005 CDentry *dn = trace.back();
8006 dout(10) << " dn " << *dn << dendl;
8007 mdr->pin(dn);
8008
8009 ceph_assert(mdr->straydn);
8010 CDentry *straydn = mdr->straydn;
8011 dout(10) << " straydn " << *straydn << dendl;
8012
8013 mdr->set_op_stamp(mdr->peer_request->op_stamp);
8014
8015 rmdir_rollback rollback;
8016 rollback.reqid = mdr->reqid;
8017 rollback.src_dir = dn->get_dir()->dirfrag();
8018 rollback.src_dname = dn->get_name();
8019 rollback.dest_dir = straydn->get_dir()->dirfrag();
8020 rollback.dest_dname = straydn->get_name();
8021 if (mdr->peer_request->desti_snapbl.length()) {
8022 if (in->snaprealm) {
8023 encode(true, rollback.snapbl);
8024 in->encode_snap_blob(rollback.snapbl);
8025 } else {
8026 encode(false, rollback.snapbl);
8027 }
8028 }
8029 encode(rollback, mdr->more()->rollback_bl);
8030 // FIXME: rollback snaprealm
8031 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
8032
8033 // set up commit waiter
8034 mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);
8035
8036 straydn->push_projected_linkage(in);
8037 dn->push_projected_linkage();
8038
8039 ceph_assert(straydn->first >= in->first);
8040 in->first = straydn->first;
8041
8042 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
8043 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
8044 _logged_peer_rmdir(mdr, dn, straydn);
8045 return;
8046 }
8047
8048 mdr->ls = mdlog->get_current_segment();
8049 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
8050 EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
8051 mdlog->start_entry(le);
8052 le->rollback = mdr->more()->rollback_bl;
8053
8054 le->commit.add_dir_context(straydn->get_dir());
8055 le->commit.add_primary_dentry(straydn, in, true);
8056 // peer: no need to journal original dentry
8057
8058 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8059 le->commit.renamed_dirino = in->ino();
8060
8061 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
8062 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
8063
8064 mdr->more()->peer_update_journaled = true;
8065 submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
8066 mdr, __func__);
8067 mdlog->flush();
8068 }
8069
8070 void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
8071 {
8072 dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
8073 CInode *in = dn->get_linkage()->get_inode();
8074
8075 bool new_realm;
8076 if (mdr->peer_request->desti_snapbl.length()) {
8077 new_realm = !in->snaprealm;
8078 in->decode_snap_blob(mdr->peer_request->desti_snapbl);
8079 ceph_assert(in->snaprealm);
8080 } else {
8081 new_realm = false;
8082 }
8083
8084 // update our cache now, so we are consistent with what is in the journal
8085 // when we journal a subtree map
8086 dn->get_dir()->unlink_inode(dn);
8087 straydn->pop_projected_linkage();
8088 dn->pop_projected_linkage();
8089
8090 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->peer_update_journaled);
8091
8092 if (new_realm)
8093 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
8094
8095 // done.
8096 mdr->reset_peer_request();
8097 mdr->straydn = 0;
8098
8099 if (!mdr->aborted) {
8100 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREPACK);
8101 if (!mdr->more()->peer_update_journaled)
8102 reply->mark_not_journaled();
8103 mds->send_message_mds(reply, mdr->peer_to_mds);
8104 } else {
8105 dout(10) << " abort flag set, finishing" << dendl;
8106 mdcache->request_finish(mdr);
8107 }
8108 }
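     // Editorial note (not part of the original source): when this rank held no auth
     // subtree under the removed directory, the prepare above is acked without being
     // journaled and the ack is flagged with mark_not_journaled(), so the leader knows
     // the prepare was not journaled on this peer.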
8109
8110 void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
8111 {
8112 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
8113 << " " << *ack << dendl;
8114
8115 mds_rank_t from = mds_rank_t(ack->get_source().num());
8116
8117 mdr->more()->peers.insert(from);
8118 mdr->more()->witnessed.insert(from);
8119 if (!ack->is_not_journaled())
8120 mdr->more()->has_journaled_peers = true;
8121
8122 // remove from waiting list
8123 ceph_assert(mdr->more()->waiting_on_peer.count(from));
8124 mdr->more()->waiting_on_peer.erase(from);
8125
8126 if (mdr->more()->waiting_on_peer.empty())
8127 dispatch_client_request(mdr); // go again!
8128 else
8129 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
8130 }
8131
8132 void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
8133 {
8134 dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;
8135
8136 if (r == 0) {
8137 if (mdr->more()->peer_update_journaled) {
8138 CInode *strayin = straydn->get_projected_linkage()->get_inode();
8139 if (strayin && !strayin->snaprealm)
8140 mdcache->clear_dirty_bits_for_stray(strayin);
8141 }
8142
8143 mdr->cleanup();
8144
8145 if (mdr->more()->peer_update_journaled) {
8146 // write a commit to the journal
8147 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
8148 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
8149 EPeerUpdate::RMDIR);
8150 mdlog->start_entry(le);
8151 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
8152 mdlog->flush();
8153 } else {
8154 _committed_peer(mdr);
8155 }
8156 } else {
8157 // abort
8158 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
8159 }
8160 }
8161
8162 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
8163 metareqid_t reqid;
8164 CDentry *dn;
8165 CDentry *straydn;
8166 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
8167 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
8168 void finish(int r) override {
8169 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
8170 }
8171 };
8172
8173 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
8174 {
8175 // unlike the other rollback methods, the rmdir rollback is only
8176 // needed to record the subtree changes in the journal for inode
8177 // replicas who are auth for empty dirfrags. no actual changes to
8178 // the file system are taking place here, so there is no Mutation.
8179
8180 rmdir_rollback rollback;
8181 auto p = rbl.cbegin();
8182 decode(rollback, p);
8183
8184 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
8185 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
8186 ceph_assert(mdr || mds->is_resolve());
8187
8188 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
8189 if (!dir)
8190 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
8191 ceph_assert(dir);
8192 CDentry *dn = dir->lookup(rollback.src_dname);
8193 ceph_assert(dn);
8194 dout(10) << " dn " << *dn << dendl;
8195 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
8196 ceph_assert(straydir);
8197 CDentry *straydn = straydir->lookup(rollback.dest_dname);
8198 ceph_assert(straydn);
8199 dout(10) << " straydn " << *straydn << dendl;
8200 CInode *in = straydn->get_linkage()->get_inode();
8201
8202 dn->push_projected_linkage(in);
8203 straydn->push_projected_linkage();
8204
8205 if (rollback.snapbl.length() && in->snaprealm) {
8206 bool hadrealm;
8207 auto p = rollback.snapbl.cbegin();
8208 decode(hadrealm, p);
8209 if (hadrealm) {
8210 decode(in->snaprealm->srnode, p);
8211 } else {
8212 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
8213 }
8214 }
8215
8216 if (mdr && !mdr->more()->peer_update_journaled) {
8217 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
8218
8219 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
8220 return;
8221 }
8222
8223
8224 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
8225 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
8226 mdlog->start_entry(le);
8227
8228 le->commit.add_dir_context(dn->get_dir());
8229 le->commit.add_primary_dentry(dn, in, true);
8230 // peer: no need to journal straydn
8231
8232 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8233 le->commit.renamed_dirino = in->ino();
8234
8235 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
8236
8237 submit_mdlog_entry(le,
8238 new C_MDS_LoggedRmdirRollback(this, mdr, rollback.reqid,
8239 dn, straydn),
8240 mdr, __func__);
8241 mdlog->flush();
8242 }
8243
8244 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
8245 {
8246 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
8247
8248 straydn->get_dir()->unlink_inode(straydn);
8249 dn->pop_projected_linkage();
8250 straydn->pop_projected_linkage();
8251
8252 CInode *in = dn->get_linkage()->get_inode();
8253 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
8254 !mdr || mdr->more()->peer_update_journaled);
8255
8256 if (mds->is_resolve()) {
8257 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
8258 mdcache->try_trim_non_auth_subtree(root);
8259 }
8260
8261 if (mdr)
8262 mdcache->request_finish(mdr);
8263
8264 mdcache->finish_rollback(reqid, mdr);
8265 }
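// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch only, not part of the original source.
// The rollback path above leans on the MDS "projected" state pattern:
// push_projected_linkage() stages the restored link while the rollback event
// is journaled, and _rmdir_rollback_finish() pops (applies) it once the entry
// is durable. A minimal, self-contained version of that pattern, using only
// standard-library types and hypothetical names:
#include <deque>
#include <utility>
namespace mds_server_sketch {
  template <typename T>
  class projected_value {
    T current_;
    std::deque<T> projected_;   // oldest staged change at the front
  public:
    explicit projected_value(T v) : current_(std::move(v)) {}
    // stage a pending change (e.g. the dentry's restored inode linkage)
    void push_projected(T v) { projected_.push_back(std::move(v)); }
    // newest staged value, falling back to the applied one
    const T& get_projected() const {
      return projected_.empty() ? current_ : projected_.back();
    }
    // apply the oldest staged change once its journal entry is safe
    void pop_projected() {
      current_ = std::move(projected_.front());
      projected_.pop_front();
    }
    const T& get_current() const { return current_; }
  };
}
// ---------------------------------------------------------------------------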
8266
8267
8268 /** _dir_is_nonempty[_unlocked]
8269 *
8270 * check if a directory is non-empty (i.e. we can rmdir it).
8271 *
8272 * the unlocked variant of this is a fastpath check. we can't really be
8273 * sure until we rdlock the filelock.
8274 */
8275 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
8276 {
8277 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
8278 ceph_assert(in->is_auth());
8279
8280 if (in->filelock.is_cached())
8281 return false; // there can be pending async create/unlink. don't know.
8282 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
8283 return true; // in a snapshot!
8284
8285 auto&& ls = in->get_dirfrags();
8286 for (const auto& dir : ls) {
8287 // is the frag obviously non-empty?
8288 if (dir->is_auth()) {
8289 if (dir->get_projected_fnode()->fragstat.size()) {
8290 dout(10) << "dir_is_nonempty_unlocked dirstat has "
8291 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
8292 return true;
8293 }
8294 }
8295 }
8296
8297 return false;
8298 }
8299
8300 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
8301 {
8302 dout(10) << "dir_is_nonempty " << *in << dendl;
8303 ceph_assert(in->is_auth());
8304 ceph_assert(in->filelock.can_read(mdr->get_client()));
8305
8306 frag_info_t dirstat;
8307 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
8308
8309 auto&& ls = in->get_dirfrags();
8310 for (const auto& dir : ls) {
8311 const auto& pf = dir->get_projected_fnode();
8312 if (pf->fragstat.size()) {
8313 dout(10) << "dir_is_nonempty dirstat has "
8314 << pf->fragstat.size() << " items " << *dir << dendl;
8315 return true;
8316 }
8317
8318 if (pf->accounted_fragstat.version == dirstat_version)
8319 dirstat.add(pf->accounted_fragstat);
8320 else
8321 dirstat.add(pf->fragstat);
8322 }
8323
8324 return dirstat.size() != in->get_projected_inode()->dirstat.size();
8325 }
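// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch only, not part of the original source.
// A simplified, self-contained model of the two emptiness checks above: any
// fragment with live entries means non-empty; otherwise the per-fragment
// counts (accounted stats when their version is current, live stats
// otherwise) are summed and compared against the inode-level dirstat. All
// names here are hypothetical stand-ins for CDir/CInode state.
#include <cstdint>
#include <vector>
namespace mds_server_sketch {
  struct frag_counts {
    uint64_t live = 0;              // fragstat.size(): current entry count
    uint64_t accounted = 0;         // accounted_fragstat.size()
    uint64_t accounted_version = 0; // version the accounted stats were taken at
  };
  // Returns true if the directory may be non-empty. A mismatch between the
  // summed fragment counts and dirstat means rstat propagation is still in
  // flight, so the directory is conservatively treated as non-empty.
  inline bool dir_maybe_nonempty(const std::vector<frag_counts>& frags,
                                 uint64_t dirstat_size,
                                 uint64_t dirstat_version) {
    uint64_t sum = 0;
    for (const auto& f : frags) {
      if (f.live)
        return true;
      sum += (f.accounted_version == dirstat_version) ? f.accounted : f.live;
    }
    return sum != dirstat_size;
  }
}
// ---------------------------------------------------------------------------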
8326
8327
8328 // ======================================================
8329
8330
8331 class C_MDS_rename_finish : public ServerLogContext {
8332 CDentry *srcdn;
8333 CDentry *destdn;
8334 CDentry *straydn;
8335 public:
8336 C_MDS_rename_finish(Server *s, MDRequestRef& r,
8337 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
8338 ServerLogContext(s, r),
8339 srcdn(sdn), destdn(ddn), straydn(stdn) { }
8340 void finish(int r) override {
8341 ceph_assert(r == 0);
8342 server->_rename_finish(mdr, srcdn, destdn, straydn);
8343 }
8344 };
8345
8346
8347 /** handle_client_rename
8348 *
8349 * rename leader is the destdn auth. this is because cached inodes
8350 * must remain connected. thus, any replica of srci, must also
8351 * replicate destdn, and possibly straydn, so that srci (and
8352 * destdn->inode) remain connected during the rename.
8353 *
8354 * to do this, we freeze srci, then leader (destdn auth) verifies that
8355 * all other nodes have also replicated destdn and straydn. note that
8356 * destdn replicas need not also replicate srci. this only works when
8357 * destdn is leader.
8358 *
8359 * This function takes responsibility for the passed mdr.
8360 */
8361 void Server::handle_client_rename(MDRequestRef& mdr)
8362 {
8363 const auto& req = mdr->client_request;
8364 dout(7) << "handle_client_rename " << *req << dendl;
8365
8366 filepath destpath = req->get_filepath();
8367 filepath srcpath = req->get_filepath2();
8368 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
8369 respond_to_request(mdr, -CEPHFS_EBUSY);
8370 return;
8371 }
8372
8373 if (req->get_alternate_name().size() > alternate_name_max) {
8374 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
8375 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
8376 return;
8377 }
8378
8379 auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
8380 if (!destdn)
8381 return;
8382
8383 dout(10) << " destdn " << *destdn << dendl;
8384 CDir *destdir = destdn->get_dir();
8385 ceph_assert(destdir->is_auth());
8386 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8387
8388 dout(10) << " srcdn " << *srcdn << dendl;
8389 CDir *srcdir = srcdn->get_dir();
8390 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8391 CInode *srci = srcdnl->get_inode();
8392 dout(10) << " srci " << *srci << dendl;
8393
8394 // -- some sanity checks --
8395 if (destdn == srcdn) {
8396 dout(7) << "rename src=dest, noop" << dendl;
8397 respond_to_request(mdr, 0);
8398 return;
8399 }
8400
8401 // dest a child of src?
8402 // e.g. mv /usr /usr/foo
8403 if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
8404 dout(7) << "cannot rename item to be a child of itself" << dendl;
8405 respond_to_request(mdr, -CEPHFS_EINVAL);
8406 return;
8407 }
8408
8409 // is this a stray migration, reintegration or merge? (sanity checks!)
8410 if (mdr->reqid.name.is_mds() &&
8411 !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
8412 MDS_INO_IS_STRAY(destpath.get_ino())) &&
8413 !(destdnl->is_remote() &&
8414 destdnl->get_remote_ino() == srci->ino())) {
8415 respond_to_request(mdr, -CEPHFS_EINVAL); // actually, this won't reply, but whatev.
8416 return;
8417 }
8418
8419 CInode *oldin = 0;
8420 if (!destdnl->is_null()) {
8421 //dout(10) << "dest dn exists " << *destdn << dendl;
8422 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
8423 if (!oldin) return;
8424 dout(10) << " oldin " << *oldin << dendl;
8425
8426 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8427 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
8428 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8429 return;
8430 }
8431
8432 // mv /some/thing /to/some/existing_other_thing
8433 if (oldin->is_dir() && !srci->is_dir()) {
8434 respond_to_request(mdr, -CEPHFS_EISDIR);
8435 return;
8436 }
8437 if (!oldin->is_dir() && srci->is_dir()) {
8438 respond_to_request(mdr, -CEPHFS_ENOTDIR);
8439 return;
8440 }
8441 if (srci == oldin && !srcdir->inode->is_stray()) {
8442 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
8443 return;
8444 }
8445 if (destdn->get_alternate_name() != req->get_alternate_name()) {
8446 /* the dentry exists but the alternate_names do not match, fail... */
8447 respond_to_request(mdr, -CEPHFS_EINVAL);
8448 return;
8449 }
8450 }
8451
8452 vector<CDentry*>& srctrace = mdr->dn[1];
8453 vector<CDentry*>& desttrace = mdr->dn[0];
8454
8455 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8456 if (destpath.get_ino() != srcpath.get_ino() &&
8457 !(req->get_source().is_mds() &&
8458 MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
8459 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
8460 CInode *destbase = desttrace[0]->get_dir()->get_inode();
8461 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8462 while (srcbase != destbase &&
8463 !srcbase->is_projected_ancestor_of(destbase)) {
8464 CDentry *pdn = srcbase->get_projected_parent_dn();
8465 srctrace.insert(srctrace.begin(), pdn);
8466 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
8467 srcbase = pdn->get_dir()->get_inode();
8468 }
8469
8470 // then, extend destpath until it shares the same parent inode as srcpath.
8471 while (destbase != srcbase) {
8472 CDentry *pdn = destbase->get_projected_parent_dn();
8473 desttrace.insert(desttrace.begin(), pdn);
8474 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
8475 destbase = pdn->get_dir()->get_inode();
8476 }
8477 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
8478 }
8479
8480
8481 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8482 if (linkmerge)
8483 dout(10) << " this is a link merge" << dendl;
8484
8485 // -- create stray dentry? --
8486 CDentry *straydn = NULL;
8487 if (destdnl->is_primary() && !linkmerge) {
8488 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
8489 if (!straydn)
8490 return;
8491 dout(10) << " straydn is " << *straydn << dendl;
8492 } else if (mdr->straydn) {
8493 mdr->unpin(mdr->straydn);
8494 mdr->straydn = NULL;
8495 }
8496
8497
8498 // -- locks --
8499 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
8500 MutationImpl::LockOpVec lov;
8501
8502 // we need to update srci's ctime. xlock its least contended lock to do that...
8503 lov.add_xlock(&srci->linklock);
8504 lov.add_xlock(&srci->snaplock);
8505
8506 if (oldin) {
8507 // xlock oldin (for nlink--)
8508 lov.add_xlock(&oldin->linklock);
8509 lov.add_xlock(&oldin->snaplock);
8510 if (oldin->is_dir()) {
8511 ceph_assert(srci->is_dir());
8512 lov.add_rdlock(&oldin->filelock); // to verify it's empty
8513
8514 // adjust locking order?
8515 int cmp = mdr->compare_paths();
8516 if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
8517 std::reverse(lov.begin(), lov.end());
8518 } else {
8519 ceph_assert(!srci->is_dir());
8520 // adjust locking order;
8521 if (srci->ino() > oldin->ino())
8522 std::reverse(lov.begin(), lov.end());
8523 }
8524 }
8525
8526 // straydn?
8527 if (straydn) {
8528 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
8529 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
8530 lov.add_xlock(&straydn->lock);
8531 }
8532
8533 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
8534 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
8535 return;
8536
8537 mdr->locking_state |= MutationImpl::ALL_LOCKED;
8538 }
8539
8540 if (linkmerge)
8541 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
8542
8543 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
8544 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
8545 return;
8546
8547 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
8548 return;
8549
8550 if (!linkmerge && !check_fragment_space(mdr, destdn->get_dir()))
8551 return;
8552
8553 if (!linkmerge && !check_dir_max_entries(mdr, destdn->get_dir()))
8554 return;
8555
8556 if (!check_access(mdr, srci, MAY_WRITE))
8557 return;
8558 }
8559
8560 // with read lock, really verify oldin is empty
8561 if (oldin &&
8562 oldin->is_dir() &&
8563 _dir_is_nonempty(mdr, oldin)) {
8564 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8565 return;
8566 }
8567
8568 /* project_snaprealm_past_parent() will do this job
8569 *
8570 // moving between snaprealms?
8571 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8572 SnapRealm *srcrealm = srci->find_snaprealm();
8573 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8574 if (srcrealm != destrealm &&
8575 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8576 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8577 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8578 mdcache->snaprealm_create(mdr, srci);
8579 return;
8580 }
8581 }
8582 */
8583
8584 SnapRealm *dest_realm = nullptr;
8585 SnapRealm *src_realm = nullptr;
8586 if (!linkmerge) {
8587 dest_realm = destdir->inode->find_snaprealm();
8588 if (srcdir->inode == destdir->inode)
8589 src_realm = dest_realm;
8590 else
8591 src_realm = srcdir->inode->find_snaprealm();
8592 if (src_realm != dest_realm &&
8593 src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
8594 respond_to_request(mdr, -CEPHFS_EXDEV);
8595 return;
8596 }
8597 }
8598
8599 ceph_assert(g_conf()->mds_kill_rename_at != 1);
8600
8601 // -- open all srcdn inode frags, if any --
8602 // we need these open so that auth can properly delegate from inode to dirfrags
8603 // after the inode is _ours_.
8604 if (srcdnl->is_primary() &&
8605 !srcdn->is_auth() &&
8606 srci->is_dir()) {
8607 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
8608 mdr->set_stickydirs(srci);
8609
8610 frag_vec_t leaves;
8611 srci->dirfragtree.get_leaves(leaves);
8612 for (const auto& leaf : leaves) {
8613 CDir *dir = srci->get_dirfrag(leaf);
8614 if (!dir) {
8615 dout(10) << " opening " << leaf << " under " << *srci << dendl;
8616 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
8617 return;
8618 }
8619 }
8620 }
8621
8622 // -- prepare snaprealm ---
8623
8624 if (linkmerge) {
8625 if (!mdr->more()->srci_srnode &&
8626 srci->get_projected_inode()->nlink == 1 &&
8627 srci->is_projected_snaprealm_global()) {
8628 sr_t *new_srnode = srci->prepare_new_srnode(0);
8629 srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
8630
8631 srci->clear_snaprealm_global(new_srnode);
8632 mdr->more()->srci_srnode = new_srnode;
8633 }
8634 } else {
8635 if (oldin && !mdr->more()->desti_srnode) {
8636 if (oldin->is_projected_snaprealm_global()) {
8637 sr_t *new_srnode = oldin->prepare_new_srnode(0);
8638 oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
8639 // dropping the last linkage or dropping the last remote linkage,
8640 // detach the inode from the global snaprealm
8641 auto nlink = oldin->get_projected_inode()->nlink;
8642 if (nlink == 1 ||
8643 (nlink == 2 && !destdnl->is_primary() &&
8644 !oldin->get_projected_parent_dir()->inode->is_stray()))
8645 oldin->clear_snaprealm_global(new_srnode);
8646 mdr->more()->desti_srnode = new_srnode;
8647 } else if (destdnl->is_primary()) {
8648 snapid_t follows = dest_realm->get_newest_seq();
8649 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
8650 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
8651 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
8652 mdr->more()->desti_srnode = new_srnode;
8653 }
8654 }
8655 }
8656 if (!mdr->more()->srci_srnode) {
8657 if (srci->is_projected_snaprealm_global()) {
8658 sr_t *new_srnode = srci->prepare_new_srnode(0);
8659 srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
8660 mdr->more()->srci_srnode = new_srnode;
8661 } else if (srcdnl->is_primary()) {
8662 snapid_t follows = src_realm->get_newest_seq();
8663 if (src_realm != dest_realm &&
8664 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
8665 sr_t *new_srnode = srci->prepare_new_srnode(follows);
8666 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
8667 mdr->more()->srci_srnode = new_srnode;
8668 }
8669 }
8670 }
8671 }
8672
8673 // -- prepare witnesses --
8674
8675 /*
8676 * NOTE: we use _all_ replicas as witnesses.
8677 * this probably isn't totally necessary (esp for file renames),
8678 * but if/when we change that, we have to make sure rejoin is
8679 * sufficiently robust to handle strong rejoins from survivors
8680 * with totally wrong dentry->inode linkage.
8681 * (currently, it can ignore rename effects, because the resolve
8682 * stage will sort them out.)
8683 */
8684 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
8685 if (srcdn->is_auth())
8686 srcdn->list_replicas(witnesses);
8687 else
8688 witnesses.insert(srcdn->authority().first);
8689 if (srcdnl->is_remote() && !srci->is_auth())
8690 witnesses.insert(srci->authority().first);
8691 destdn->list_replicas(witnesses);
8692 if (destdnl->is_remote() && !oldin->is_auth())
8693 witnesses.insert(oldin->authority().first);
8694 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
8695
8696 if (!witnesses.empty()) {
8697 // Replicas can't see projected dentry linkages and will get confused.
8698 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8699 // can't project these inodes' linkages.
8700 bool need_flush = false;
8701 for (auto& dn : srctrace) {
8702 if (dn->is_projected()) {
8703 need_flush = true;
8704 break;
8705 }
8706 }
8707 if (!need_flush) {
8708 CDentry *dn = destdn;
8709 do {
8710 if (dn->is_projected()) {
8711 need_flush = true;
8712 break;
8713 }
8714 CInode *diri = dn->get_dir()->get_inode();
8715 dn = diri->get_projected_parent_dn();
8716 } while (dn);
8717 }
8718 if (need_flush) {
8719 mdlog->wait_for_safe(
8720 new MDSInternalContextWrapper(mds,
8721 new C_MDS_RetryRequest(mdcache, mdr)));
8722 mdlog->flush();
8723 return;
8724 }
8725 }
8726
8727 // do srcdn auth last
8728 mds_rank_t last = MDS_RANK_NONE;
8729 if (!srcdn->is_auth()) {
8730 last = srcdn->authority().first;
8731 mdr->more()->srcdn_auth_mds = last;
8732 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8733 // are involved in the rename operation.
8734 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
8735 dout(10) << " preparing ambiguous auth for srci" << dendl;
8736 ceph_assert(mdr->more()->is_remote_frozen_authpin);
8737 ceph_assert(mdr->more()->rename_inode == srci);
8738 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8739 return;
8740 }
8741 }
8742
8743 for (set<mds_rank_t>::iterator p = witnesses.begin();
8744 p != witnesses.end();
8745 ++p) {
8746 if (*p == last) continue; // do it last!
8747 if (mdr->more()->witnessed.count(*p)) {
8748 dout(10) << " already witnessed by mds." << *p << dendl;
8749 } else if (mdr->more()->waiting_on_peer.count(*p)) {
8750 dout(10) << " already waiting on witness mds." << *p << dendl;
8751 } else {
8752 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
8753 return;
8754 }
8755 }
8756 if (!mdr->more()->waiting_on_peer.empty())
8757 return; // we're waiting for a witness.
8758
8759 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
8760 dout(10) << " preparing last witness (srcdn auth)" << dendl;
8761 ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
8762 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8763 return;
8764 }
8765
8766 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
8767 if (!mdr->more()->peers.empty() && !srci->is_dir())
8768 ceph_assert(g_conf()->mds_kill_rename_at != 3);
8769 if (!mdr->more()->peers.empty() && srci->is_dir())
8770 ceph_assert(g_conf()->mds_kill_rename_at != 4);
8771
8772 // -- declare now --
8773 mdr->set_mds_stamp(ceph_clock_now());
8774
8775 // -- prepare journal entry --
8776 mdr->ls = mdlog->get_current_segment();
8777 EUpdate *le = new EUpdate(mdlog, "rename");
8778 mdlog->start_entry(le);
8779 le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
8780 if (!mdr->more()->witnessed.empty()) {
8781 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
8782
8783 le->reqid = mdr->reqid;
8784 le->had_peers = true;
8785
8786 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
8787 // no need to send frozen auth pin to recovering auth MDS of srci
8788 mdr->more()->is_remote_frozen_authpin = false;
8789 }
8790
8791 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
8792 if (le->client_map.length())
8793 le->cmapv = mds->sessionmap.get_projected();
8794
8795 // -- commit locally --
8796 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
8797
8798 journal_and_reply(mdr, srci, destdn, le, fin);
8799 mds->balancer->maybe_fragment(destdn->get_dir(), false);
8800 }
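// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch only, not part of the original source.
// Stand-alone model of how handle_client_rename() assembles its witness set:
// every rank replicating the dentries/inodes involved must witness the
// rename, and the srcdn auth (when we are not it) is prepared last. Types and
// names below are hypothetical simplifications of mds_rank_t / CDentry state.
#include <optional>
#include <set>
namespace mds_server_sketch {
  using rank_t = int;
  struct rename_parties {
    std::set<rank_t> extra_witnesses;        // carried over from peer requests
    bool srcdn_is_auth = true;
    std::set<rank_t> srcdn_replicas;         // used when we are srcdn auth
    rank_t srcdn_auth = 0;                   // used when we are not
    std::optional<rank_t> remote_srci_auth;  // srci auth, if srcdn is remote
    std::set<rank_t> destdn_replicas;
    std::optional<rank_t> remote_oldin_auth; // old target's auth, if remote
  };
  struct witness_plan {
    std::set<rank_t> witnesses;
    std::optional<rank_t> do_last;           // srcdn auth goes last, if any
  };
  inline witness_plan compute_witnesses(const rename_parties& p) {
    witness_plan plan;
    plan.witnesses = p.extra_witnesses;
    if (p.srcdn_is_auth) {
      plan.witnesses.insert(p.srcdn_replicas.begin(), p.srcdn_replicas.end());
    } else {
      plan.witnesses.insert(p.srcdn_auth);
      plan.do_last = p.srcdn_auth;           // "do srcdn auth last"
    }
    if (p.remote_srci_auth)
      plan.witnesses.insert(*p.remote_srci_auth);
    plan.witnesses.insert(p.destdn_replicas.begin(), p.destdn_replicas.end());
    if (p.remote_oldin_auth)
      plan.witnesses.insert(*p.remote_oldin_auth);
    return plan;
  }
}
// ---------------------------------------------------------------------------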
8801
8802
8803 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8804 {
8805 dout(10) << "_rename_finish " << *mdr << dendl;
8806
8807 if (!mdr->more()->witnessed.empty())
8808 mdcache->logged_leader_update(mdr->reqid);
8809
8810 // apply
8811 _rename_apply(mdr, srcdn, destdn, straydn);
8812
8813 mdcache->send_dentry_link(destdn, mdr);
8814
8815 CDentry::linkage_t *destdnl = destdn->get_linkage();
8816 CInode *in = destdnl->get_inode();
8817 bool need_eval = mdr->more()->cap_imports.count(in);
8818
8819 // test hack: test peer commit
8820 if (!mdr->more()->peers.empty() && !in->is_dir())
8821 ceph_assert(g_conf()->mds_kill_rename_at != 5);
8822 if (!mdr->more()->peers.empty() && in->is_dir())
8823 ceph_assert(g_conf()->mds_kill_rename_at != 6);
8824
8825 // bump popularity
8826 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
8827 if (destdnl->is_remote() && in->is_auth())
8828 mds->balancer->hit_inode(in, META_POP_IWR);
8829
8830 // did we import srci? if so, explicitly ack that import before we unlock and reply.
8831
8832 ceph_assert(g_conf()->mds_kill_rename_at != 7);
8833
8834 // reply
8835 respond_to_request(mdr, 0);
8836
8837 if (need_eval)
8838 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
8839
8840 // clean up?
8841 // respond_to_request() drops locks. So stray reintegration can race with us.
8842 if (straydn && !straydn->get_projected_linkage()->is_null()) {
8843 mdcache->notify_stray(straydn);
8844 }
8845 }
8846
8847
8848
8849 // helpers
8850
8851 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
8852 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
8853 {
8854 const auto& client_req = mdr->client_request;
8855 ceph_assert(client_req);
8856
8857 if (mds->is_cluster_degraded() &&
8858 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
8859 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
8860 if (mdr->more()->waiting_on_peer.empty())
8861 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
8862 return false;
8863 }
8864
8865 dout(10) << "_rename_prepare_witness mds." << who << dendl;
8866 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);
8867
8868 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
8869 for (auto dn : srctrace)
8870 req->srcdnpath.push_dentry(dn->get_name());
8871 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
8872 for (auto dn : dsttrace)
8873 req->destdnpath.push_dentry(dn->get_name());
8874 req->alternate_name = client_req->alternate_name;
8875 if (straydn)
8876 mdcache->encode_replica_stray(straydn, who, req->straybl);
8877
8878 if (mdr->more()->srci_srnode)
8879 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
8880 if (mdr->more()->desti_srnode)
8881 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
8882
8883 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
8884
8885 // srcdn auth will verify our current witness list is sufficient
8886 req->witnesses = witnesse;
8887
8888 req->op_stamp = mdr->get_op_stamp();
8889 mds->send_message_mds(req, who);
8890
8891 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
8892 mdr->more()->waiting_on_peer.insert(who);
8893 return true;
8894 }
8895
8896 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
8897 {
8898 version_t oldpv = mdr->more()->inode_import_v;
8899
8900 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8901
8902 /* import node */
8903 auto blp = mdr->more()->inode_import.cbegin();
8904
8905 // imported caps
8906 map<client_t,entity_inst_t> client_map;
8907 map<client_t, client_metadata_t> client_metadata_map;
8908 decode(client_map, blp);
8909 decode(client_metadata_map, blp);
8910 prepare_force_open_sessions(client_map, client_metadata_map,
8911 mdr->more()->imported_session_map);
8912 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
8913 encode(client_metadata_map, *client_map_bl);
8914
8915 list<ScatterLock*> updated_scatterlocks;
8916 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
8917 mdr->more()->cap_imports, updated_scatterlocks);
8918
8919 // hack: force back to !auth and clean, temporarily
8920 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
8921 srcdnl->get_inode()->mark_clean();
8922
8923 return oldpv;
8924 }
8925
8926 bool Server::_need_force_journal(CInode *diri, bool empty)
8927 {
8928 auto&& dirs = diri->get_dirfrags();
8929
8930 bool force_journal = false;
8931 if (empty) {
8932 for (const auto& dir : dirs) {
8933 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
8934 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
8935 force_journal = true;
8936 break;
8937 } else
8938 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
8939 }
8940 } else {
8941 // see if any children of our frags are auth subtrees.
8942 std::vector<CDir*> subtrees;
8943 mdcache->get_subtrees(subtrees);
8944 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
8945 for (const auto& dir : dirs) {
8946 for (const auto& subtree : subtrees) {
8947 if (dir->contains(subtree)) {
8948 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
8949 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
8950 << *subtree << dendl;
8951 force_journal = true;
8952 break;
8953 } else
8954 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
8955 } else
8956 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
8957 }
8958 if (force_journal)
8959 break;
8960 }
8961 }
8962 return force_journal;
8963 }
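// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch only, not part of the original source.
// Minimal model of the containment test above: a renamed directory must be
// force-journaled if any locally-authoritative subtree root sits beneath one
// of its dirfrags. The flat path-prefix representation is purely a
// hypothetical stand-in for CDir::contains() and dir_auth checks.
#include <string>
#include <vector>
namespace mds_server_sketch {
  struct subtree_root {
    std::string path;      // e.g. "/a/b/c"
    bool locally_auth;     // dir_auth().first == my rank
  };
  inline bool need_force_journal(const std::vector<std::string>& frag_paths,
                                 const std::vector<subtree_root>& subtrees) {
    for (const auto& frag : frag_paths)
      for (const auto& st : subtrees)
        if (st.locally_auth &&
            st.path.compare(0, frag.size(), frag) == 0)  // st under frag?
          return true;
    return false;
  }
}
// ---------------------------------------------------------------------------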
8964
8965 void Server::_rename_prepare(MDRequestRef& mdr,
8966 EMetaBlob *metablob, bufferlist *client_map_bl,
8967 CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
8968 CDentry *straydn)
8969 {
8970 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8971 if (straydn)
8972 dout(10) << " straydn " << *straydn << dendl;
8973
8974 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8975 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8976 CInode *srci = srcdnl->get_inode();
8977 CInode *oldin = destdnl->get_inode();
8978
8979 // primary+remote link merge?
8980 bool linkmerge = (srci == oldin);
8981 if (linkmerge)
8982 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8983 bool silent = srcdn->get_dir()->inode->is_stray();
8984
8985 bool force_journal_dest = false;
8986 if (srci->is_dir() && !destdn->is_auth()) {
8987 if (srci->is_auth()) {
8988 // if we are auth for srci and exporting it, force journal because journal replay needs
8989 // the source inode to create auth subtrees.
8990 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
8991 force_journal_dest = true;
8992 } else
8993 force_journal_dest = _need_force_journal(srci, false);
8994 }
8995
8996 bool force_journal_stray = false;
8997 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
8998 force_journal_stray = _need_force_journal(oldin, true);
8999
9000 if (linkmerge)
9001 dout(10) << " merging remote and primary links to the same inode" << dendl;
9002 if (silent)
9003 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
9004 if (force_journal_dest)
9005 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
9006 if (force_journal_stray)
9007 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
9008
9009 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
9010 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
9011 metablob->renamed_dirino = srci->ino();
9012 } else if (oldin && oldin->is_dir() && force_journal_stray) {
9013 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
9014 metablob->renamed_dirino = oldin->ino();
9015 }
9016
9017 // prepare
9018 CInode::mempool_inode *spi = 0; // renamed inode
9019 CInode::mempool_inode *tpi = 0; // target/overwritten inode
9020
9021 // target inode
9022 if (!linkmerge) {
9023 if (destdnl->is_primary()) {
9024 ceph_assert(straydn); // moving to straydn.
9025 // link--, and move.
9026 if (destdn->is_auth()) {
9027 auto pi = oldin->project_inode(mdr); // project_snaprealm
9028 pi.inode->version = straydn->pre_dirty(pi.inode->version);
9029 pi.inode->update_backtrace();
9030 tpi = pi.inode.get();
9031 }
9032 straydn->push_projected_linkage(oldin);
9033 } else if (destdnl->is_remote()) {
9034 // nlink-- targeti
9035 if (oldin->is_auth()) {
9036 auto pi = oldin->project_inode(mdr);
9037 pi.inode->version = oldin->pre_dirty();
9038 tpi = pi.inode.get();
9039 }
9040 }
9041 }
9042
9043 // dest
9044 if (destdnl->is_null()) {
9045 /* handle_client_rename checks that alternate_name matches for existing destdn */
9046 destdn->set_alternate_name(alternate_name);
9047 }
9048 if (srcdnl->is_remote()) {
9049 if (!linkmerge) {
9050 // destdn
9051 if (destdn->is_auth())
9052 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
9053 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
9054 // srci
9055 if (srci->is_auth()) {
9056 auto pi = srci->project_inode(mdr);
9057 pi.inode->version = srci->pre_dirty();
9058 spi = pi.inode.get();
9059 }
9060 } else {
9061 dout(10) << " will merge remote onto primary link" << dendl;
9062 if (destdn->is_auth()) {
9063 auto pi = oldin->project_inode(mdr);
9064 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
9065 spi = pi.inode.get();
9066 }
9067 }
9068 } else { // primary
9069 if (destdn->is_auth()) {
9070 version_t oldpv;
9071 if (srcdn->is_auth())
9072 oldpv = srci->get_projected_version();
9073 else {
9074 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
9075
9076 // note which dirfrags have child subtrees in the journal
9077 // event, so that we can open those (as bounds) during replay.
9078 if (srci->is_dir()) {
9079 auto&& ls = srci->get_dirfrags();
9080 for (const auto& dir : ls) {
9081 if (!dir->is_auth())
9082 metablob->renamed_dir_frags.push_back(dir->get_frag());
9083 }
9084 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
9085 }
9086 }
9087 auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
9088 // & srcdnl->snaprealm
9089 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
9090 pi.inode->update_backtrace();
9091 spi = pi.inode.get();
9092 }
9093 destdn->push_projected_linkage(srci);
9094 }
9095
9096 // src
9097 if (srcdn->is_auth())
9098 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
9099 srcdn->push_projected_linkage(); // push null linkage
9100
9101 if (!silent) {
9102 if (spi) {
9103 spi->ctime = mdr->get_op_stamp();
9104 if (mdr->get_op_stamp() > spi->rstat.rctime)
9105 spi->rstat.rctime = mdr->get_op_stamp();
9106 spi->change_attr++;
9107 if (linkmerge)
9108 spi->nlink--;
9109 }
9110 if (tpi) {
9111 tpi->ctime = mdr->get_op_stamp();
9112 if (mdr->get_op_stamp() > tpi->rstat.rctime)
9113 tpi->rstat.rctime = mdr->get_op_stamp();
9114 tpi->change_attr++;
9115 {
9116 std::string t;
9117 destdn->make_path_string(t, true);
9118 tpi->stray_prior_path = std::move(t);
9119 }
9120 tpi->nlink--;
9121 if (tpi->nlink == 0)
9122 oldin->state_set(CInode::STATE_ORPHAN);
9123 }
9124 }
9125
9126 // prepare nesting, mtime updates
9127 int predirty_dir = silent ? 0 : PREDIRTY_DIR;
9128
9129 // guarantee stray dir is processed first during journal replay. unlink the old inode,
9130 // then link the source inode to destdn
9131 if (destdnl->is_primary()) {
9132 ceph_assert(straydn);
9133 if (straydn->is_auth()) {
9134 metablob->add_dir_context(straydn->get_dir());
9135 metablob->add_dir(straydn->get_dir(), true);
9136 }
9137 }
9138
9139 if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
9140 CDir *oldin_dir = oldin->get_projected_parent_dir();
9141 if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
9142 mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
9143 }
9144
9145 // sub off target
9146 if (destdn->is_auth() && !destdnl->is_null()) {
9147 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
9148 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
9149 if (destdnl->is_primary()) {
9150 ceph_assert(straydn);
9151 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
9152 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
9153 }
9154 }
9155
9156 if (srcdnl->is_remote() && srci->is_auth()) {
9157 CDir *srci_dir = srci->get_projected_parent_dir();
9158 if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
9159 mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
9160 }
9161
9162 // move srcdn
9163 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
9164 int flags = predirty_dir | predirty_primary;
9165 if (srcdn->is_auth())
9166 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
9167 if (destdn->is_auth())
9168 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
9169
9170 // add it all to the metablob
9171 // target inode
9172 if (!linkmerge) {
9173 if (destdnl->is_primary()) {
9174 ceph_assert(straydn);
9175 if (destdn->is_auth()) {
9176 // project snaprealm, too
9177 if (auto& desti_srnode = mdr->more()->desti_srnode) {
9178 oldin->project_snaprealm(desti_srnode);
9179 if (tpi->nlink == 0)
9180 ceph_assert(!desti_srnode->is_parent_global());
9181 desti_srnode = NULL;
9182 }
9183 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9184 metablob->add_primary_dentry(straydn, oldin, true, true);
9185 } else if (force_journal_stray) {
9186 dout(10) << " forced journaling straydn " << *straydn << dendl;
9187 metablob->add_dir_context(straydn->get_dir());
9188 metablob->add_primary_dentry(straydn, oldin, true);
9189 }
9190 } else if (destdnl->is_remote()) {
9191 if (oldin->is_auth()) {
9192 sr_t *new_srnode = NULL;
9193 if (mdr->peer_request) {
9194 if (mdr->peer_request->desti_snapbl.length() > 0) {
9195 new_srnode = new sr_t();
9196 auto p = mdr->peer_request->desti_snapbl.cbegin();
9197 decode(*new_srnode, p);
9198 }
9199 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9200 new_srnode = desti_srnode;
9201 desti_srnode = NULL;
9202 }
9203 if (new_srnode) {
9204 oldin->project_snaprealm(new_srnode);
9205 if (tpi->nlink == 0)
9206 ceph_assert(!new_srnode->is_parent_global());
9207 }
9208 // auth for targeti
9209 CDentry *oldin_pdn = oldin->get_projected_parent_dn();
9210 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
9211 metablob->add_primary_dentry(oldin_pdn, oldin, true);
9212 }
9213 }
9214 }
9215
9216 // dest
9217 if (srcdnl->is_remote()) {
9218 ceph_assert(!linkmerge);
9219 if (destdn->is_auth() && !destdnl->is_null())
9220 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
9221 else
9222 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9223
9224 if (destdn->is_auth())
9225 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
9226
9227 if (srci->is_auth()) { // it's remote
9228 if (mdr->peer_request) {
9229 if (mdr->peer_request->srci_snapbl.length() > 0) {
9230 sr_t *new_srnode = new sr_t();
9231 auto p = mdr->peer_request->srci_snapbl.cbegin();
9232 decode(*new_srnode, p);
9233 srci->project_snaprealm(new_srnode);
9234 }
9235 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9236 srci->project_snaprealm(srci_srnode);
9237 srci_srnode = NULL;
9238 }
9239
9240 CDentry *srci_pdn = srci->get_projected_parent_dn();
9241 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
9242 metablob->add_primary_dentry(srci_pdn, srci, true);
9243 }
9244 } else if (srcdnl->is_primary()) {
9245 // project snap parent update?
9246 if (destdn->is_auth()) {
9247 if (auto& srci_srnode = mdr->more()->srci_srnode) {
9248 srci->project_snaprealm(srci_srnode);
9249 srci_srnode = NULL;
9250 }
9251 }
9252
9253 if (destdn->is_auth() && !destdnl->is_null())
9254 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
9255
9256 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
9257
9258 if (destdn->is_auth())
9259 metablob->add_primary_dentry(destdn, srci, true, true);
9260 else if (force_journal_dest) {
9261 dout(10) << " forced journaling destdn " << *destdn << dendl;
9262 metablob->add_dir_context(destdn->get_dir());
9263 metablob->add_primary_dentry(destdn, srci, true);
9264 if (srcdn->is_auth() && srci->is_dir()) {
9265 // journal new subtrees root dirfrags
9266 auto&& ls = srci->get_dirfrags();
9267 for (const auto& dir : ls) {
9268 if (dir->is_auth())
9269 metablob->add_dir(dir, true);
9270 }
9271 }
9272 }
9273 }
9274
9275 // src
9276 if (srcdn->is_auth()) {
9277 dout(10) << " journaling srcdn " << *srcdn << dendl;
9278 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
9279 // also journal the inode in case we need to do peer rename rollback. It is OK to add
9280 // both primary and NULL dentries, because during journal replay the null dentry is
9281 // processed after the primary dentry.
9282 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
9283 metablob->add_primary_dentry(srcdn, srci, true);
9284 metablob->add_null_dentry(srcdn, true);
9285 } else
9286 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
9287
9288 // make renamed inode first track the dn
9289 if (srcdnl->is_primary() && destdn->is_auth()) {
9290 ceph_assert(srci->first <= destdn->first);
9291 srci->first = destdn->first;
9292 }
9293 // make stray inode first track the straydn
9294 if (straydn && straydn->is_auth()) {
9295 ceph_assert(oldin->first <= straydn->first);
9296 oldin->first = straydn->first;
9297 }
9298
9299 if (oldin && oldin->is_dir()) {
9300 ceph_assert(straydn);
9301 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
9302 }
9303 if (srci->is_dir())
9304 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
9305
9306 }
9307
9308
9309 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9310 {
9311 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
9312 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
9313
9314 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9315 CDentry::linkage_t *destdnl = destdn->get_linkage();
9316
9317 CInode *oldin = destdnl->get_inode();
9318
9319 // primary+remote link merge?
9320 bool linkmerge = (srcdnl->get_inode() == oldin);
9321 if (linkmerge)
9322 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
9323
9324 bool new_in_snaprealm = false;
9325 bool new_oldin_snaprealm = false;
9326
9327 // target inode
9328 if (!linkmerge) {
9329 if (destdnl->is_primary()) {
9330 ceph_assert(straydn);
9331 dout(10) << "straydn is " << *straydn << dendl;
9332
9333 // if there is newly created snaprealm, need to split old snaprealm's
9334 // inodes_with_caps. So pop snaprealm before linkage changes.
9335 if (destdn->is_auth()) {
9336 bool hadrealm = (oldin->snaprealm ? true : false);
9337 oldin->early_pop_projected_snaprealm();
9338 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
9339 } else {
9340 ceph_assert(mdr->peer_request);
9341 if (mdr->peer_request->desti_snapbl.length()) {
9342 new_oldin_snaprealm = !oldin->snaprealm;
9343 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
9344 ceph_assert(oldin->snaprealm);
9345 }
9346 }
9347
9348 destdn->get_dir()->unlink_inode(destdn, false);
9349
9350 straydn->pop_projected_linkage();
9351 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9352 ceph_assert(!straydn->is_projected()); // no other projected
9353
9354 // nlink-- targeti
9355 if (destdn->is_auth())
9356 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9357
9358 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
9359 } else if (destdnl->is_remote()) {
9360 destdn->get_dir()->unlink_inode(destdn, false);
9361 if (oldin->is_auth()) {
9362 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9363 } else if (mdr->peer_request) {
9364 if (mdr->peer_request->desti_snapbl.length() > 0) {
9365 ceph_assert(oldin->snaprealm);
9366 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
9367 }
9368 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9369 delete desti_srnode;
9370 desti_srnode = NULL;
9371 }
9372 }
9373 }
9374
9375 // unlink src before we relink it at dest
9376 CInode *in = srcdnl->get_inode();
9377 ceph_assert(in);
9378
9379 bool srcdn_was_remote = srcdnl->is_remote();
9380 if (!srcdn_was_remote) {
9381 // if there is newly created snaprealm, need to split old snaprealm's
9382 // inodes_with_caps. So pop snaprealm before linkage changes.
9383 if (destdn->is_auth()) {
9384 bool hadrealm = (in->snaprealm ? true : false);
9385 in->early_pop_projected_snaprealm();
9386 new_in_snaprealm = (in->snaprealm && !hadrealm);
9387 } else {
9388 ceph_assert(mdr->peer_request);
9389 if (mdr->peer_request->srci_snapbl.length()) {
9390 new_in_snaprealm = !in->snaprealm;
9391 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
9392 ceph_assert(in->snaprealm);
9393 }
9394 }
9395 }
9396
9397 srcdn->get_dir()->unlink_inode(srcdn);
9398
9399 // dest
9400 if (srcdn_was_remote) {
9401 if (!linkmerge) {
9402 // destdn
9403 destdnl = destdn->pop_projected_linkage();
9404 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9405 ceph_assert(!destdn->is_projected()); // no other projected
9406
9407 destdn->link_remote(destdnl, in);
9408 if (destdn->is_auth())
9409 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
9410 // in
9411 if (in->is_auth()) {
9412 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9413 } else if (mdr->peer_request) {
9414 if (mdr->peer_request->srci_snapbl.length() > 0) {
9415 ceph_assert(in->snaprealm);
9416 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
9417 }
9418 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9419 delete srci_srnode;
9420 srci_srnode = NULL;
9421 }
9422 } else {
9423 dout(10) << "merging remote onto primary link" << dendl;
9424 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9425 }
9426 } else { // primary
9427 if (linkmerge) {
9428 dout(10) << "merging primary onto remote link" << dendl;
9429 destdn->get_dir()->unlink_inode(destdn, false);
9430 }
9431 destdnl = destdn->pop_projected_linkage();
9432 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9433 ceph_assert(!destdn->is_projected()); // no other projected
9434
9435 // srcdn inode import?
9436 if (!srcdn->is_auth() && destdn->is_auth()) {
9437 ceph_assert(mdr->more()->inode_import.length() > 0);
9438
9439 map<client_t,Capability::Import> imported_caps;
9440
9441 // finish cap imports
9442 finish_force_open_sessions(mdr->more()->imported_session_map);
9443 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
9444 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
9445 mdr->more()->srcdn_auth_mds, true,
9446 mdr->more()->imported_session_map,
9447 mdr->more()->cap_imports[destdnl->get_inode()],
9448 imported_caps);
9449 }
9450
9451 mdr->more()->inode_import.clear();
9452 encode(imported_caps, mdr->more()->inode_import);
9453
9454 /* hack: add an auth pin for each xlock we hold. These were
9455 * remote xlocks previously but now they're local and
9456 * we're going to try and unpin when we xlock_finish. */
9457
9458 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
9459 i != mdr->locks.end();
9460 ++i) {
9461 SimpleLock *lock = i->lock;
9462 if (lock->get_parent() != destdnl->get_inode())
9463 break;
9464 if (i->is_xlock() && !lock->is_locallock())
9465 mds->locker->xlock_import(lock);
9466 }
9467
9468 // hack: fix auth bit
9469 in->state_set(CInode::STATE_AUTH);
9470
9471 mdr->clear_ambiguous_auth();
9472 }
9473
9474 if (destdn->is_auth())
9475 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9476 }
9477
9478 // src
9479 if (srcdn->is_auth())
9480 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
9481 srcdn->pop_projected_linkage();
9482 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9483 ceph_assert(!srcdn->is_projected()); // no other projected
9484
9485 // apply remaining projected inodes (nested)
9486 mdr->apply();
9487
9488 // update subtree map?
9489 if (destdnl->is_primary() && in->is_dir())
9490 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
9491
9492 if (straydn && oldin->is_dir())
9493 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
9494
9495 if (new_oldin_snaprealm)
9496 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
9497 if (new_in_snaprealm)
9498 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
9499
9500 // removing a new dn?
9501 if (srcdn->is_auth())
9502 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
9503 }
9504
9505
9506
9507 // ------------
9508 // PEER
9509
9510 class C_MDS_PeerRenamePrep : public ServerLogContext {
9511 CDentry *srcdn, *destdn, *straydn;
9512 public:
9513 C_MDS_PeerRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9514 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
9515 void finish(int r) override {
9516 server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
9517 }
9518 };
9519
9520 class C_MDS_PeerRenameCommit : public ServerContext {
9521 MDRequestRef mdr;
9522 CDentry *srcdn, *destdn, *straydn;
9523 public:
9524 C_MDS_PeerRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9525 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
9526 void finish(int r) override {
9527 server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
9528 }
9529 };
9530
9531 class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
9532 MDRequestRef mdr;
9533 public:
9534 C_MDS_PeerRenameSessionsFlushed(Server *s, MDRequestRef& r) :
9535 ServerContext(s), mdr(r) {}
9536 void finish(int r) override {
9537 server->_peer_rename_sessions_flushed(mdr);
9538 }
9539 };
9540
9541 void Server::handle_peer_rename_prep(MDRequestRef& mdr)
9542 {
9543 dout(10) << "handle_peer_rename_prep " << *mdr
9544 << " " << mdr->peer_request->srcdnpath
9545 << " to " << mdr->peer_request->destdnpath
9546 << dendl;
9547
9548 if (mdr->peer_request->is_interrupted()) {
9549 dout(10) << " peer request interrupted, sending noop reply" << dendl;
9550 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9551 reply->mark_interrupted();
9552 mds->send_message_mds(reply, mdr->peer_to_mds);
9553 mdr->reset_peer_request();
9554 return;
9555 }
9556
9557 // discover destdn
9558 filepath destpath(mdr->peer_request->destdnpath);
9559 dout(10) << " dest " << destpath << dendl;
9560 vector<CDentry*> trace;
9561 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
9562 int r = mdcache->path_traverse(mdr, cf, destpath,
9563 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
9564 &trace);
9565 if (r > 0) return;
9566 if (r == -CEPHFS_ESTALE) {
9567 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
9568 mdr->peer_to_mds, true);
9569 return;
9570 }
9571 ceph_assert(r == 0); // we shouldn't get an error here!
9572
9573 CDentry *destdn = trace.back();
9574 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9575 dout(10) << " destdn " << *destdn << dendl;
9576 mdr->pin(destdn);
9577
9578 // discover srcdn
9579 filepath srcpath(mdr->peer_request->srcdnpath);
9580 dout(10) << " src " << srcpath << dendl;
9581 CInode *srci = nullptr;
9582 r = mdcache->path_traverse(mdr, cf, srcpath,
9583 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
9584 &trace, &srci);
9585 if (r > 0) return;
9586 ceph_assert(r == 0);
9587
9588 CDentry *srcdn = trace.back();
9589 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9590 dout(10) << " srcdn " << *srcdn << dendl;
9591 mdr->pin(srcdn);
9592 mdr->pin(srci);
9593
9594 // stray?
9595 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
9596 if (linkmerge)
9597 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
9598 CDentry *straydn = mdr->straydn;
9599 if (destdnl->is_primary() && !linkmerge)
9600 ceph_assert(straydn);
9601
9602 mdr->set_op_stamp(mdr->peer_request->op_stamp);
9603 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
9604
9605 // set up commit waiter (early, to clean up any freezing etc we do)
9606 if (!mdr->more()->peer_commit)
9607 mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);
9608
9609 // am i srcdn auth?
9610 if (srcdn->is_auth()) {
9611 set<mds_rank_t> srcdnrep;
9612 srcdn->list_replicas(srcdnrep);
9613
9614 bool reply_witness = false;
9615 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
9616 // freeze?
9617 // we need this to
9618 // - avoid conflicting lock state changes
9619 // - avoid concurrent updates to the inode
9620 // (this could also be accomplished with the versionlock)
9621 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
9622 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
9623 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
9624
9625 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9626 if (srcdnl->get_inode()->is_frozen_auth_pin())
9627 mdr->unfreeze_auth_pin();
9628
9629 if (!frozen_inode) {
9630 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
9631 return;
9632 }
9633
9634 /*
9635 * set ambiguous auth for srci
9636 * NOTE: we don't worry about ambiguous cache expire as we do
9637 * with subtree migrations because all peers will pin
9638 * srcdn->get_inode() for duration of this rename.
9639 */
9640 mdr->set_ambiguous_auth(srcdnl->get_inode());
9641
9642 // just mark the source inode as ambiguous auth if more than two MDS are involved.
9643 // the leader will send another OP_RENAMEPREP peer request later.
9644 if (mdr->peer_request->witnesses.size() > 1) {
9645 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
9646 reply_witness = true;
9647 }
9648
9649 // make sure bystanders have received all lock related messages
9650 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9651 if (*p == mdr->peer_to_mds ||
9652 (mds->is_cluster_degraded() &&
9653 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
9654 continue;
9655 auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
9656 mds->send_message_mds(notify, *p);
9657 mdr->more()->waiting_on_peer.insert(*p);
9658 }
9659
9660 // make sure clients have received all cap related messages
9661 set<client_t> export_client_set;
9662 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
9663
9664 MDSGatherBuilder gather(g_ceph_context);
9665 flush_client_sessions(export_client_set, gather);
9666 if (gather.has_subs()) {
9667 mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
9668 gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
9669 gather.activate();
9670 }
9671 }
9672
9673 // is witness list sufficient?
9674 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9675 if (*p == mdr->peer_to_mds ||
9676 mdr->peer_request->witnesses.count(*p)) continue;
9677 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
9678 reply_witness = true;
9679 break;
9680 }
9681
9682 if (reply_witness) {
9683 ceph_assert(!srcdnrep.empty());
9684 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9685 reply->witnesses.swap(srcdnrep);
9686 mds->send_message_mds(reply, mdr->peer_to_mds);
9687 mdr->reset_peer_request();
9688 return;
9689 }
9690 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
9691 if (!mdr->more()->waiting_on_peer.empty()) {
9692 dout(10) << " still waiting for rename notify acks from "
9693 << mdr->more()->waiting_on_peer << dendl;
9694 return;
9695 }
9696 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
9697 // set ambiguous auth for srci on witnesses
9698 mdr->set_ambiguous_auth(srcdnl->get_inode());
9699 }
9700
9701 // encode everything we'd need to roll this back... basically, just the original state.
9702 rename_rollback rollback;
9703
9704 rollback.reqid = mdr->reqid;
9705
9706 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
9707 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9708 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
9709 rollback.orig_src.dname = srcdn->get_name();
9710 if (srcdnl->is_primary())
9711 rollback.orig_src.ino = srcdnl->get_inode()->ino();
9712 else {
9713 ceph_assert(srcdnl->is_remote());
9714 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
9715 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
9716 }
9717
9718 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
9719 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9720 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
9721 rollback.orig_dest.dname = destdn->get_name();
9722 if (destdnl->is_primary())
9723 rollback.orig_dest.ino = destdnl->get_inode()->ino();
9724 else if (destdnl->is_remote()) {
9725 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
9726 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
9727 }
9728
9729 if (straydn) {
9730 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
9731 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
9732 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
9733 rollback.stray.dname = straydn->get_name();
9734 }
9735 if (mdr->peer_request->desti_snapbl.length()) {
9736 CInode *oldin = destdnl->get_inode();
9737 if (oldin->snaprealm) {
9738 encode(true, rollback.desti_snapbl);
9739 oldin->encode_snap_blob(rollback.desti_snapbl);
9740 } else {
9741 encode(false, rollback.desti_snapbl);
9742 }
9743 }
9744 if (mdr->peer_request->srci_snapbl.length()) {
9745 if (srci->snaprealm) {
9746 encode(true, rollback.srci_snapbl);
9747 srci->encode_snap_blob(rollback.srci_snapbl);
9748 } else {
9749 encode(false, rollback.srci_snapbl);
9750 }
9751 }
9752 encode(rollback, mdr->more()->rollback_bl);
9753 // FIXME: rollback snaprealm
9754 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
9755
9756 // journal.
9757 mdr->ls = mdlog->get_current_segment();
9758 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
9759 EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
9760 mdlog->start_entry(le);
9761 le->rollback = mdr->more()->rollback_bl;
9762
9763 bufferlist blah; // inode import data... obviously not used if we're the peer
9764 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);
9765
9766 if (le->commit.empty()) {
9767 dout(10) << " empty metablob, skipping journal" << dendl;
9768 mdlog->cancel_entry(le);
9769 mdr->ls = NULL;
9770 _logged_peer_rename(mdr, srcdn, destdn, straydn);
9771 } else {
9772 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
9773 mdr->more()->peer_update_journaled = true;
9774 submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
9775 mdr, __func__);
9776 mdlog->flush();
9777 }
9778 }
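// ---------------------------------------------------------------------------
// Editor's note: illustrative sketch only, not part of the original source.
// Stand-alone model of the rollback blob assembled in handle_peer_rename_prep:
// before journaling the prepare, the peer records the original placement of
// the source, destination and (optional) stray dentries plus the old dirfrag
// timestamps, so an aborted rename can be undone. All names are hypothetical.
#include <cstdint>
#include <string>
#include <utility>
namespace mds_server_sketch {
  struct dentry_snapshot {
    uint64_t dirfrag_ino = 0;  // which directory fragment held the dentry
    std::string name;          // dentry name within that fragment
    double old_mtime = 0;      // pre-rename fragstat mtime
    double old_rctime = 0;     // pre-rename rstat rctime
  };
  struct rename_rollback_sketch {
    uint64_t reqid = 0;        // ties the rollback to the leader's request
    dentry_snapshot orig_src;
    dentry_snapshot orig_dest;
    dentry_snapshot stray;     // meaningful only if the old target was primary
  };
  // Capture before mutating anything; on abort the snapshots say exactly
  // which links and timestamps to put back.
  inline rename_rollback_sketch capture(uint64_t reqid,
                                        dentry_snapshot src,
                                        dentry_snapshot dest,
                                        dentry_snapshot stray = {}) {
    rename_rollback_sketch rb;
    rb.reqid = reqid;
    rb.orig_src = std::move(src);
    rb.orig_dest = std::move(dest);
    rb.stray = std::move(stray);
    return rb;
  }
}
// ---------------------------------------------------------------------------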
9779
9780 void Server::_logged_peer_rename(MDRequestRef& mdr,
9781 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9782 {
9783 dout(10) << "_logged_peer_rename " << *mdr << dendl;
9784
9785 // prepare ack
9786 ref_t<MMDSPeerRequest> reply;
9787 if (!mdr->aborted) {
9788 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9789 if (!mdr->more()->peer_update_journaled)
9790 reply->mark_not_journaled();
9791 }
9792
9793 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9794 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
9795
9796 // export srci?
9797 if (srcdn->is_auth() && srcdnl->is_primary()) {
9798 // set export bounds for CInode::encode_export()
9799 if (reply) {
9800 std::vector<CDir*> bounds;
9801 if (srcdnl->get_inode()->is_dir()) {
9802 srcdnl->get_inode()->get_dirfrags(bounds);
9803 for (const auto& bound : bounds) {
9804 bound->state_set(CDir::STATE_EXPORTBOUND);
9805 }
9806 }
9807
9808 map<client_t,entity_inst_t> exported_client_map;
9809 map<client_t, client_metadata_t> exported_client_metadata_map;
9810 bufferlist inodebl;
9811 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
9812 exported_client_map,
9813 exported_client_metadata_map);
9814
9815 for (const auto& bound : bounds) {
9816 bound->state_clear(CDir::STATE_EXPORTBOUND);
9817 }
9818
9819 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
9820 encode(exported_client_metadata_map, reply->inode_export);
9821 reply->inode_export.claim_append(inodebl);
9822 reply->inode_export_v = srcdnl->get_inode()->get_version();
9823 }
9824
9825 // remove mdr auth pin
9826 mdr->auth_unpin(srcdnl->get_inode());
9827 mdr->more()->is_inode_exporter = true;
9828
9829 if (srcdnl->get_inode()->is_dirty())
9830 srcdnl->get_inode()->mark_clean();
9831
9832 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
9833 }
9834
9835 // apply
9836 _rename_apply(mdr, srcdn, destdn, straydn);
9837
9838 CDentry::linkage_t *destdnl = destdn->get_linkage();
9839
9840 // bump popularity
9841 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
9842 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
9843 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
9844
9845 // done.
9846 mdr->reset_peer_request();
9847 mdr->straydn = 0;
9848
9849 if (reply) {
9850 mds->send_message_mds(reply, mdr->peer_to_mds);
9851 } else {
9852 ceph_assert(mdr->aborted);
9853 dout(10) << " abort flag set, finishing" << dendl;
9854 mdcache->request_finish(mdr);
9855 }
9856 }
9857
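// Called when the leader resolves the prepared rename.  On commit (r == 0) we
// finish the inode export (move xlocks, import caps, unfreeze), clear
// ambiguous auth, and journal an EPeerUpdate OP_COMMIT if a prepare was
// journaled.  On abort we reverse the inode export and replay the rollback
// blob, or simply finish the request when there is nothing to roll back.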
9858 void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
9859 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9860 {
9861 dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;
9862
9863 CInode *in = destdn->get_linkage()->get_inode();
9864
9865 inodeno_t migrated_stray;
9866 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
9867 migrated_stray = in->ino();
9868
9869 MDSContext::vec finished;
9870 if (r == 0) {
9871 // unfreeze+singleauth inode
9872 // hmm, do i really need to delay this?
9873 if (mdr->more()->is_inode_exporter) {
9874 // drop our pins
9875 // we exported, clear out any xlocks that we moved to another MDS
9876
9877 for (auto i = mdr->locks.lower_bound(&in->versionlock);
9878 i != mdr->locks.end(); ) {
9879 SimpleLock *lock = i->lock;
9880 if (lock->get_parent() != in)
9881 break;
9882 // we only care about xlocks on the exported inode
9883 if (i->is_xlock() && !lock->is_locallock())
9884 mds->locker->xlock_export(i++, mdr.get());
9885 else
9886 ++i;
9887 }
9888
9889 map<client_t,Capability::Import> peer_imported;
9890 auto bp = mdr->more()->inode_import.cbegin();
9891 decode(peer_imported, bp);
9892
9893 dout(10) << " finishing inode export on " << *in << dendl;
9894 mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
9895 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
9896
9897 // unfreeze
9898 ceph_assert(in->is_frozen_inode());
9899 in->unfreeze_inode(finished);
9900 }
9901
9902 // singleauth
9903 if (mdr->more()->is_ambiguous_auth) {
9904 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9905 mdr->more()->is_ambiguous_auth = false;
9906 }
9907
9908 if (straydn && mdr->more()->peer_update_journaled) {
9909 CInode *strayin = straydn->get_projected_linkage()->get_inode();
9910 if (strayin && !strayin->snaprealm)
9911 mdcache->clear_dirty_bits_for_stray(strayin);
9912 }
9913
9914 mds->queue_waiters(finished);
9915 mdr->cleanup();
9916
9917 if (mdr->more()->peer_update_journaled) {
9918 // write a commit to the journal
9919 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
9920 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
9921 EPeerUpdate::RENAME);
9922 mdlog->start_entry(le);
9923 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
9924 mdlog->flush();
9925 } else {
9926 _committed_peer(mdr);
9927 }
9928 } else {
9929
9930 // abort
9931 // rollback_bl may be empty if we froze the inode but had to provide an expanded
9932 // witness list from the leader, and they failed before we tried prep again.
9933 if (mdr->more()->rollback_bl.length()) {
9934 if (mdr->more()->is_inode_exporter) {
9935 dout(10) << " reversing inode export of " << *in << dendl;
9936 in->abort_export();
9937 }
9938 if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
9939 mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
9940 // rollback but preserve the peer request
9941 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
9942 mdr->more()->rollback_bl.clear();
9943 } else
9944 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
9945 } else {
9946 dout(10) << " rollback_bl empty, not rolling back rename (leader failed after getting extra witnesses?)" << dendl;
9947 // singleauth
9948 if (mdr->more()->is_ambiguous_auth) {
9949 if (srcdn->is_auth())
9950 mdr->more()->rename_inode->unfreeze_inode(finished);
9951
9952 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9953 mdr->more()->is_ambiguous_auth = false;
9954 }
9955 mds->queue_waiters(finished);
9956 mdcache->request_finish(mdr);
9957 }
9958 }
9959
9960 if (migrated_stray && mds->is_stopping())
9961 mdcache->shutdown_export_stray_finish(migrated_stray);
9962 }
9963
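// Helper for do_rename_rollback(): restores a directory fragment's projected
// fragstat/rstat after a dentry is re-linked, and puts back the saved
// mtime/rctime if the rename had been the last thing to touch them.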
9964 static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
9965 rename_rollback::drec &r, utime_t ctime,
9966 bool isdir, const nest_info_t &rstat)
9967 {
9968 auto pf = dir->project_fnode(mut);
9969 pf->version = dir->pre_dirty();
9970
9971 if (isdir) {
9972 pf->fragstat.nsubdirs += 1;
9973 } else {
9974 pf->fragstat.nfiles += 1;
9975 }
9976 if (r.ino) {
9977 pf->rstat.rbytes += rstat.rbytes;
9978 pf->rstat.rfiles += rstat.rfiles;
9979 pf->rstat.rsubdirs += rstat.rsubdirs;
9980 pf->rstat.rsnaps += rstat.rsnaps;
9981 }
9982 if (pf->fragstat.mtime == ctime) {
9983 pf->fragstat.mtime = r.dirfrag_old_mtime;
9984 if (pf->rstat.rctime == ctime)
9985 pf->rstat.rctime = r.dirfrag_old_rctime;
9986 }
9987 mut->add_updated_lock(&dir->get_inode()->filelock);
9988 mut->add_updated_lock(&dir->get_inode()->nestlock);
9989 }
9990
9991 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
9992 MutationRef mut;
9993 CDentry *srcdn;
9994 version_t srcdnpv;
9995 CDentry *destdn;
9996 CDentry *straydn;
9997 map<client_t,ref_t<MClientSnap>> splits[2];
9998 bool finish_mdr;
9999 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
10000 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
10001 map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
10002 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
10003 straydn(st), finish_mdr(f) {
10004 splits[0].swap(_splits[0]);
10005 splits[1].swap(_splits[1]);
10006 }
10007 void finish(int r) override {
10008 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
10009 destdn, straydn, splits, finish_mdr);
10010 }
10011 };
10012
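// Undo a prepared (but uncommitted) rename on this peer.  Decodes the
// rename_rollback blob that was journaled with the OP_PREPARE entry, restores
// the original source/destination dentry linkages, ctimes, snaprealms and dir
// stats, then journals an EPeerUpdate OP_ROLLBACK (skipping the journal if the
// prepare itself was never journaled) before finishing in
// _rename_rollback_finish().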
10013 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
10014 bool finish_mdr)
10015 {
10016 rename_rollback rollback;
10017 auto p = rbl.cbegin();
10018 decode(rollback, p);
10019
10020 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
10021 // need to finish this update before sending resolve to claim the subtree
10022 mdcache->add_rollback(rollback.reqid, leader);
10023
10024 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
10025 mut->ls = mds->mdlog->get_current_segment();
10026
10027 CDentry *srcdn = NULL;
10028 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
10029 if (!srcdir)
10030 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
10031 if (srcdir) {
10032 dout(10) << " srcdir " << *srcdir << dendl;
10033 srcdn = srcdir->lookup(rollback.orig_src.dname);
10034 if (srcdn) {
10035 dout(10) << " srcdn " << *srcdn << dendl;
10036 ceph_assert(srcdn->get_linkage()->is_null());
10037 } else
10038 dout(10) << " srcdn not found" << dendl;
10039 } else
10040 dout(10) << " srcdir not found" << dendl;
10041
10042 CDentry *destdn = NULL;
10043 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
10044 if (!destdir)
10045 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
10046 if (destdir) {
10047 dout(10) << " destdir " << *destdir << dendl;
10048 destdn = destdir->lookup(rollback.orig_dest.dname);
10049 if (destdn)
10050 dout(10) << " destdn " << *destdn << dendl;
10051 else
10052 dout(10) << " destdn not found" << dendl;
10053 } else
10054 dout(10) << " destdir not found" << dendl;
10055
10056 CInode *in = NULL;
10057 if (rollback.orig_src.ino) {
10058 in = mdcache->get_inode(rollback.orig_src.ino);
10059 if (in && in->is_dir())
10060 ceph_assert(srcdn && destdn);
10061 } else
10062 in = mdcache->get_inode(rollback.orig_src.remote_ino);
10063
10064 CDir *straydir = NULL;
10065 CDentry *straydn = NULL;
10066 if (rollback.stray.dirfrag.ino) {
10067 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
10068 if (straydir) {
10069 dout(10) << " straydir " << *straydir << dendl;
10070 straydn = straydir->lookup(rollback.stray.dname);
10071 if (straydn) {
10072 dout(10) << " straydn " << *straydn << dendl;
10073 ceph_assert(straydn->get_linkage()->is_primary());
10074 } else
10075 dout(10) << " straydn not found" << dendl;
10076 } else
10077 dout(10) << " straydir not found" << dendl;
10078 }
10079
10080 CInode *target = NULL;
10081 if (rollback.orig_dest.ino) {
10082 target = mdcache->get_inode(rollback.orig_dest.ino);
10083 if (target)
10084 ceph_assert(destdn && straydn);
10085 } else if (rollback.orig_dest.remote_ino)
10086 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
10087
10088 // can't use is_auth() in the resolve stage
10089 mds_rank_t whoami = mds->get_nodeid();
10090 // peer
10091 ceph_assert(!destdn || destdn->authority().first != whoami);
10092 ceph_assert(!straydn || straydn->authority().first != whoami);
10093
10094 bool force_journal_src = false;
10095 bool force_journal_dest = false;
10096 if (in && in->is_dir() && srcdn->authority().first != whoami)
10097 force_journal_src = _need_force_journal(in, false);
10098 if (in && target && target->is_dir())
10099 force_journal_dest = _need_force_journal(in, true);
10100
10101 version_t srcdnpv = 0;
10102 // repair src
10103 if (srcdn) {
10104 if (srcdn->authority().first == whoami)
10105 srcdnpv = srcdn->pre_dirty();
10106 if (rollback.orig_src.ino) {
10107 ceph_assert(in);
10108 srcdn->push_projected_linkage(in);
10109 } else
10110 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
10111 rollback.orig_src.remote_d_type);
10112 }
10113
10114 map<client_t,ref_t<MClientSnap>> splits[2];
10115
10116 const CInode::mempool_inode *pip = nullptr;
10117 if (in) {
10118 bool projected;
10119 CDir *pdir = in->get_projected_parent_dir();
10120 if (pdir->authority().first == whoami) {
10121 auto pi = in->project_inode(mut);
10122 pi.inode->version = in->pre_dirty();
10123 if (pdir != srcdir) {
10124 auto pf = pdir->project_fnode(mut);
10125 pf->version = pdir->pre_dirty();
10126 }
10127 if (pi.inode->ctime == rollback.ctime)
10128 pi.inode->ctime = rollback.orig_src.old_ctime;
10129 projected = true;
10130 } else {
10131 if (in->get_inode()->ctime == rollback.ctime) {
10132 auto _inode = CInode::allocate_inode(*in->get_inode());
10133 _inode->ctime = rollback.orig_src.old_ctime;
10134 in->reset_inode(_inode);
10135 }
10136 projected = false;
10137 }
10138 pip = in->get_projected_inode().get();
10139
10140 if (rollback.srci_snapbl.length() && in->snaprealm) {
10141 bool hadrealm;
10142 auto p = rollback.srci_snapbl.cbegin();
10143 decode(hadrealm, p);
10144 if (hadrealm) {
10145 if (projected && !mds->is_resolve()) {
10146 sr_t *new_srnode = new sr_t();
10147 decode(*new_srnode, p);
10148 in->project_snaprealm(new_srnode);
10149 } else
10150 decode(in->snaprealm->srnode, p);
10151 } else {
10152 SnapRealm *realm;
10153 if (rollback.orig_src.ino) {
10154 ceph_assert(srcdir);
10155 realm = srcdir->get_inode()->find_snaprealm();
10156 } else {
10157 realm = in->snaprealm->parent;
10158 }
10159 if (!mds->is_resolve())
10160 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
10161 if (projected)
10162 in->project_snaprealm(NULL);
10163 else
10164 in->snaprealm->merge_to(realm);
10165 }
10166 }
10167 }
10168
10169 // repair dest
10170 if (destdn) {
10171 if (rollback.orig_dest.ino && target) {
10172 destdn->push_projected_linkage(target);
10173 } else if (rollback.orig_dest.remote_ino) {
10174 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
10175 rollback.orig_dest.remote_d_type);
10176 } else {
10177 // the dentry will be trimmed soon, it's ok to have wrong linkage
10178 if (rollback.orig_dest.ino)
10179 ceph_assert(mds->is_resolve());
10180 destdn->push_projected_linkage();
10181 }
10182 }
10183
10184 if (straydn)
10185 straydn->push_projected_linkage();
10186
10187 if (target) {
10188 bool projected;
10189 CInode::inode_ptr ti;
10190 CDir *pdir = target->get_projected_parent_dir();
10191 if (pdir->authority().first == whoami) {
10192 auto pi = target->project_inode(mut);
10193 pi.inode->version = target->pre_dirty();
10194 if (pdir != srcdir) {
10195 auto pf = pdir->project_fnode(mut);
10196 pf->version = pdir->pre_dirty();
10197 }
10198 ti = pi.inode;
10199 projected = true;
10200 } else {
10201 ti = CInode::allocate_inode(*target->get_inode());
10202 projected = false;
10203 }
10204
10205 if (ti->ctime == rollback.ctime)
10206 ti->ctime = rollback.orig_dest.old_ctime;
10207 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
10208 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
10209 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
10210 else
10211 ceph_assert(rollback.orig_dest.remote_ino &&
10212 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
10213 } else
10214 ti->nlink++;
10215
10216 if (!projected)
10217 target->reset_inode(ti);
10218
10219 if (rollback.desti_snapbl.length() && target->snaprealm) {
10220 bool hadrealm;
10221 auto p = rollback.desti_snapbl.cbegin();
10222 decode(hadrealm, p);
10223 if (hadrealm) {
10224 if (projected && !mds->is_resolve()) {
10225 sr_t *new_srnode = new sr_t();
10226 decode(*new_srnode, p);
10227 target->project_snaprealm(new_srnode);
10228 } else
10229 decode(target->snaprealm->srnode, p);
10230 } else {
10231 SnapRealm *realm;
10232 if (rollback.orig_dest.ino) {
10233 ceph_assert(destdir);
10234 realm = destdir->get_inode()->find_snaprealm();
10235 } else {
10236 realm = target->snaprealm->parent;
10237 }
10238 if (!mds->is_resolve())
10239 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
10240 if (projected)
10241 target->project_snaprealm(NULL);
10242 else
10243 target->snaprealm->merge_to(realm);
10244 }
10245 }
10246 }
10247
10248 if (srcdn && srcdn->authority().first == whoami) {
10249 nest_info_t blah;
10250 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
10251 in && in->is_dir(), pip ? pip->accounted_rstat : blah);
10252 }
10253
10254 if (srcdn)
10255 dout(0) << " srcdn back to " << *srcdn << dendl;
10256 if (in)
10257 dout(0) << " srci back to " << *in << dendl;
10258 if (destdn)
10259 dout(0) << " destdn back to " << *destdn << dendl;
10260 if (target)
10261 dout(0) << " desti back to " << *target << dendl;
10262
10263 // journal it
10264 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
10265 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
10266 mdlog->start_entry(le);
10267
10268 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
10269 le->commit.add_dir_context(srcdir);
10270 if (rollback.orig_src.ino)
10271 le->commit.add_primary_dentry(srcdn, 0, true);
10272 else
10273 le->commit.add_remote_dentry(srcdn, true);
10274 }
10275
10276 if (!rollback.orig_src.ino && // remote linkage
10277 in && in->authority().first == whoami) {
10278 le->commit.add_dir_context(in->get_projected_parent_dir());
10279 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
10280 }
10281
10282 if (force_journal_dest) {
10283 ceph_assert(rollback.orig_dest.ino);
10284 le->commit.add_dir_context(destdir);
10285 le->commit.add_primary_dentry(destdn, 0, true);
10286 }
10287
10288 // peer: no need to journal straydn
10289
10290 if (target && target != in && target->authority().first == whoami) {
10291 ceph_assert(rollback.orig_dest.remote_ino);
10292 le->commit.add_dir_context(target->get_projected_parent_dir());
10293 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
10294 }
10295
10296 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
10297 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
10298 le->commit.renamed_dirino = in->ino();
10299 if (srcdn->authority().first == whoami) {
10300 auto&& ls = in->get_dirfrags();
10301 for (const auto& dir : ls) {
10302 if (!dir->is_auth())
10303 le->commit.renamed_dir_frags.push_back(dir->get_frag());
10304 }
10305 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
10306 }
10307 } else if (force_journal_dest) {
10308 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
10309 le->commit.renamed_dirino = target->ino();
10310 }
10311
10312 if (target && target->is_dir()) {
10313 ceph_assert(destdn);
10314 mdcache->project_subtree_rename(target, straydir, destdir);
10315 }
10316
10317 if (in && in->is_dir()) {
10318 ceph_assert(srcdn);
10319 mdcache->project_subtree_rename(in, destdir, srcdir);
10320 }
10321
10322 if (mdr && !mdr->more()->peer_update_journaled) {
10323 ceph_assert(le->commit.empty());
10324 mdlog->cancel_entry(le);
10325 mut->ls = NULL;
10326 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
10327 } else {
10328 ceph_assert(!le->commit.empty());
10329 if (mdr)
10330 mdr->more()->peer_update_journaled = false;
10331 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
10332 srcdn, srcdnpv, destdn, straydn,
10333 splits, finish_mdr);
10334 submit_mdlog_entry(le, fin, mdr, __func__);
10335 mdlog->flush();
10336 }
10337 }
10338
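// Final step of the rollback: pop the projected linkages, apply the mutation,
// re-adjust subtrees for any renamed directories, trim non-auth subtrees while
// in the resolve stage (otherwise send the queued MClientSnap notifications),
// and finish or release the peer request.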
10339 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
10340 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
10341 map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
10342 {
10343 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
10344
10345 if (straydn) {
10346 straydn->get_dir()->unlink_inode(straydn);
10347 straydn->pop_projected_linkage();
10348 }
10349 if (destdn) {
10350 destdn->get_dir()->unlink_inode(destdn);
10351 destdn->pop_projected_linkage();
10352 }
10353 if (srcdn) {
10354 srcdn->pop_projected_linkage();
10355 if (srcdn->authority().first == mds->get_nodeid()) {
10356 srcdn->mark_dirty(srcdnpv, mut->ls);
10357 if (srcdn->get_linkage()->is_primary())
10358 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
10359 }
10360 }
10361
10362 mut->apply();
10363
10364 if (srcdn && srcdn->get_linkage()->is_primary()) {
10365 CInode *in = srcdn->get_linkage()->get_inode();
10366 if (in && in->is_dir()) {
10367 ceph_assert(destdn);
10368 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
10369 }
10370 }
10371
10372 if (destdn) {
10373 CInode *oldin = destdn->get_linkage()->get_inode();
10374 // update subtree map?
10375 if (oldin && oldin->is_dir()) {
10376 ceph_assert(straydn);
10377 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
10378 }
10379 }
10380
10381 if (mds->is_resolve()) {
10382 CDir *root = NULL;
10383 if (straydn)
10384 root = mdcache->get_subtree_root(straydn->get_dir());
10385 else if (destdn)
10386 root = mdcache->get_subtree_root(destdn->get_dir());
10387 if (root)
10388 mdcache->try_trim_non_auth_subtree(root);
10389 } else {
10390 mdcache->send_snaps(splits[1]);
10391 mdcache->send_snaps(splits[0]);
10392 }
10393
10394 if (mdr) {
10395 MDSContext::vec finished;
10396 if (mdr->more()->is_ambiguous_auth) {
10397 if (srcdn->is_auth())
10398 mdr->more()->rename_inode->unfreeze_inode(finished);
10399
10400 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10401 mdr->more()->is_ambiguous_auth = false;
10402 }
10403 mds->queue_waiters(finished);
10404 if (finish_mdr || mdr->aborted)
10405 mdcache->request_finish(mdr);
10406 else
10407 mdr->more()->peer_rolling_back = false;
10408 }
10409
10410 mdcache->finish_rollback(mut->reqid, mdr);
10411
10412 mut->cleanup();
10413 }
10414
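// Leader-side handler for a witness's OP_RENAMEPREPACK.  Records the rank as a
// witness (or picks up the extra witness list it suggests), stashes any
// exported source-inode blob, and re-dispatches the client request once every
// peer has replied.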
10415 void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10416 {
10417 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10418 << " witnessed by " << ack->get_source()
10419 << " " << *ack << dendl;
10420 mds_rank_t from = mds_rank_t(ack->get_source().num());
10421
10422 // note peer
10423 mdr->more()->peers.insert(from);
10424 if (mdr->more()->srcdn_auth_mds == from &&
10425 mdr->more()->is_remote_frozen_authpin &&
10426 !mdr->more()->is_ambiguous_auth) {
10427 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
10428 }
10429
10430 // witnessed? or add extra witnesses?
10431 ceph_assert(mdr->more()->witnessed.count(from) == 0);
10432 if (ack->is_interrupted()) {
10433 dout(10) << " peer request interrupted, noop" << dendl;
10434 } else if (ack->witnesses.empty()) {
10435 mdr->more()->witnessed.insert(from);
10436 if (!ack->is_not_journaled())
10437 mdr->more()->has_journaled_peers = true;
10438 } else {
10439 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
10440 mdr->more()->extra_witnesses = ack->witnesses;
10441 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
10442 }
10443
10444 // srci import?
10445 if (ack->inode_export.length()) {
10446 dout(10) << " got srci import" << dendl;
10447 mdr->more()->inode_import.share(ack->inode_export);
10448 mdr->more()->inode_import_v = ack->inode_export_v;
10449 }
10450
10451 // remove from waiting list
10452 ceph_assert(mdr->more()->waiting_on_peer.count(from));
10453 mdr->more()->waiting_on_peer.erase(from);
10454
10455 if (mdr->more()->waiting_on_peer.empty())
10456 dispatch_client_request(mdr); // go again!
10457 else
10458 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
10459 }
10460
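// Peer-side: a rename notify ack arrived; once all of them are in, resume the
// pending peer request.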
10461 void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10462 {
10463 dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
10464 << ack->get_source() << dendl;
10465 ceph_assert(mdr->is_peer());
10466 mds_rank_t from = mds_rank_t(ack->get_source().num());
10467
10468 if (mdr->more()->waiting_on_peer.count(from)) {
10469 mdr->more()->waiting_on_peer.erase(from);
10470
10471 if (mdr->more()->waiting_on_peer.empty()) {
10472 if (mdr->peer_request)
10473 dispatch_peer_request(mdr);
10474 } else
10475 dout(10) << " still waiting for rename notify acks from "
10476 << mdr->more()->waiting_on_peer << dendl;
10477 }
10478 }
10479
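// Called once the relevant client sessions have been flushed; clears the
// MDS_RANK_NONE placeholder from the waiting set and resumes the peer request
// if nothing else is pending.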
10480 void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
10481 {
10482 dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
10483
10484 if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
10485 mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);
10486
10487 if (mdr->more()->waiting_on_peer.empty()) {
10488 if (mdr->peer_request)
10489 dispatch_peer_request(mdr);
10490 } else
10491 dout(10) << " still waiting for rename notify acks from "
10492 << mdr->more()->waiting_on_peer << dendl;
10493 }
10494 }
10495
10496 // snaps
10497 /* This function takes responsibility for the passed mdr*/
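// Lists the snapshots of a directory, readdir-style: entries come from the
// inode's snaprealm and are encoded up to max_entries/max_bytes, each with an
// infinite lease; path2 carries the snapshot name to resume from when the
// client paginates.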
10498 void Server::handle_client_lssnap(MDRequestRef& mdr)
10499 {
10500 const cref_t<MClientRequest> &req = mdr->client_request;
10501
10502 // traverse to path
10503 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10504 if (!diri)
10505 return;
10506
10507 if (!diri->is_dir()) {
10508 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10509 return;
10510 }
10511 dout(10) << "lssnap on " << *diri << dendl;
10512
10513 // lock snap
10514 if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
10515 return;
10516
10517 if (!check_access(mdr, diri, MAY_READ))
10518 return;
10519
10520 SnapRealm *realm = diri->find_snaprealm();
10521 map<snapid_t,const SnapInfo*> infomap;
10522 realm->get_snap_info(infomap, diri->get_oldest_snap());
10523
10524 unsigned max_entries = req->head.args.readdir.max_entries;
10525 if (!max_entries)
10526 max_entries = infomap.size();
10527 int max_bytes = req->head.args.readdir.max_bytes;
10528 if (!max_bytes)
10529 // make sure at least one item can be encoded
10530 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
10531
10532 __u64 last_snapid = 0;
10533 string offset_str = req->get_path2();
10534 if (!offset_str.empty())
10535 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
10536
10537 //Empty DirStat
10538 bufferlist dirbl;
10539 static DirStat empty;
10540 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
10541
10542 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
10543
10544 __u32 num = 0;
10545 bufferlist dnbl;
10546 auto p = infomap.upper_bound(last_snapid);
10547 for (; p != infomap.end() && num < max_entries; ++p) {
10548 dout(10) << p->first << " -> " << *p->second << dendl;
10549
10550 // actual
10551 string snap_name;
10552 if (p->second->ino == diri->ino())
10553 snap_name = p->second->name;
10554 else
10555 snap_name = p->second->get_long_name();
10556
10557 unsigned start_len = dnbl.length();
10558 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
10559 break;
10560
10561 encode(snap_name, dnbl);
10562 //infinite lease
10563 LeaseStat e(CEPH_LEASE_VALID, -1, 0);
10564 mds->locker->encode_lease(dnbl, mdr->session->info, e);
10565 dout(20) << "encode_infinite_lease" << dendl;
10566
10567 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
10568 if (r < 0) {
10569 bufferlist keep;
10570 keep.substr_of(dnbl, 0, start_len);
10571 dnbl.swap(keep);
10572 break;
10573 }
10574 ++num;
10575 }
10576
10577 encode(num, dirbl);
10578 __u16 flags = 0;
10579 if (p == infomap.end()) {
10580 flags = CEPH_READDIR_FRAG_END;
10581 if (last_snapid == 0)
10582 flags |= CEPH_READDIR_FRAG_COMPLETE;
10583 }
10584 encode(flags, dirbl);
10585 dirbl.claim_append(dnbl);
10586
10587 mdr->reply_extra_bl = dirbl;
10588 mdr->tracei = diri;
10589 respond_to_request(mdr, 0);
10590 }
10591
10592
10593 // MKSNAP
10594
10595 struct C_MDS_mksnap_finish : public ServerLogContext {
10596 CInode *diri;
10597 SnapInfo info;
10598 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
10599 ServerLogContext(s, r), diri(di), info(i) {}
10600 void finish(int r) override {
10601 server->_mksnap_finish(mdr, diri, info);
10602 }
10603 };
10604
10605 /* This function takes responsibility for the passed mdr*/
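// Creates a snapshot on a directory.  Effectively runs in two passes: the
// first acquires locks and asks the snap table to prepare a new snapid
// (retrying the request when the stid arrives), the second journals the inode
// and snaprealm updates as an EUpdate("mksnap").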
10606 void Server::handle_client_mksnap(MDRequestRef& mdr)
10607 {
10608 const cref_t<MClientRequest> &req = mdr->client_request;
10609 // make sure we have as new a map as the client
10610 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
10611 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
10612 return;
10613 }
10614 if (!mds->mdsmap->allows_snaps()) {
10615 // snapshot creation must be explicitly enabled on the file system before mksnap is allowed
10616 dout(5) << "new snapshots are disabled for this fs" << dendl;
10617 respond_to_request(mdr, -CEPHFS_EPERM);
10618 return;
10619 }
10620
10621 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10622 if (!diri)
10623 return;
10624
10625 // dir only
10626 if (!diri->is_dir()) {
10627 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10628 return;
10629 }
10630 if (diri->is_system() && !diri->is_root()) {
10631 // no snaps in system dirs (root is ok)
10632 dout(5) << "is an internal system dir" << dendl;
10633 respond_to_request(mdr, -CEPHFS_EPERM);
10634 return;
10635 }
10636
10637 std::string_view snapname = req->get_filepath().last_dentry();
10638
10639 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10640 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
10641 respond_to_request(mdr, -CEPHFS_EPERM);
10642 return;
10643 }
10644
10645 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
10646
10647 // lock snap
10648 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10649 MutationImpl::LockOpVec lov;
10650 lov.add_xlock(&diri->snaplock);
10651 if (!mds->locker->acquire_locks(mdr, lov))
10652 return;
10653
10654 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10655 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10656 return;
10657 }
10658 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10659 }
10660
10661 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10662 return;
10663
10664 if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
10665 (subvol_ino && subvol_ino != diri->ino())) {
10666 dout(5) << "is a descendent of a subvolume dir" << dendl;
10667 respond_to_request(mdr, -CEPHFS_EPERM);
10668 return;
10669 }
10670
10671 // check if we can create any more snapshots
10672 // we don't allow any more if we are already at or beyond the limit
10673 if (diri->snaprealm &&
10674 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
10675 respond_to_request(mdr, -CEPHFS_EMLINK);
10676 return;
10677 }
10678
10679 // make sure name is unique
10680 if (diri->snaprealm &&
10681 diri->snaprealm->exists(snapname)) {
10682 respond_to_request(mdr, -CEPHFS_EEXIST);
10683 return;
10684 }
10685 if (snapname.length() == 0 ||
10686 snapname[0] == '_') {
10687 respond_to_request(mdr, -CEPHFS_EINVAL);
10688 return;
10689 }
10690
10691 // allocate a snapid
10692 if (!mdr->more()->stid) {
10693 // prepare an stid
10694 mds->snapclient->prepare_create(diri->ino(), snapname,
10695 mdr->get_mds_stamp(),
10696 &mdr->more()->stid, &mdr->more()->snapidbl,
10697 new C_MDS_RetryRequest(mdcache, mdr));
10698 return;
10699 }
10700
10701 version_t stid = mdr->more()->stid;
10702 snapid_t snapid;
10703 auto p = mdr->more()->snapidbl.cbegin();
10704 decode(snapid, p);
10705 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
10706
10707 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10708
10709 SnapPayload payload;
10710 if (req->get_data().length()) {
10711 try {
10712 auto iter = req->get_data().cbegin();
10713 decode(payload, iter);
10714 } catch (const ceph::buffer::error &e) {
10715 // backward compat -- client sends xattr bufferlist. however,
10716 // that is not used anywhere -- so (log and) ignore.
10717 dout(20) << ": no metadata in payload (old client?)" << dendl;
10718 }
10719 }
10720
10721 // journal
10722 SnapInfo info;
10723 info.ino = diri->ino();
10724 info.snapid = snapid;
10725 info.name = snapname;
10726 info.stamp = mdr->get_op_stamp();
10727 info.metadata = payload.metadata;
10728
10729 auto pi = diri->project_inode(mdr, false, true);
10730 pi.inode->ctime = info.stamp;
10731 if (info.stamp > pi.inode->rstat.rctime)
10732 pi.inode->rstat.rctime = info.stamp;
10733 pi.inode->rstat.rsnaps++;
10734 pi.inode->version = diri->pre_dirty();
10735
10736 // project the snaprealm
10737 auto &newsnap = *pi.snapnode;
10738 newsnap.created = snapid;
10739 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
10740 if (!em.second)
10741 em.first->second = info;
10742 newsnap.seq = snapid;
10743 newsnap.last_created = snapid;
10744
10745 // journal the inode changes
10746 mdr->ls = mdlog->get_current_segment();
10747 EUpdate *le = new EUpdate(mdlog, "mksnap");
10748 mdlog->start_entry(le);
10749
10750 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10751 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10752 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10753 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10754
10755 // journal the snaprealm changes
10756 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
10757 mdr, __func__);
10758 mdlog->flush();
10759 }
10760
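// Journal-commit callback for mksnap: applies the projected changes, commits
// the snap table transaction, notifies other MDS ranks and clients of the
// updated snaprealm, and replies with the new snapid.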
10761 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
10762 {
10763 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
10764
10765 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
10766
10767 mdr->apply();
10768
10769 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
10770
10771 // create snap
10772 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10773
10774 // notify other mds
10775 mdcache->send_snap_update(diri, mdr->more()->stid, op);
10776
10777 mdcache->do_realm_invalidate_and_update_notify(diri, op);
10778
10779 // yay
10780 mdr->in[0] = diri;
10781 mdr->snapid = info.snapid;
10782 mdr->tracei = diri;
10783 respond_to_request(mdr, 0);
10784 }
10785
10786
10787 // RMSNAP
10788
10789 struct C_MDS_rmsnap_finish : public ServerLogContext {
10790 CInode *diri;
10791 snapid_t snapid;
10792 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10793 ServerLogContext(s, r), diri(di), snapid(sn) {}
10794 void finish(int r) override {
10795 server->_rmsnap_finish(mdr, diri, snapid);
10796 }
10797 };
10798
10799 /* This function takes responsibility for the passed mdr*/
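// Removes a named snapshot from a directory, using the same two-pass pattern
// as mksnap: prepare_destroy() against the snap table first, then journal the
// inode and snaprealm updates as an EUpdate("rmsnap").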
10800 void Server::handle_client_rmsnap(MDRequestRef& mdr)
10801 {
10802 const cref_t<MClientRequest> &req = mdr->client_request;
10803
10804 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10805 if (!diri)
10806 return;
10807
10808 if (!diri->is_dir()) {
10809 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10810 return;
10811 }
10812
10813 std::string_view snapname = req->get_filepath().last_dentry();
10814
10815 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10816 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
10817 respond_to_request(mdr, -CEPHFS_EPERM);
10818 return;
10819 }
10820
10821 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
10822
10823 // does snap exist?
10824 if (snapname.length() == 0 || snapname[0] == '_') {
10825 respond_to_request(mdr, -CEPHFS_EINVAL); // can't prune a parent snap, currently.
10826 return;
10827 }
10828 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
10829 respond_to_request(mdr, -CEPHFS_ENOENT);
10830 return;
10831 }
10832 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
10833 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
10834
10835 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10836 MutationImpl::LockOpVec lov;
10837 lov.add_xlock(&diri->snaplock);
10838 if (!mds->locker->acquire_locks(mdr, lov))
10839 return;
10840 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10841 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10842 return;
10843 }
10844 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10845 }
10846
10847 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10848 return;
10849
10850 // prepare
10851 if (!mdr->more()->stid) {
10852 mds->snapclient->prepare_destroy(diri->ino(), snapid,
10853 &mdr->more()->stid, &mdr->more()->snapidbl,
10854 new C_MDS_RetryRequest(mdcache, mdr));
10855 return;
10856 }
10857 version_t stid = mdr->more()->stid;
10858 auto p = mdr->more()->snapidbl.cbegin();
10859 snapid_t seq;
10860 decode(seq, p);
10861 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
10862
10863 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10864
10865 // journal
10866 auto pi = diri->project_inode(mdr, false, true);
10867 pi.inode->version = diri->pre_dirty();
10868 pi.inode->ctime = mdr->get_op_stamp();
10869 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
10870 pi.inode->rstat.rctime = mdr->get_op_stamp();
10871 pi.inode->rstat.rsnaps--;
10872
10873 mdr->ls = mdlog->get_current_segment();
10874 EUpdate *le = new EUpdate(mdlog, "rmsnap");
10875 mdlog->start_entry(le);
10876
10877 // project the snaprealm
10878 auto &newnode = *pi.snapnode;
10879 newnode.snaps.erase(snapid);
10880 newnode.seq = seq;
10881 newnode.last_destroyed = seq;
10882
10883 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10884 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10885 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10886 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10887
10888 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
10889 mdr, __func__);
10890 mdlog->flush();
10891 }
10892
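// Journal-commit callback for rmsnap: applies the changes, commits the snap
// table transaction, sends CEPH_SNAP_OP_DESTROY notifications, replies, and
// purges stale snapshot data on the directory inode.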
10893 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
10894 {
10895 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
10896 snapid_t stid = mdr->more()->stid;
10897 auto p = mdr->more()->snapidbl.cbegin();
10898 snapid_t seq;
10899 decode(seq, p);
10900
10901 mdr->apply();
10902
10903 mds->snapclient->commit(stid, mdr->ls);
10904
10905 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10906
10907 // notify other mds
10908 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
10909
10910 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
10911
10912 // yay
10913 mdr->in[0] = diri;
10914 respond_to_request(mdr, 0);
10915
10916 // purge snapshot data
10917 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
10918 }
10919
10920 struct C_MDS_renamesnap_finish : public ServerLogContext {
10921 CInode *diri;
10922 snapid_t snapid;
10923 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10924 ServerLogContext(s, r), diri(di), snapid(sn) {}
10925 void finish(int r) override {
10926 server->_renamesnap_finish(mdr, diri, snapid);
10927 }
10928 };
10929
10930 /* This function takes responsibility for the passed mdr*/
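// Renames an existing snapshot: both file paths must refer to the same
// directory inode; the snap table is updated via prepare_update() and the new
// name is journaled as an EUpdate("renamesnap").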
10931 void Server::handle_client_renamesnap(MDRequestRef& mdr)
10932 {
10933 const cref_t<MClientRequest> &req = mdr->client_request;
10934 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
10935 respond_to_request(mdr, -CEPHFS_EINVAL);
10936 return;
10937 }
10938
10939 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10940 if (!diri)
10941 return;
10942
10943 if (!diri->is_dir()) { // dir only
10944 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10945 return;
10946 }
10947
10948 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
10949 mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10950 respond_to_request(mdr, -CEPHFS_EPERM);
10951 return;
10952 }
10953
10954 std::string_view dstname = req->get_filepath().last_dentry();
10955 std::string_view srcname = req->get_filepath2().last_dentry();
10956 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
10957
10958 if (srcname.length() == 0 || srcname[0] == '_') {
10959 respond_to_request(mdr, -CEPHFS_EINVAL); // can't rename a parent snap.
10960 return;
10961 }
10962 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
10963 respond_to_request(mdr, -CEPHFS_ENOENT);
10964 return;
10965 }
10966 if (dstname.length() == 0 || dstname[0] == '_') {
10967 respond_to_request(mdr, -CEPHFS_EINVAL);
10968 return;
10969 }
10970 if (diri->snaprealm->exists(dstname)) {
10971 respond_to_request(mdr, -CEPHFS_EEXIST);
10972 return;
10973 }
10974
10975 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
10976 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
10977
10978 // lock snap
10979 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10980 MutationImpl::LockOpVec lov;
10981 lov.add_xlock(&diri->snaplock);
10982 if (!mds->locker->acquire_locks(mdr, lov))
10983 return;
10984 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10985 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10986 return;
10987 }
10988 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10989 }
10990
10991 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10992 return;
10993
10994 // prepare
10995 if (!mdr->more()->stid) {
10996 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
10997 &mdr->more()->stid,
10998 new C_MDS_RetryRequest(mdcache, mdr));
10999 return;
11000 }
11001
11002 version_t stid = mdr->more()->stid;
11003 dout(10) << " stid is " << stid << dendl;
11004
11005 ceph_assert(mds->snapclient->get_cached_version() >= stid);
11006
11007 // journal
11008 auto pi = diri->project_inode(mdr, false, true);
11009 pi.inode->ctime = mdr->get_op_stamp();
11010 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
11011 pi.inode->rstat.rctime = mdr->get_op_stamp();
11012 pi.inode->version = diri->pre_dirty();
11013
11014 // project the snaprealm
11015 auto &newsnap = *pi.snapnode;
11016 auto it = newsnap.snaps.find(snapid);
11017 ceph_assert(it != newsnap.snaps.end());
11018 it->second.name = dstname;
11019
11020 // journal the inode changes
11021 mdr->ls = mdlog->get_current_segment();
11022 EUpdate *le = new EUpdate(mdlog, "renamesnap");
11023 mdlog->start_entry(le);
11024
11025 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
11026 le->metablob.add_table_transaction(TABLE_SNAP, stid);
11027 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
11028 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
11029
11030 // journal the snaprealm changes
11031 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
11032 mdr, __func__);
11033 mdlog->flush();
11034 }
11035
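// Journal-commit callback for renamesnap: applies the changes, commits the
// snap table transaction, sends CEPH_SNAP_OP_UPDATE notifications, and replies
// with the directory trace and snapid.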
11036 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
11037 {
11038 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
11039
11040 mdr->apply();
11041
11042 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
11043
11044 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
11045
11046 // notify other mds
11047 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
11048
11049 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
11050
11051 // yay
11052 mdr->in[0] = diri;
11053 mdr->tracei = diri;
11054 mdr->snapid = snapid;
11055 respond_to_request(mdr, 0);
11056 }
11057
11058 /**
11059 * Return true if server is in state RECONNECT and this
11060 * client has not yet reconnected.
11061 */
11062 bool Server::waiting_for_reconnect(client_t c) const
11063 {
11064 return client_reconnect_gather.count(c) > 0;
11065 }
11066
11067 void Server::dump_reconnect_status(Formatter *f) const
11068 {
11069 f->open_object_section("reconnect_status");
11070 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
11071 f->close_section();
11072 }