// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/ceph_assert.h"  // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/range/adaptor/reversed.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"
#include "MetricsHandler.h"
#include "cephfs_features.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "events/EUpdate.h"
#include "events/EPeerUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"
#include "events/EPurged.h"

#include "include/stringify.h"
#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"

#include <errno.h>

#include <list>
#include <regex>
#include <string_view>
#include <functional>

#include "common/config.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "

class ServerContext : public MDSContext {
  protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  public:
  explicit ServerContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
};

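// Batches concurrent getattr/lookup requests aimed at the same inode or
// dentry: the front request (mdr) drives the actual traversal, and every
// request queued via add_request() is answered with the front request's
// trace when _respond() runs.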
class Batch_Getattr_Lookup : public BatchOp {
protected:
  Server* server;
  ceph::ref_t<MDRequestImpl> mdr;
  std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
  int res = 0;
public:
  Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
    : server(s), mdr(r) {
    if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP)
      mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
    else
      mdr->batch_op_map = &mdr->in[0]->batch_ops;
  }
  void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
    batch_reqs.push_back(r);
  }
  ceph::ref_t<MDRequestImpl> find_new_head() override {
    while (!batch_reqs.empty()) {
      auto r = std::move(batch_reqs.back());
      batch_reqs.pop_back();
      if (r->killed)
        continue;

      r->batch_op_map = mdr->batch_op_map;
      mdr->batch_op_map = nullptr;
      mdr = r;
      return mdr;
    }
    return nullptr;
  }
  void _forward(mds_rank_t t) override {
    MDCache* mdcache = server->mdcache;
    mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
    mdr->set_mds_stamp(ceph_clock_now());
    for (auto& m : batch_reqs) {
      if (!m->killed)
        mdcache->request_forward(m, t);
    }
    batch_reqs.clear();
  }
  void _respond(int r) override {
    mdr->set_mds_stamp(ceph_clock_now());
    for (auto& m : batch_reqs) {
      if (!m->killed) {
        m->tracei = mdr->tracei;
        m->tracedn = mdr->tracedn;
        server->respond_to_request(m, r);
      }
    }
    batch_reqs.clear();
    server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
  }
  void print(std::ostream& o) {
    o << "[batch front=" << *mdr << "]";
  }
};

class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    ceph_assert(server != NULL);
  }
};

void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);

  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_peer_request, "handle_peer_request",
                      "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_client_session,
                      "handle_client_session", "Client session messages", "hcs",
                      PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
                      "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
                      "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
                      PerfCountersBuilder::PRIO_INTERESTING);

  // fop latencies are useful
  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
                   "Request type lookup hash of inode latency");
  plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
                   "Request type lookup inode latency");
  plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
                   "Request type lookup parent latency");
  plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
                   "Request type lookup name latency");
  plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
                   "Request type lookup latency");
  plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
                   "Request type lookup snapshot latency");
  plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
                   "Request type get attribute latency");
  plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
                   "Request type set attribute latency");
  plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
                   "Request type set file layout latency");
  plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
                   "Request type set directory layout latency");
  plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
                   "Request type set extended attribute latency");
  plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
                   "Request type remove extended attribute latency");
  plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
                   "Request type read directory latency");
  plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
                   "Request type set file lock latency");
  plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
                   "Request type get file lock latency");
  plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
                   "Request type create latency");
  plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
                   "Request type open latency");
  plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
                   "Request type make node latency");
  plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
                   "Request type link latency");
  plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
                   "Request type unlink latency");
  plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
                   "Request type remove directory latency");
  plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
                   "Request type rename latency");
  plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
                   "Request type make directory latency");
  plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
                   "Request type symbolic link latency");
  plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
                   "Request type list snapshot latency");
  plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
                   "Request type make snapshot latency");
  plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
                   "Request type remove snapshot latency");
  plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
                   "Request type rename snapshot latency");

  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
                      "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_peer_request, "dispatch_server_request",
                      "Server requests dispatched");

  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}

Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
  metrics_handler(metrics_handler)
{
  forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
  replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
  cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
  max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
  delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
  max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
  max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
  caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
}

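// Top-level entry point for client session, client request, reconnect, and
// peer (inter-MDS) request messages. While the rank is not yet active,
// replayed/completed requests from reconnecting clients are queued for the
// clientreplay stage and everything else waits for the MDS to become active.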
void Server::dispatch(const cref_t<Message> &m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(ref_cast<MClientReconnect>(m));
    return;
  }

/*
 * In the reconnect phase, clients send unsafe requests to the MDS before the
 * reconnect msg. Setting sessionclosed_isok handles scenarios like this:
 *
 * 1. In the reconnect phase, a client sends unsafe requests to the MDS.
 * 2. The reconnect timeout is reached. All sessions that did not send the
 *    reconnect msg in time, some of which may have sent unsafe requests, are
 *    marked as closed. (Another situation is #31668, which denies all client
 *    reconnect msgs to speed up reboot.)
 * 3. The unsafe requests from sessions that did not send the reconnect msg in
 *    time, or that were denied, can then be handled in the clientreplay phase.
 */
  bool sessionclosed_isok = replay_unsafe_with_closed_session;
  // active?
  // handle_peer_request()/handle_client_session() will wait if necessary
  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
    const auto &req = ref_cast<MClientRequest>(m);
    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
      Session *session = mds->get_session(req);
      if (!session || (!session->is_open() && !sessionclosed_isok)) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        return;
      }
      bool queue_replay = false;
      if (req->is_replay() || req->is_async()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
        if (req->head.ino &&
            !session->have_completed_request(req->get_reqid().tid, nullptr)) {
          inodeno_t ino(req->head.ino);
          mdcache->add_replay_ino_alloc(ino);
          if (replay_unsafe_with_closed_session &&
              session->free_prealloc_inos.contains(ino)) {
            // don't purge inodes that will be created by later replay
            session->free_prealloc_inos.erase(ino);
            session->delegated_inos.insert(ino);
          }
        }
      } else if (req->get_retry_attempt()) {
        // process completed requests in the clientreplay stage. A completed
        // request might have created a new file/directory. This guarantees
        // the MDS sends a reply to the client before another request
        // modifies the new file/directory.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (mds->is_stopping()) {
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      if (req->is_queued_for_replay()) {
        wait_for_active = false;
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(ref_cast<MClientRequest>(m));
    return;
  case CEPH_MSG_CLIENT_RECLAIM:
    handle_client_reclaim(ref_cast<MClientReclaim>(m));
    return;
  case MSG_MDS_PEER_REQUEST:
    handle_peer_request(ref_cast<MMDSPeerRequest>(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    ceph_abort_msg("server unknown message");
  }
}



// ----------------------------------------------------------
// SESSION management

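// Journal completion callback for session state changes: once the ESession
// event is safely logged, _session_logged() applies the open/close (freeing
// or purging preallocated inos as needed) and then the caller's context, if
// any, is completed.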
class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos_to_free;
  version_t inotablev;
  interval_set<inodeno_t> inos_to_purge;
  LogSegment *ls = nullptr;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = nullptr) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
                       const interval_set<inodeno_t>& to_free, version_t iv,
                       const interval_set<inodeno_t>& to_purge, LogSegment *_ls, Context *fin_ = nullptr) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv),
    inos_to_free(to_free), inotablev(iv), inos_to_purge(to_purge), ls(_ls), fin(fin_) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos_to_free, inotablev, inos_to_purge, ls);
    if (fin) {
      fin->complete(r);
    }
  }
};

Session* Server::find_session_by_uuid(std::string_view uuid)
{
  Session* session = nullptr;
  for (auto& it : mds->sessionmap.get_sessions()) {
    auto& metadata = it.second->info.client_metadata;

    auto p = metadata.find("uuid");
    if (p == metadata.end() || p->second != uuid)
      continue;

    if (!session) {
      session = it.second;
    } else if (!session->reclaiming_from) {
      assert(it.second->reclaiming_from == session);
      session = it.second;
    } else {
      assert(session->reclaiming_from == it.second);
    }
  }
  return session;
}

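// Session reclaim lets a new client instance (e.g. a restarted NFS-Ganesha
// gateway) take over the caps of an earlier session that advertised the same
// "uuid" metadata entry; find_session_by_uuid() above locates that older
// session, and finish_reclaim_session() evicts or kills it.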
void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
{
  if (!session->is_open() && !session->is_stale()) {
    dout(10) << "session not open, dropping this req" << dendl;
    return;
  }

  auto reply = make_message<MClientReclaimReply>(0);
  if (m->get_uuid().empty()) {
    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
    reply->set_result(-CEPHFS_EINVAL);
    mds->send_message_client(reply, session);
    return;
  }

  unsigned flags = m->get_flags();
  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
    dout(10) << __func__ << " unsupported flags" << dendl;
    reply->set_result(-CEPHFS_EOPNOTSUPP);
    mds->send_message_client(reply, session);
    return;
  }

  Session* target = find_session_by_uuid(m->get_uuid());
  if (target) {
    if (session->info.auth_name != target->info.auth_name) {
      dout(10) << __func__ << " session auth_name " << session->info.auth_name
               << " != target auth_name " << target->info.auth_name << dendl;
      reply->set_result(-CEPHFS_EPERM);
      mds->send_message_client(reply, session);
    }

    assert(!target->reclaiming_from);
    assert(!session->reclaiming_from);
    session->reclaiming_from = target;
    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
  }

  if (flags & CEPH_RECLAIM_RESET) {
    finish_reclaim_session(session, reply);
    return;
  }

  ceph_abort();
}

void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
{
  Session *target = session->reclaiming_from;
  if (target) {
    session->reclaiming_from = nullptr;

    Context *send_reply;
    if (reply) {
      int64_t session_id = session->get_client().v;
      send_reply = new LambdaContext([this, session_id, reply](int r) {
        assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
        Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
        if (!session) {
          return;
        }
        auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
        reply->set_epoch(epoch);
        mds->send_message_client(reply, session);
      });
    } else {
      send_reply = nullptr;
    }

    bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
      return map.is_blocklisted(target->info.inst.addr);
    });

    if (blocklisted || !g_conf()->mds_session_blocklist_on_evict) {
      kill_session(target, send_reply);
    } else {
      CachedStackStringStream css;
      mds->evict_client(target->get_client().v, false, true, *css, send_reply);
    }
  } else if (reply) {
    mds->send_message_client(reply, session);
  }
}

void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
{
  Session *session = mds->get_session(m);
  dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    return;
  }

  std::string_view fs_name = mds->get_fs_name();
  if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
    dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
    return;
  }

  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
    finish_reclaim_session(session);
  } else {
    reclaim_session(session, m);
  }
}

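// Handles session open/renew/close requests from clients. An open is
// validated first (blocklist, required feature bits, claimed "root" path
// within the session's caps, uuid uniqueness) and then journaled with an
// ESession event; the reply is sent from _session_logged().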
void Server::handle_client_session(const cref_t<MClientSession> &m)
{
  version_t pv;
  Session *session = mds->get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
    reply->metadata["error_string"] = "sessionless";
    mds->send_message(reply, m->get_connection());
    return;
  }

  std::string_view fs_name = mds->get_fs_name();
  if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
    dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
    reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
                                      std::string(fs_name) + "\"";
    mds->send_message(std::move(reply), m->get_connection());
    return;
  }

  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
    // close requests need to be handled when mds is active
    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  } else {
    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing() ||
        terminating_sessions) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      return;
    }
    ceph_assert(session->is_closed() || session->is_closing());

    if (mds->is_stopping()) {
      dout(10) << "mds is stopping, dropping open req" << dendl;
      return;
    }

    {
      auto& addr = session->info.inst.addr;
      session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
      auto& client_metadata = session->info.client_metadata;

      auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
        auto now = ceph_clock_now();
        auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
        auto elapsed = now - m->get_recv_stamp();
        CachedStackStringStream css;
        *css << "New client session:"
             << " addr=\"" << session->info.inst.addr << "\""
             << ",elapsed=" << elapsed
             << ",throttled=" << throttle_elapsed
             << ",status=\"" << status << "\"";
        if (!err.empty()) {
          *css << ",error=\"" << err << "\"";
        }
        const auto& metadata = session->info.client_metadata;
        if (auto it = metadata.find("root"); it != metadata.end()) {
          *css << ",root=\"" << it->second << "\"";
        }
        dout(2) << css->strv() << dendl;
      };

      auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str, unsigned flags=0) {
        auto m = make_message<MClientSession>(CEPH_SESSION_REJECT, 0, flags);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          m->metadata["error_string"] = err_str;
        mds->send_message_client(m, session);
        log_session_status("REJECTED", err_str);
      };

      bool blocklisted = mds->objecter->with_osdmap(
          [&addr](const OSDMap &osd_map) -> bool {
            return osd_map.is_blocklisted(addr);
          });

      if (blocklisted) {
        dout(10) << "rejecting blocklisted client " << addr << dendl;
        // This goes on the wire and the "blacklisted" substring is
        // depended upon by the kernel client for detecting whether it
        // has been blocklisted. If mounted with recover_session=clean
        // (since 5.4), it tries to automatically recover itself from
        // blocklisting.
        unsigned flags = 0;
        flags |= MClientSession::SESSION_BLOCKLISTED;
        send_reject_message("blocklisted (blacklisted)", flags);
        session->clear();
        break;
      }

      if (client_metadata.features.empty())
        infer_supported_features(session, client_metadata);

      dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
      dout(20) << " features: '" << client_metadata.features << "'" << dendl;
      dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
      for (const auto& p : client_metadata) {
        dout(20) << "  " << p.first << ": " << p.second << dendl;
      }

      feature_bitset_t missing_features = required_client_features;
      missing_features -= client_metadata.features;
      if (!missing_features.empty()) {
        CachedStackStringStream css;
        *css << "missing required features '" << missing_features << "'";
        send_reject_message(css->strv());
        mds->clog->warn() << "client session (" << session->info.inst
                          << ") lacks required features " << missing_features
                          << "; client supports " << client_metadata.features;
        session->clear();
        break;
      }

      // Special case for the 'root' metadata path; validate that the claimed
      // root is actually within the caps of the session
      if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
        auto claimed_root = it->second;
        CachedStackStringStream css;
        bool denied = false;
        // claimed_root has a leading "/" which we strip before passing
        // into caps check
        if (claimed_root.empty() || claimed_root[0] != '/') {
          denied = true;
          *css << "invalid root '" << claimed_root << "'";
        } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
          denied = true;
          *css << "non-allowable root '" << claimed_root << "'";
        }

        if (denied) {
          // Tell the client we're rejecting their open
          send_reject_message(css->strv());
          mds->clog->warn() << "client session with " << css->strv()
                            << " denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
        if (find_session_by_uuid(it->second)) {
          send_reject_message("duplicated session uuid");
          mds->clog->warn() << "client session with duplicated session uuid '"
                            << it->second << "' denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (session->is_closed()) {
        mds->sessionmap.add_session(session);
      }

      pv = mds->sessionmap.mark_projected(session);
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      mds->sessionmap.touch_session(session);
      auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
        ceph_assert(r == 0);
        log_session_status("ACCEPTED", "");
      });
      mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
                                new C_MDS_session_finish(this, session, sseq, true, pv, fin));
      mdlog->flush();
    }
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() || session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
      mds->send_message_client(reply, session);
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        return;
      }
      ceph_assert(session->is_open() ||
                  session->is_stale() ||
                  session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle the same as any other seq error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    if (mds->is_active())
      mdlog->flush();
    break;

  default:
    ceph_abort();
  }
}

void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
  if (!session->is_open() ||
      !session->get_connection() ||
      !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
    return;
  }

  version_t seq = session->wait_for_flush(gather.new_sub());
  mds->send_message_client(
    make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
}

void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
  for (const auto& client : client_set) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
    ceph_assert(session);
    flush_session(session, gather);
  }
}

void Server::finish_flush_session(Session *session, version_t seq)
{
  MDSContext::vec finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}

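// Called once an ESession event has been safely journaled. For an open,
// moves the session to STATE_OPEN and sends CEPH_SESSION_OPEN; for a
// close/kill, releases the session's caps and leases, frees or purges its
// preallocated inos, and removes it from the session map.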
void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             const interval_set<inodeno_t>& inos_to_free, version_t piv,
                             const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
{
  dout(10) << "_session_logged " << session->info.inst
           << " state_seq " << state_seq
           << " " << (open ? "open":"close") << " " << pv
           << " inos_to_free " << inos_to_free << " inotablev " << piv
           << " inos_to_purge " << inos_to_purge << dendl;

  if (!open) {
    if (inos_to_purge.size()){
      ceph_assert(ls);
      session->info.prealloc_inos.subtract(inos_to_purge);
      ls->purging_inodes.insert(inos_to_purge);
      if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())
        mdcache->purge_inodes(inos_to_purge, ls);
    }

    if (inos_to_free.size()) {
      ceph_assert(piv);
      ceph_assert(session->is_closing() || session->is_killing() ||
                  session->is_opening()); // re-open closing session
      session->info.prealloc_inos.subtract(inos_to_free);
      mds->inotable->apply_release_ids(inos_to_free);
      ceph_assert(mds->inotable->get_version() == piv);
    }
    session->free_prealloc_inos = session->info.prealloc_inos;
    session->delegated_inos.clear();
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    ceph_assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    metrics_handler->add_session(session);
    ceph_assert(session->get_connection());
    auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
      reply->supported_features = supported_features;
    mds->send_message_client(reply, session);
    if (mdcache->is_readonly()) {
      auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
      mds->send_message_client(m, session);
    }
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    bool killing = session->is_killing();
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, cap, killing);
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }
    if (client_reclaim_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reclaim set" << dendl;
      if (client_reclaim_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
        mds->maybe_clientreplay_done();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->get_connection()) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->get_connection()->mark_disposable();
      }

      // reset session
      mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      metrics_handler->remove_session(session);
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->get_connection()) {
        session->get_connection()->mark_down();
        mds->sessionmap.set_state(session, Session::STATE_CLOSED);
        session->set_connection(nullptr);
      }
      metrics_handler->remove_session(session);
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t,client_metadata_t>& cmm,
                                              map<client_t, pair<Session*,uint64_t> >& smap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;

  mds->objecter->with_osdmap(
      [this, &cm, &cmm](const OSDMap &osd_map) {
        for (auto p = cm.begin(); p != cm.end(); ) {
          if (osd_map.is_blocklisted(p->second.addr)) {
            dout(10) << " ignoring blocklisted client." << p->first
                     << " (" << p->second.addr << ")" << dendl;
            cmm.erase(p->first);
            cm.erase(p++);
          } else {
            ++p;
          }
        }
      });

  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    uint64_t sseq;
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing()) {
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      auto q = cmm.find(p->first);
      if (q != cmm.end())
        session->info.client_metadata.merge(q->second);
    } else {
      ceph_assert(session->is_open() ||
                  session->is_opening() ||
                  session->is_stale());
      sseq = 0;
    }
    smap[p->first] = make_pair(session, sseq);
    session->inc_importing();
  }
  return pv;
}

void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  for (auto &it : smap) {
    Session *session = it.second.first;
    uint64_t sseq = it.second.second;
    if (sseq > 0) {
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);
        metrics_handler->add_session(session);

        auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          reply->supported_features = supported_features;
        mds->send_message_client(reply, session);

        if (mdcache->is_readonly())
          mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      ceph_assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
  public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(5) << "terminating all sessions..." << dendl;

  terminating_sessions = true;

  // kill them off. clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}

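// Periodic scan for idle client sessions, walking each state set oldest-first
// by last cap renewal: open sessions idle past the session timeout are marked
// STALE (and have their caps revoked), while sessions idle past
// session_autoclose are evicted outright.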
void Server::find_idle_sessions()
{
  auto now = clock::now();
  auto last_cleared_laggy = mds->last_cleared_laggy();

  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;

  // timeout/stale
  //  (caps go stale, leases die)
  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();

  // don't kick clients if we've been laggy
  if (last_cleared_laggy < cutoff) {
    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
             << "), not marking any client stale" << dendl;
    return;
  }

  std::vector<Session*> to_evict;

  bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
    std::vector<Session*> new_stale;

    for (auto session : *(sessions_p1->second)) {
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "laggiest active session is " << session->info.inst
                 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
        break;
      }

      if (session->last_seen > session->last_cap_renew) {
        last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
        if (last_cap_renew_span < cutoff) {
          dout(20) << "laggiest active session is " << session->info.inst
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }
      }

      if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
        dout(20) << "evicting session " << session->info.inst << " since autoclose "
                    "has arrived" << dendl;
        // evict session without marking it stale
        to_evict.push_back(session);
        continue;
      }

      if (defer_session_stale &&
          !session->is_any_flush_waiter() &&
          !mds->locker->is_revoking_any_caps_from(session->get_client())) {
        dout(20) << "deferring marking session " << session->info.inst << " stale "
                    "since it holds no caps" << dendl;
        continue;
      }

      auto it = session->info.client_metadata.find("timeout");
      if (it != session->info.client_metadata.end()) {
        unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
        if (timeout == 0) {
          dout(10) << "skipping session " << session->info.inst
                   << ", infinite timeout specified" << dendl;
          continue;
        }
        double cutoff = queue_max_age + timeout;
        if (last_cap_renew_span < cutoff) {
          dout(10) << "skipping session " << session->info.inst
                   << ", timeout (" << timeout << ") specified"
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }

        // do not go through stale, evict it directly.
        to_evict.push_back(session);
      } else {
        dout(10) << "new stale session " << session->info.inst
                 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
        new_stale.push_back(session);
      }
    }

    for (auto session : new_stale) {
      mds->sessionmap.set_state(session, Session::STATE_STALE);
      if (mds->locker->revoke_stale_caps(session)) {
        mds->locker->remove_stale_leases(session);
        finish_flush_session(session, session->get_push_seq());
        auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
        mds->send_message_client(m, session);
      } else {
        to_evict.push_back(session);
      }
    }
  }

  // autoclose
  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();

  // Collect a list of sessions exceeding the autoclose threshold
  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
    for (auto session : *(sessions_p2->second)) {
      assert(session->is_stale());
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "oldest stale session is " << session->info.inst
                 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
        break;
      }
      to_evict.push_back(session);
    }
  }

  for (auto session: to_evict) {
    if (session->is_importing()) {
      dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
      continue;
    }

    auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << last_cap_renew_span << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst
             << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;

    if (g_conf()->mds_session_blocklist_on_timeout) {
      CachedStackStringStream css;
      mds->evict_client(session->get_client().v, false, true, *css, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}

void Server::evict_cap_revoke_non_responders() {
  if (!cap_revoke_eviction_timeout) {
    return;
  }

  auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);

  for (auto const &client: to_evict) {
    mds->clog->warn() << "client id " << client << " has not responded to"
                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
                      << " seconds, evicting";
    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
            << client << dendl;

    CachedStackStringStream css;
    bool evicted = mds->evict_client(client.v, false,
                                     g_conf()->mds_session_blocklist_on_evict,
                                     *css, nullptr);
    if (evicted && logger) {
      logger->inc(l_mdss_cap_revoke_eviction);
    }
  }
}

void Server::handle_conf_change(const std::set<std::string>& changed) {
  if (changed.count("mds_forward_all_requests_to_auth")){
    forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
  }
  if (changed.count("mds_cap_revoke_eviction_timeout")) {
    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
    dout(20) << __func__ << " cap revoke eviction timeout changed to "
             << cap_revoke_eviction_timeout << dendl;
  }
  if (changed.count("mds_recall_max_decay_rate")) {
    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
  }
  if (changed.count("mds_max_snaps_per_dir")) {
    max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
    dout(20) << __func__ << " max snapshots per directory changed to "
             << max_snaps_per_dir << dendl;
  }
  if (changed.count("mds_client_delegate_inos_pct")) {
    delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
  }
  if (changed.count("mds_max_caps_per_client")) {
    max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  }
  if (changed.count("mds_session_cap_acquisition_throttle")) {
    cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
  }
  if (changed.count("mds_session_max_caps_throttle_ratio")) {
    max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
  }
  if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
    caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
  }
  if (changed.count("mds_alternate_name_max")) {
    alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
  }
}

/*
 * XXX bump in the interface here, not using an MDSContext here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe)
{
  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    if (session->is_closing() ||
        session->is_killing()) {
      if (on_safe)
        mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
    } else {
      ceph_assert(session->is_closed() ||
                  session->is_importing());
      if (on_safe)
        on_safe->complete(0);
    }
  }
}

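// Kill the sessions of any connected clients whose address appears in the
// given set of blocklisted addresses from the OSDMap. Pre-nautilus OSDMaps
// stored blocklist entries as TYPE_LEGACY addrs, so both address types are
// checked.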
size_t Server::apply_blocklist(const std::set<entity_addr_t> &blocklist)
{
  bool prenautilus = mds->objecter->with_osdmap(
      [&](const OSDMap& o) {
        return o.require_osd_release < ceph_release_t::nautilus;
      });

  std::vector<Session*> victims;
  const auto& sessions = mds->sessionmap.get_sessions();
  for (const auto& p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blocklist to MDS daemons, we find out
      // about their death via MDSMap.
      continue;
    }

    Session *s = p.second;
    auto inst_addr = s->info.inst.addr;
    // blocklist entries are always TYPE_ANY for nautilus+
    inst_addr.set_type(entity_addr_t::TYPE_ANY);
    if (blocklist.count(inst_addr)) {
      victims.push_back(s);
      continue;
    }
    if (prenautilus) {
      // ...except pre-nautilus, they were TYPE_LEGACY
      inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
      if (blocklist.count(inst_addr)) {
        victims.push_back(s);
      }
    }
  }

  for (const auto& s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blocklist: killed " << victims.size() << dendl;

  return victims.size();
}

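// Journal a session close or kill: project the release of the session's
// preallocated inos in the inotable, submit an ESession event (completed by
// C_MDS_session_finish above), and kill any requests still in flight.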
void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  dout(10) << __func__ << " : "
           << session->info.inst
           << " pending_prealloc_inos " << session->pending_prealloc_inos
           << " free_prealloc_inos " << session->free_prealloc_inos
           << " delegated_inos " << session->delegated_inos << dendl;

  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> inos_to_free;
  inos_to_free.insert(session->pending_prealloc_inos);
  inos_to_free.insert(session->free_prealloc_inos);
  if (inos_to_free.size()) {
    mds->inotable->project_release_ids(inos_to_free);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
  auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
                                      session->delegated_inos, mdlog->get_current_segment(), on_safe);
  mdlog->start_submit_entry(le, fin);
  mdlog->flush();

  // clean up requests, too
  while(!session->requests.empty()) {
    auto mdr = MDRequestRef(*session->requests.begin());
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}

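// Invoked when the rank enters the reconnect state: every open session is
// added to client_reconnect_gather, and reconnect_gather_finish() runs once
// the gather drains (immediately, if there are no open sessions).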
void Server::reconnect_clients(MDSContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;

  auto now = clock::now();
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto session : sessions) {
    if (session->is_open()) {
      client_reconnect_gather.insert(session->get_client());
      session->set_reconnecting(true);
      session->last_cap_renew = now;
    }
  }

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = now;
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

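// Validates a client's reconnect attempt. The attempt is denied with a
// CEPH_SESSION_CLOSE if the rank is no longer in the reconnect state, the
// client lacks required features, or mds_deny_all_reconnect is set;
// otherwise the session is confirmed with CEPH_SESSION_OPEN and the caps
// and snaprealms reported by the client are reestablished.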
9f95a23c 1397void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
7c673cae 1398{
11fdf7f2
TL
1399 dout(7) << "handle_client_reconnect " << m->get_source()
1400 << (m->has_more() ? " (more)" : "") << dendl;
7c673cae 1401 client_t from = m->get_source().num();
94b18763 1402 Session *session = mds->get_session(m);
92f5a8d4
TL
1403 if (!session) {
1404 dout(0) << " ignoring sessionless msg " << *m << dendl;
9f95a23c 1405 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
92f5a8d4
TL
1406 reply->metadata["error_string"] = "sessionless";
1407 mds->send_message(reply, m->get_connection());
81eedcae 1408 return;
92f5a8d4
TL
1409 }
1410
1411 if (!session->is_open()) {
1412 dout(0) << " ignoring msg from not-open session" << *m << dendl;
9f95a23c 1413 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
92f5a8d4
TL
1414 mds->send_message(reply, m->get_connection());
1415 return;
1416 }
7c673cae 1417
1418 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1419
1420 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1421 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1422 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1423 return;
1424 }
1425
f64942e4 1426 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
1427 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1428
1429 bool deny = false;
f67539c2 1430 if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
7c673cae 1431 // XXX maybe in the future we can do better than this?
1432 if (reconnect_all_deny) {
1433 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
1434 } else {
1435 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1436 }
1437 mds->clog->info() << "denied reconnect attempt (mds is "
1438 << ceph_mds_state_name(mds->get_state())
1439 << ") from " << m->get_source_inst()
11fdf7f2 1440 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
7c673cae 1441 deny = true;
1442 } else {
1443 std::string error_str;
1444 if (!session->is_open()) {
1445 error_str = "session is closed";
1446 } else if (mdcache->is_readonly()) {
1447 error_str = "mds is readonly";
1448 } else {
1449 if (session->info.client_metadata.features.empty())
1450 infer_supported_features(session, session->info.client_metadata);
1451
1452 feature_bitset_t missing_features = required_client_features;
1453 missing_features -= session->info.client_metadata.features;
1454 if (!missing_features.empty()) {
1455 CachedStackStringStream css;
1456 *css << "missing required features '" << missing_features << "'";
1457 error_str = css->strv();
1458 }
1459 }
1460
1461 if (!error_str.empty()) {
1462 deny = true;
1463 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1464 mds->clog->info() << "denied reconnect attempt from "
1465 << m->get_source_inst() << " (" << error_str << ")";
1466 }
1467 }
1468
1469 if (deny) {
9f95a23c 1470 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
11fdf7f2 1471 mds->send_message_client(r, session);
1472 if (session->is_open()) {
1473 client_reconnect_denied.insert(session->get_client());
1474 }
1475 return;
1476 }
1477
11fdf7f2 1478 if (!m->has_more()) {
f67539c2 1479 metrics_handler->add_session(session);
11fdf7f2 1480 // notify client of success with an OPEN
9f95a23c 1481 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1482 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
1483 reply->supported_features = supported_features;
1484 mds->send_message_client(reply, session);
1485 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1486 }
1487
91327a77 1488 session->last_cap_renew = clock::now();
1489
1490 // snaprealms
1491 for (const auto &r : m->realms) {
1492 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
1493 if (in && in->state_test(CInode::STATE_PURGING))
1494 continue;
1495 if (in) {
1496 if (in->snaprealm) {
1497 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
7c673cae 1498 } else {
 1499 // this can happen if we are non-auth or we rolled back the snaprealm
1500 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
7c673cae 1501 }
11fdf7f2 1502 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
7c673cae 1503 } else {
1504 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1505 << " seq " << r.realm.seq << dendl;
1506 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1507 }
1508 }
1509
1510 // caps
11fdf7f2 1511 for (const auto &p : m->caps) {
7c673cae 1512 // make sure our last_cap_id is MAX over all issued caps
1513 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1514 mdcache->last_cap_id = p.second.capinfo.cap_id;
7c673cae 1515
11fdf7f2 1516 CInode *in = mdcache->get_inode(p.first);
1517 if (in && in->state_test(CInode::STATE_PURGING))
1518 continue;
1519 if (in && in->is_auth()) {
1520 // we recovered it, and it's ours. take note.
11fdf7f2 1521 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
7c673cae 1522 << " on " << *in << dendl;
1523 in->reconnect_cap(from, p.second, session);
1524 mdcache->add_reconnected_cap(from, p.first, p.second);
1525 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
1526 continue;
1527 }
1528
1529 if (in && !in->is_auth()) {
1530 // not mine.
1531 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1532 // add to cap export list.
1533 mdcache->rejoin_export_caps(p.first, from, p.second,
1534 in->authority().first, true);
1535 } else {
1536 // don't know if the inode is mine
1537 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1538 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
1539 }
1540 }
1541
1542 reconnect_last_seen = clock::now();
1543
1544 if (!m->has_more()) {
1545 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1546
1547 // remove from gather set
1548 client_reconnect_gather.erase(from);
92f5a8d4 1549 session->set_reconnecting(false);
1550 if (client_reconnect_gather.empty())
1551 reconnect_gather_finish();
1552 }
1553}
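
// Note on the handler above: MClientReconnect messages may arrive in several
// parts. Only the final part (has_more() == false) registers the session with
// the metrics handler, sends CEPH_SESSION_OPEN, clears the reconnecting flag,
// and may trigger reconnect_gather_finish() once every client has reported in.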
1554
1555void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1556{
1557 int supported = -1;
1558 auto it = client_metadata.find("ceph_version");
1559 if (it != client_metadata.end()) {
1560 // user space client
1561 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1562 supported = CEPHFS_FEATURE_LUMINOUS;
1563 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1564 supported = CEPHFS_FEATURE_KRAKEN;
1565 } else {
1566 it = client_metadata.find("kernel_version");
1567 if (it != client_metadata.end()) {
1568 // kernel client
1569 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1570 supported = CEPHFS_FEATURE_LUMINOUS;
1571 }
1572 }
1573 if (supported == -1 &&
1574 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1575 supported = CEPHFS_FEATURE_JEWEL;
7c673cae 1576
1577 if (supported >= 0) {
1578 unsigned long value = (1UL << (supported + 1)) - 1;
1579 client_metadata.features = feature_bitset_t(value);
1580 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1581 }
1582}
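
// A worked example of the mask arithmetic above (the bit value is hypothetical):
// if the highest inferred feature bit were 3, then (1UL << (3 + 1)) - 1 == 0xf,
// i.e. a feature_bitset_t with bits 0..3 set, so every feature up to and
// including the inferred one is advertised for the session.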
1583
1584void Server::update_required_client_features()
1585{
f67539c2 1586 required_client_features = mds->mdsmap->get_required_client_features();
1587 dout(7) << "required_client_features: " << required_client_features << dendl;
1588
1589 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1590 set<Session*> sessions;
1591 mds->sessionmap.get_client_session_set(sessions);
1592 for (auto session : sessions) {
1593 feature_bitset_t missing_features = required_client_features;
1594 missing_features -= session->info.client_metadata.features;
1595 if (!missing_features.empty()) {
f67539c2 1596 bool blocklisted = mds->objecter->with_osdmap(
11fdf7f2 1597 [session](const OSDMap &osd_map) -> bool {
f67539c2 1598 return osd_map.is_blocklisted(session->info.inst.addr);
11fdf7f2 1599 });
f67539c2 1600 if (blocklisted)
11fdf7f2 1601 continue;
7c673cae 1602
1603 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1604 << missing_features << "'";
f67539c2 1605 CachedStackStringStream css;
11fdf7f2 1606 mds->evict_client(session->get_client().v, false,
f67539c2 1607 g_conf()->mds_session_blocklist_on_evict, *css);
1608 }
1609 }
1610 }
1611}
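
// Sketch of the set difference above, with hypothetical bit values: if
// required_client_features is {0,1,5} and a session only advertises {0,1},
// missing_features comes out as {5} and the session is evicted, unless the
// client address is already blocklisted in the OSDMap.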
1612
1613void Server::reconnect_gather_finish()
1614{
1615 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
1616 ceph_assert(reconnect_done);
1617
1618 if (!mds->snapclient->is_synced()) {
1619 // make sure snaptable cache is populated. snaprealms will be
1620 // extensively used in rejoin stage.
1621 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1622 mds->snapclient->wait_for_sync(reconnect_done);
1623 } else {
1624 reconnect_done->complete(0);
1625 }
1626 reconnect_done = NULL;
1627}
1628
1629void Server::reconnect_tick()
1630{
f67539c2 1631 bool reject_all_reconnect = false;
31f18b77 1632 if (reconnect_evicting) {
f64942e4 1633 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
1634 return;
1635 }
1636
1637 /*
 1638 * Set mds_deny_all_reconnect to reject all reconnect requests,
 1639 * so that less metadata is loaded in the rejoin phase. This shortens reboot time.
 1640 * Moreover, loading less metadata increases the chance that a standby with less memory can fail over.
 1641
 1642 * Why not shorten the reconnect period instead?
 1643 * Clients may send unsafe or retried requests, which had not been
 1644 * completed before the old mds stopped, to the new mds. These requests may
 1645 * need to be processed during the new mds's clientreplay phase,
 1646 * see: https://github.com/ceph/ceph/pull/29059.
1647 */
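// For operators: mds_deny_all_reconnect is an ordinary MDS config option, so a
// minimal (hypothetical) way to enable it before a planned failover would be
// `ceph config set mds mds_deny_all_reconnect true`, reverting it once the
// rank is active again.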
1648 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1649 if (client_reconnect_gather.empty())
1650 return;
31f18b77 1651
1652 if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
1653 reject_all_reconnect = true;
1654
1655 auto now = clock::now();
1656 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
f67539c2 1657 if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
f64942e4 1658 return;
31f18b77 1659
1660 vector<Session*> remaining_sessions;
1661 remaining_sessions.reserve(client_reconnect_gather.size());
1662 for (auto c : client_reconnect_gather) {
1663 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1664 ceph_assert(session);
1665 remaining_sessions.push_back(session);
1666 // client re-sends cap flush messages before the reconnect message
1667 if (session->last_seen > reconnect_last_seen)
1668 reconnect_last_seen = session->last_seen;
1669 }
31f18b77 1670
f64942e4 1671 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
f67539c2 1672 if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
1673 dout(7) << "reconnect_tick: last seen " << elapse2
1674 << " seconds ago, extending reconnect interval" << dendl;
1675 return;
1676 }
1677
1678 dout(7) << "reconnect timed out, " << remaining_sessions.size()
f67539c2 1679 << " clients have not reconnected in time" << dendl;
f64942e4 1680
f67539c2 1681 // If we're doing blocklist evictions, use this to wait for them before
1682 // proceeding to reconnect_gather_finish
1683 MDSGatherBuilder gather(g_ceph_context);
1684
1685 for (auto session : remaining_sessions) {
 1686 // Keep sessions that have specified a timeout. These sessions will prevent
 1687 // the mds from going active. The MDS goes active only after they have all
 1688 // been killed or reclaimed.
1689 if (session->info.client_metadata.find("timeout") !=
1690 session->info.client_metadata.end()) {
1691 dout(1) << "reconnect keeps " << session->info.inst
1692 << ", need to be reclaimed" << dendl;
1693 client_reclaim_gather.insert(session->get_client());
1694 continue;
1695 }
1696
f64942e4 1697 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
31f18b77 1698
1699 mds->clog->warn() << "evicting unresponsive client " << *session
1700 << ", after waiting " << elapse1
1701 << " seconds during MDS startup";
1702
1703 // make _session_logged() purge orphan objects of lost async/unsafe requests
1704 session->delegated_inos.swap(session->free_prealloc_inos);
1705
1706 if (g_conf()->mds_session_blocklist_on_timeout) {
1707 CachedStackStringStream css;
1708 mds->evict_client(session->get_client().v, false, true, *css,
f64942e4 1709 gather.new_sub());
31f18b77 1710 } else {
f67539c2 1711 kill_session(session, NULL);
31f18b77 1712 }
1713
1714 failed_reconnects++;
1715 }
1716 client_reconnect_gather.clear();
f67539c2 1717 client_reconnect_denied.clear();
1718
1719 if (gather.has_subs()) {
1720 dout(1) << "reconnect will complete once clients are evicted" << dendl;
9f95a23c 1721 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
1722 [this](int r){reconnect_gather_finish();})));
1723 gather.activate();
1724 reconnect_evicting = true;
1725 } else {
1726 reconnect_gather_finish();
1727 }
1728}
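
// Timeline sketch for the two checks above, assuming the default
// mds_reconnect_timeout of 45s (an assumption): the tick gives up 45s after
// reconnect_start, but keeps extending the window while some straggler was
// last seen under 22.5s (timeout/2) ago; reject_all_reconnect short-circuits
// both waits once every remaining client has already been denied.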
1729
1730void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1731{
1732 if (!locks.length()) return;
1733 int numlocks;
1734 ceph_filelock lock;
1735 auto p = locks.cbegin();
1736 decode(numlocks, p);
7c673cae 1737 for (int i = 0; i < numlocks; ++i) {
11fdf7f2 1738 decode(lock, p);
1739 lock.client = client;
1740 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1741 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1742 }
11fdf7f2 1743 decode(numlocks, p);
7c673cae 1744 for (int i = 0; i < numlocks; ++i) {
11fdf7f2 1745 decode(lock, p);
1746 lock.client = client;
1747 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1748 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1749 }
1750}
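
// The bufferlist decoded above is assumed to be laid out as two back-to-back
// sections, each "count, then `count` ceph_filelock records": fcntl (POSIX)
// locks first, then flock locks, i.e. [n][lock x n][m][lock x m]. Each lock's
// client field is overwritten with the reconnecting client's id.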
1751
1752/**
1753 * Call this when the MDCache is oversized, to send requests to the clients
1754 * to trim some caps, and consequently unpin some inodes in the MDCache so
1755 * that it can trim too.
1756 */
1757std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1758{
1759 const auto now = clock::now();
1760 const bool steady = !!(flags&RecallFlags::STEADY);
1761 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1762 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1763 const bool trim = !!(flags&RecallFlags::TRIM);
a8e16298 1764
1765 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1766 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1767 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1768 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1769 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
92f5a8d4 1770 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
1771
1772 dout(7) << __func__ << ":"
1773 << " min=" << min_caps_per_client
1774 << " max=" << max_caps_per_client
1775 << " total=" << Capability::count()
92f5a8d4 1776 << " flags=" << flags
a8e16298 1777 << dendl;
f64942e4 1778
1779 /* trim caps of sessions with the most caps first */
1780 std::multimap<uint64_t, Session*> caps_session;
92f5a8d4 1781 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
a8e16298 1782 auto num_caps = s->caps.size();
1783 auto cache_liveness = s->get_session_cache_liveness();
1784 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
1785 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1786 }
1787 };
1788 mds->sessionmap.get_client_sessions(std::move(f));
1789
1790 std::pair<bool, uint64_t> result = {false, 0};
11fdf7f2 1791 auto& [throttled, caps_recalled] = result;
a8e16298 1792 last_recall_state = now;
11fdf7f2 1793 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
7c673cae 1794 if (!session->is_open() ||
11fdf7f2 1795 !session->get_connection() ||
1796 !session->info.inst.name.is_client())
1797 continue;
1798
1799 dout(10) << __func__ << ":"
1800 << " session " << session->info.inst
1801 << " caps " << num_caps
1802 << ", leases " << session->leases.size()
1803 << dendl;
1804
1805 uint64_t newlim;
1806 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1807 newlim = min_caps_per_client;
1808 } else {
1809 newlim = num_caps-recall_max_caps;
1810 }
1811 if (num_caps > newlim) {
1812 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1813 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1814 newlim = num_caps-recall;
1815 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1816 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1817 const uint64_t global_recall_throttle = recall_throttle.get();
1818 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1819 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1820 throttled = true;
1821 continue;
1822 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1823 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1824 throttled = true;
1825 continue;
1826 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1827 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1828 throttled = true;
1829 break;
1830 }
1831
1832 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1833 if (steady) {
1834 const auto session_recall = session->get_recall_caps();
1835 const auto session_release = session->get_release_caps();
1836 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1837 /* The session has been unable to keep up with the number of caps
1838 * recalled (by half); additionally, to prevent marking sessions
1839 * we've just begun to recall from, the session_recall counter
1840 * (decayed count of caps recently recalled) is **greater** than the
1841 * session threshold for the session's cap recall throttle.
1842 */
 1843 dout(15) << " 2*session_release < session_recall"
 1844 " (2*" << session_release << " < " << session_recall << ") &&"
 1845 " 2*session_recall > recall_max_decay_threshold"
 1846 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
 1847 " Skipping because we are unlikely to get more released." << dendl;
1848 continue;
1849 } else if (recall < recall_max_caps && 2*recall < session_recall) {
 1850 /* The number of caps we would recall is less than the number we *could*
 1851 * recall (so there isn't much left to recall?) and it is less than half
 1852 * the session's recall_caps counter (decayed count of caps recently
 1853 * recalled), so the client is likely still working through the last recall.
1854 */
1855 dout(15) << " 2*recall < session_recall "
1856 " (2*" << recall << " < " << session_recall << ") &&"
1857 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1858 " Skipping because we are unlikely to get more released." << dendl;
1859 continue;
1860 }
1861 }
1862
1863 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1864
9f95a23c 1865 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
1866 m->head.max_caps = newlim;
1867 mds->send_message_client(m, session);
a8e16298 1868 if (gather) {
f91f0fd5 1869 flush_session(session, *gather);
f64942e4 1870 }
a8e16298 1871 caps_recalled += session->notify_recall_sent(newlim);
11fdf7f2 1872 recall_throttle.hit(recall);
1873 }
1874 }
1875
1876 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1877
1878 return result;
1879}
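
// A worked example of the limit arithmetic above (the config values are
// assumptions, not guaranteed defaults): with mds_recall_max_caps=30000 and
// mds_min_caps_per_client=100, a session holding 50000 caps gets
// newlim = 50000 - 30000 = 20000, so a single pass asks it to release up to
// 30000 caps, subject to the per-session and global recall throttles.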
1880
1881void Server::force_clients_readonly()
1882{
1883 dout(10) << "force_clients_readonly" << dendl;
1884 set<Session*> sessions;
1885 mds->sessionmap.get_client_session_set(sessions);
1886 for (set<Session*>::const_iterator p = sessions.begin();
1887 p != sessions.end();
1888 ++p) {
1889 Session *session = *p;
1890 if (!session->info.inst.name.is_client() ||
1891 !(session->is_open() || session->is_stale()))
1892 continue;
9f95a23c 1893 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1894 }
1895}
1896
1897/*******
1898 * some generic stuff for finishing off requests
1899 */
1900void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1901{
1902 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
11fdf7f2 1903 ceph_assert(!mdr->has_completed);
1904
1905 // note trace items for eventual reply.
1906 mdr->tracei = in;
1907 if (in)
1908 mdr->pin(in);
1909
1910 mdr->tracedn = dn;
1911 if (dn)
1912 mdr->pin(dn);
1913
1914 early_reply(mdr, in, dn);
1915
1916 mdr->committing = true;
1917 submit_mdlog_entry(le, fin, mdr, __func__);
1918
1919 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1920 if (mds->queue_one_replay()) {
1921 dout(10) << " queued next replay op" << dendl;
1922 } else {
11fdf7f2 1923 dout(10) << " journaled last replay op" << dendl;
1924 }
1925 } else if (mdr->did_early_reply)
b32b8144 1926 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1927 else
1928 mdlog->flush();
1929}
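
// Rough lifecycle sketch for a mutating request as driven by the function
// above: an unsafe early reply may go out first, the journal event (e.g. an
// EUpdate) is then submitted to the MDLog, and the safe reply is issued from
// the log-commit callback `fin`; replayed ops instead pace the replay queue
// via queue_one_replay().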
1930
1931void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
11fdf7f2 1932 std::string_view event)
1933{
1934 if (mdr) {
1935 string event_str("submit entry: ");
1936 event_str += event;
11fdf7f2 1937 mdr->mark_event(event_str);
1938 }
1939 mdlog->submit_entry(le, fin);
1940}
1941
1942/*
1943 * send response built from mdr contents and error code; clean up mdr
1944 */
1945void Server::respond_to_request(MDRequestRef& mdr, int r)
1946{
1947 if (mdr->client_request) {
1948 if (mdr->is_batch_head()) {
1949 dout(20) << __func__ << " batch head " << *mdr << dendl;
1950 mdr->release_batch_op()->respond(r);
1951 } else {
1952 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
1953 }
1954 } else if (mdr->internal_op > -1) {
1955 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1956 if (!mdr->internal_op_finish)
11fdf7f2 1957 ceph_abort_msg("trying to respond to internal op without finisher");
1958 mdr->internal_op_finish->complete(r);
1959 mdcache->request_finish(mdr);
1960 }
1961}
1962
91327a77 1963// statistics mds req op number and latency
9f95a23c 1964void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
1965{
1966 int code = l_mdss_first;
1967 switch(req->get_op()) {
1968 case CEPH_MDS_OP_LOOKUPHASH:
1969 code = l_mdss_req_lookuphash_latency;
1970 break;
1971 case CEPH_MDS_OP_LOOKUPINO:
1972 code = l_mdss_req_lookupino_latency;
1973 break;
1974 case CEPH_MDS_OP_LOOKUPPARENT:
1975 code = l_mdss_req_lookupparent_latency;
1976 break;
1977 case CEPH_MDS_OP_LOOKUPNAME:
1978 code = l_mdss_req_lookupname_latency;
1979 break;
1980 case CEPH_MDS_OP_LOOKUP:
1981 code = l_mdss_req_lookup_latency;
1982 break;
1983 case CEPH_MDS_OP_LOOKUPSNAP:
1984 code = l_mdss_req_lookupsnap_latency;
1985 break;
1986 case CEPH_MDS_OP_GETATTR:
1987 code = l_mdss_req_getattr_latency;
1988 break;
1989 case CEPH_MDS_OP_SETATTR:
1990 code = l_mdss_req_setattr_latency;
1991 break;
1992 case CEPH_MDS_OP_SETLAYOUT:
1993 code = l_mdss_req_setlayout_latency;
1994 break;
1995 case CEPH_MDS_OP_SETDIRLAYOUT:
1996 code = l_mdss_req_setdirlayout_latency;
1997 break;
1998 case CEPH_MDS_OP_SETXATTR:
1999 code = l_mdss_req_setxattr_latency;
2000 break;
2001 case CEPH_MDS_OP_RMXATTR:
2002 code = l_mdss_req_rmxattr_latency;
2003 break;
2004 case CEPH_MDS_OP_READDIR:
2005 code = l_mdss_req_readdir_latency;
2006 break;
2007 case CEPH_MDS_OP_SETFILELOCK:
2008 code = l_mdss_req_setfilelock_latency;
2009 break;
2010 case CEPH_MDS_OP_GETFILELOCK:
2011 code = l_mdss_req_getfilelock_latency;
2012 break;
2013 case CEPH_MDS_OP_CREATE:
2014 code = l_mdss_req_create_latency;
2015 break;
2016 case CEPH_MDS_OP_OPEN:
2017 code = l_mdss_req_open_latency;
2018 break;
2019 case CEPH_MDS_OP_MKNOD:
2020 code = l_mdss_req_mknod_latency;
2021 break;
2022 case CEPH_MDS_OP_LINK:
2023 code = l_mdss_req_link_latency;
2024 break;
2025 case CEPH_MDS_OP_UNLINK:
2026 code = l_mdss_req_unlink_latency;
2027 break;
2028 case CEPH_MDS_OP_RMDIR:
2029 code = l_mdss_req_rmdir_latency;
2030 break;
2031 case CEPH_MDS_OP_RENAME:
2032 code = l_mdss_req_rename_latency;
2033 break;
2034 case CEPH_MDS_OP_MKDIR:
2035 code = l_mdss_req_mkdir_latency;
2036 break;
2037 case CEPH_MDS_OP_SYMLINK:
2038 code = l_mdss_req_symlink_latency;
2039 break;
2040 case CEPH_MDS_OP_LSSNAP:
2041 code = l_mdss_req_lssnap_latency;
2042 break;
2043 case CEPH_MDS_OP_MKSNAP:
2044 code = l_mdss_req_mksnap_latency;
2045 break;
2046 case CEPH_MDS_OP_RMSNAP:
2047 code = l_mdss_req_rmsnap_latency;
2048 break;
2049 case CEPH_MDS_OP_RENAMESNAP:
2050 code = l_mdss_req_renamesnap_latency;
2051 break;
2052 default: ceph_abort();
2053 }
2054 logger->tinc(code, lat);
2055}
2056
2057void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
2058{
11fdf7f2 2059 if (!g_conf()->mds_early_reply)
2060 return;
2061
2062 if (mdr->no_early_reply) {
2063 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
2064 return;
2065 }
2066
2067 if (mdr->has_more() && mdr->more()->has_journaled_peers) {
2068 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
2069 return;
2070 }
2071
2072 if (mdr->alloc_ino) {
2073 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2074 return;
2075 }
2076
9f95a23c 2077 const cref_t<MClientRequest> &req = mdr->client_request;
2078 entity_inst_t client_inst = req->get_source_inst();
2079 if (client_inst.name.is_mds())
2080 return;
2081
2082 if (req->is_replay()) {
2083 dout(10) << " no early reply on replay op" << dendl;
2084 return;
2085 }
2086
2087
9f95a23c 2088 auto reply = make_message<MClientReply>(*req, 0);
2089 reply->set_unsafe();
2090
2091 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2092 //
2093 //_rename_finish() does not send dentry link/unlink message to replicas.
2094 // so do not set xlocks on dentries "done", the xlocks prevent dentries
2095 // that have projected linkages from getting new replica.
2096 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2097
2098 dout(10) << "early_reply " << reply->get_result()
2099 << " (" << cpp_strerror(reply->get_result())
2100 << ") " << *req << dendl;
2101
2102 if (tracei || tracedn) {
2103 if (tracei)
2104 mdr->cap_releases.erase(tracei->vino());
2105 if (tracedn)
2106 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2107
9f95a23c 2108 set_trace_dist(reply, tracei, tracedn, mdr);
2109 }
2110
2111 reply->set_extra_bl(mdr->reply_extra_bl);
11fdf7f2 2112 mds->send_message_client(reply, mdr->session);
2113
2114 mdr->did_early_reply = true;
2115
2116 mds->logger->inc(l_mds_reply);
2117 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2118 mds->logger->tinc(l_mds_reply_latency, lat);
2119 if (client_inst.name.is_client()) {
2120 mds->sessionmap.hit_session(mdr->session);
2121 }
2122 perf_gather_op_latency(req, lat);
2123 dout(20) << "lat " << lat << dendl;
2124
2125 mdr->mark_event("early_replied");
2126}
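
// Clients must treat the unsafe reply sent above as losable: if the MDS fails
// before the journal commit, the client is expected to re-send the request,
// which is then recognized via the session's completed-request tracking
// rather than re-executed blindly.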
2127
2128/*
2129 * send given reply
2130 * include a trace to tracei
2131 * Clean up mdr
2132 */
9f95a23c 2133void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
7c673cae 2134{
11fdf7f2 2135 ceph_assert(mdr.get());
9f95a23c 2136 const cref_t<MClientRequest> &req = mdr->client_request;
2137
2138 dout(7) << "reply_client_request " << reply->get_result()
2139 << " (" << cpp_strerror(reply->get_result())
2140 << ") " << *req << dendl;
2141
2142 mdr->mark_event("replying");
2143
2144 Session *session = mdr->session;
2145
2146 // note successful request in session map?
2147 //
 2148 // setfilelock requests are special: they only modify state in MDS memory.
 2149 // That state is lost when the MDS fails. If a client re-sends a completed
 2150 // setfilelock request, it means the client did not receive the corresponding
 2151 // setfilelock reply. So the MDS should re-execute the setfilelock request.
2152 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2153 reply->get_result() == 0 && session) {
2154 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2155 session->add_completed_request(mdr->reqid.tid, created);
2156 if (mdr->ls) {
2157 mdr->ls->touched_sessions.insert(session->info.inst.name);
2158 }
2159 }
2160
2161 // give any preallocated inos to the session
2162 apply_allocated_inos(mdr, session);
2163
2164 // get tracei/tracedn from mdr?
2165 CInode *tracei = mdr->tracei;
2166 CDentry *tracedn = mdr->tracedn;
2167
2168 bool is_replay = mdr->client_request->is_replay();
2169 bool did_early_reply = mdr->did_early_reply;
2170 entity_inst_t client_inst = req->get_source_inst();
2171
2172 if (!did_early_reply && !is_replay) {
2173
2174 mds->logger->inc(l_mds_reply);
2175 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2176 mds->logger->tinc(l_mds_reply_latency, lat);
81eedcae 2177 if (session && client_inst.name.is_client()) {
2178 mds->sessionmap.hit_session(session);
2179 }
2180 perf_gather_op_latency(req, lat);
2181 dout(20) << "lat " << lat << dendl;
2182
2183 if (tracei)
2184 mdr->cap_releases.erase(tracei->vino());
2185 if (tracedn)
2186 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2187 }
2188
2189 // drop non-rdlocks before replying, so that we can issue leases
2190 mdcache->request_drop_non_rdlocks(mdr);
2191
2192 // reply at all?
81eedcae 2193 if (session && !client_inst.name.is_mds()) {
2194 // send reply.
2195 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2196 (tracei || tracedn)) {
2197 if (is_replay) {
2198 if (tracei)
2199 mdcache->try_reconnect_cap(tracei, session);
2200 } else {
2201 // include metadata in reply
9f95a23c 2202 set_trace_dist(reply, tracei, tracedn, mdr);
2203 }
2204 }
2205
2206 // We can set the extra bl unconditionally: if it's already been sent in the
2207 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2208 reply->set_extra_bl(mdr->reply_extra_bl);
2209
2210 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
11fdf7f2 2211 mds->send_message_client(reply, session);
2212 }
2213
2214 if (req->is_queued_for_replay() &&
2215 (mdr->has_completed || reply->get_result() < 0)) {
2216 if (reply->get_result() < 0) {
2217 int r = reply->get_result();
2218 derr << "reply_client_request: failed to replay " << *req
2219 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2220 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2221 }
2222 mds->queue_one_replay();
2223 }
2224
2225 // clean up request
2226 mdcache->request_finish(mdr);
2227
2228 // take a closer look at tracei, if it happens to be a remote link
2229 if (tracei &&
2230 tracedn &&
2231 tracedn->get_projected_linkage()->is_remote()) {
2232 mdcache->eval_remote(tracedn);
2233 }
2234}
2235
2236/*
2237 * pass inode OR dentry (not both, or we may get confused)
2238 *
2239 * trace is in reverse order (i.e. root inode comes last)
2240 */
9f95a23c 2241void Server::set_trace_dist(const ref_t<MClientReply> &reply,
7c673cae 2242 CInode *in, CDentry *dn,
2243 MDRequestRef& mdr)
2244{
2245 // skip doing this for debugging purposes?
11fdf7f2 2246 if (g_conf()->mds_inject_traceless_reply_probability &&
7c673cae 2247 mdr->ls && !mdr->o_trunc &&
11fdf7f2 2248 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2249 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2250 return;
2251 }
2252
2253 // inode, dentry, dir, ..., inode
2254 bufferlist bl;
2255 mds_rank_t whoami = mds->get_nodeid();
2256 Session *session = mdr->session;
2257 snapid_t snapid = mdr->snapid;
2258 utime_t now = ceph_clock_now();
2259
2260 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2261
2262 // realm
2263 if (snapid == CEPH_NOSNAP) {
2264 SnapRealm *realm;
2265 if (in)
2266 realm = in->find_snaprealm();
2267 else
2268 realm = dn->get_dir()->get_inode()->find_snaprealm();
2269 reply->snapbl = realm->get_snap_trace();
2270 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2271 }
2272
2273 // dir + dentry?
2274 if (dn) {
2275 reply->head.is_dentry = 1;
2276 CDir *dir = dn->get_dir();
2277 CInode *diri = dir->get_inode();
2278
2279 diri->encode_inodestat(bl, session, NULL, snapid);
2280 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2281
2282#ifdef MDS_VERIFY_FRAGSTAT
2283 if (dir->is_complete())
2284 dir->verify_fragstat();
2285#endif
2286 DirStat ds;
2287 ds.frag = dir->get_frag();
2288 ds.auth = dir->get_dir_auth().first;
f91f0fd5 2289 if (dir->is_auth() && !forward_all_requests_to_auth)
2290 dir->get_dist_spec(ds.dist, whoami);
2291
2292 dir->encode_dirstat(bl, session->info, ds);
2293 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2294
11fdf7f2 2295 encode(dn->get_name(), bl);
2296
2297 int lease_mask = 0;
2298 CDentry::linkage_t *dnl = dn->get_linkage(mdr->get_client(), mdr);
2299 if (dnl->is_primary()) {
2300 ceph_assert(dnl->get_inode() == in);
2301 lease_mask = CEPH_LEASE_PRIMARY_LINK;
2302 } else {
2303 if (dnl->is_remote())
2304 ceph_assert(dnl->get_remote_ino() == in->ino());
2305 else
2306 ceph_assert(!in);
11fdf7f2 2307 }
9f95a23c 2308 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, bl);
2309 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
2310 } else
2311 reply->head.is_dentry = 0;
2312
2313 // inode
2314 if (in) {
2315 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2316 dout(20) << "set_trace_dist added in " << *in << dendl;
2317 reply->head.is_target = 1;
2318 } else
2319 reply->head.is_target = 0;
2320
2321 reply->set_trace(bl);
2322}
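
// Resulting reply layout from the function above (a sketch): the snap trace
// travels separately in reply->snapbl, while `bl` holds, in order, the parent
// dir inode stat, the dirstat, the dentry name plus lease, and finally the
// target inode stat; head.is_dentry/is_target tell the client which of those
// sections are present.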
2323
9f95a23c 2324void Server::handle_client_request(const cref_t<MClientRequest> &req)
2325{
2326 dout(4) << "handle_client_request " << *req << dendl;
2327
2328 if (mds->logger)
2329 mds->logger->inc(l_mds_request);
2330 if (logger)
2331 logger->inc(l_mdss_handle_client_request);
2332
2333 if (!mdcache->is_open()) {
2334 dout(5) << "waiting for root" << dendl;
2335 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2336 return;
2337 }
2338
92f5a8d4 2339 bool sessionclosed_isok = replay_unsafe_with_closed_session;
2340 // active session?
2341 Session *session = 0;
2342 if (req->get_source().is_client()) {
94b18763 2343 session = mds->get_session(req);
2344 if (!session) {
2345 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
92f5a8d4 2346 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
2347 session->is_closing() ||
2348 session->is_killing()) {
2349 dout(5) << "session closed|closing|killing, dropping" << dendl;
2350 session = NULL;
2351 }
2352 if (!session) {
2353 if (req->is_queued_for_replay())
2354 mds->queue_one_replay();
2355 return;
2356 }
2357 }
2358
2359 // old mdsmap?
2360 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2361 // send it? hrm, this isn't ideal; they may get a lot of copies if
2362 // they have a high request rate.
2363 }
2364
2365 // completed request?
2366 bool has_completed = false;
2367 if (req->is_replay() || req->get_retry_attempt()) {
11fdf7f2 2368 ceph_assert(session);
2369 inodeno_t created;
2370 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2371 has_completed = true;
2372 if (!session->is_open())
2373 return;
2374 // Don't send traceless reply if the completed request has created
2375 // new inode. Treat the request as lookup request instead.
2376 if (req->is_replay() ||
2377 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2378 req->get_op() != CEPH_MDS_OP_OPEN &&
2379 req->get_op() != CEPH_MDS_OP_CREATE)) {
2380 dout(5) << "already completed " << req->get_reqid() << dendl;
9f95a23c 2381 auto reply = make_message<MClientReply>(*req, 0);
2382 if (created != inodeno_t()) {
2383 bufferlist extra;
11fdf7f2 2384 encode(created, extra);
2385 reply->set_extra_bl(extra);
2386 }
11fdf7f2 2387 mds->send_message_client(reply, session);
2388
2389 if (req->is_queued_for_replay())
2390 mds->queue_one_replay();
2391
2392 return;
2393 }
2394 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2395 req->get_op() != CEPH_MDS_OP_CREATE) {
2396 dout(10) << " completed request which created new inode " << created
2397 << ", convert it to lookup request" << dendl;
2398 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2399 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2400 }
2401 }
2402 }
2403
2404 // trim completed_request list
2405 if (req->get_oldest_client_tid() > 0) {
2406 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
11fdf7f2 2407 ceph_assert(session);
2408 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2409 // Sessions 'completed_requests' was dirtied, mark it to be
2410 // potentially flushed at segment expiry.
2411 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2412
2413 if (session->get_num_trim_requests_warnings() > 0 &&
11fdf7f2 2414 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2415 session->reset_num_trim_requests_warnings();
2416 } else {
2417 if (session->get_num_completed_requests() >=
11fdf7f2 2418 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
7c673cae 2419 session->inc_num_trim_requests_warnings();
2420 CachedStackStringStream css;
2421 *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2422 << req->get_oldest_client_tid() << "), "
2423 << session->get_num_completed_requests()
2424 << " completed requests recorded in session\n";
2425 mds->clog->warn() << css->strv();
2426 dout(20) << __func__ << " " << css->strv() << dendl;
2427 }
2428 }
2429 }
2430
2431 // register + dispatch
2432 MDRequestRef mdr = mdcache->request_start(req);
2433 if (!mdr.get())
2434 return;
2435
2436 if (session) {
2437 mdr->session = session;
2438 session->requests.push_back(&mdr->item_session_request);
2439 }
2440
2441 if (has_completed)
2442 mdr->has_completed = true;
2443
2444 // process embedded cap releases?
2445 // (only if NOT replay!)
2446 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2447 client_t client = req->get_source().num();
2448 for (const auto &r : req->releases) {
2449 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2450 }
2451 req->releases.clear();
2452 }
2453
2454 dispatch_client_request(mdr);
2455 return;
2456}
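
// Example of the warning backoff in the trimming logic above (the default of
// mds_max_completed_requests=100000 is an assumption): after one warning, a
// session must reach 100000 << 1 == 200000 un-trimmed completed requests
// before the next "does not advance its oldest_client_tid" warning is logged.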
2457
2458void Server::handle_osd_map()
2459{
2460 /* Note that we check the OSDMAP_FULL flag directly rather than
2461 * using osdmap_full_flag(), because we want to know "is the flag set"
2462 * rather than "does the flag apply to us?" */
2463 mds->objecter->with_osdmap([this](const OSDMap& o) {
b3b6e05e 2464 auto pi = o.get_pg_pool(mds->get_metadata_pool());
b32b8144 2465 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2466 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2467 << o.get_epoch() << dendl;
2468 });
2469}
2470
2471void Server::dispatch_client_request(MDRequestRef& mdr)
2472{
2473 // we shouldn't be waiting on anyone.
f67539c2 2474 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());
2475
2476 if (mdr->killed) {
2477 dout(10) << "request " << *mdr << " was killed" << dendl;
2478 //if the mdr is a "batch_op" and it has followers, pick a follower as
2479 //the new "head of the batch ops" and go on processing the new one.
2480 if (mdr->is_batch_head()) {
2481 int mask = mdr->client_request->head.args.getattr.mask;
2482 auto it = mdr->batch_op_map->find(mask);
2483 auto new_batch_head = it->second->find_new_head();
2484 if (!new_batch_head) {
2485 mdr->batch_op_map->erase(it);
2486 return;
2487 }
f91f0fd5 2488 mdr = std::move(new_batch_head);
2489 } else {
2490 return;
2491 }
2492 } else if (mdr->aborted) {
2493 mdr->aborted = false;
2494 mdcache->request_kill(mdr);
2495 return;
2496 }
2497
9f95a23c 2498 const cref_t<MClientRequest> &req = mdr->client_request;
2499
2500 if (logger) logger->inc(l_mdss_dispatch_client_request);
2501
2502 dout(7) << "dispatch_client_request " << *req << dendl;
2503
2504 if (req->may_write() && mdcache->is_readonly()) {
2505 dout(10) << " read-only FS" << dendl;
f67539c2 2506 respond_to_request(mdr, -CEPHFS_EROFS);
2507 return;
2508 }
2509 if (mdr->has_more() && mdr->more()->peer_error) {
2510 dout(10) << " got error from peers" << dendl;
2511 respond_to_request(mdr, mdr->more()->peer_error);
9f95a23c 2512 return;
2513 }
2514
2515 if (is_full) {
2516 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
2517 if (!cur) {
 2518 respond_to_request(mdr, -CEPHFS_EINVAL);
2519 return;
2520 }
 2521 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
 2522 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2524 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2525 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2526 req->get_op() == CEPH_MDS_OP_CREATE ||
2527 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2528 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2529 ((req->get_op() == CEPH_MDS_OP_LINK ||
2530 req->get_op() == CEPH_MDS_OP_RENAME) &&
f67539c2 2531 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
2532 ) {
2533
2534 if (check_access(mdr, cur, MAY_FULL)) {
2535 dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2536 } else {
2537 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2538 respond_to_request(mdr, -CEPHFS_ENOSPC);
2539 return;
2540 }
2541 } else {
2542 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2543 }
2544 }
2545
2546 switch (req->get_op()) {
2547 case CEPH_MDS_OP_LOOKUPHASH:
2548 case CEPH_MDS_OP_LOOKUPINO:
2549 handle_client_lookup_ino(mdr, false, false);
2550 break;
2551 case CEPH_MDS_OP_LOOKUPPARENT:
2552 handle_client_lookup_ino(mdr, true, false);
2553 break;
2554 case CEPH_MDS_OP_LOOKUPNAME:
2555 handle_client_lookup_ino(mdr, false, true);
2556 break;
2557
2558 // inodes ops.
2559 case CEPH_MDS_OP_LOOKUP:
2560 handle_client_getattr(mdr, true);
2561 break;
2562
2563 case CEPH_MDS_OP_LOOKUPSNAP:
2564 // lookupsnap does not reference a CDentry; treat it as a getattr
2565 case CEPH_MDS_OP_GETATTR:
2566 handle_client_getattr(mdr, false);
2567 break;
2568
2569 case CEPH_MDS_OP_SETATTR:
2570 handle_client_setattr(mdr);
2571 break;
2572 case CEPH_MDS_OP_SETLAYOUT:
2573 handle_client_setlayout(mdr);
2574 break;
2575 case CEPH_MDS_OP_SETDIRLAYOUT:
2576 handle_client_setdirlayout(mdr);
2577 break;
2578 case CEPH_MDS_OP_SETXATTR:
2579 handle_client_setxattr(mdr);
2580 break;
2581 case CEPH_MDS_OP_RMXATTR:
2582 handle_client_removexattr(mdr);
2583 break;
2584
2585 case CEPH_MDS_OP_READDIR:
2586 handle_client_readdir(mdr);
2587 break;
2588
2589 case CEPH_MDS_OP_SETFILELOCK:
2590 handle_client_file_setlock(mdr);
2591 break;
2592
2593 case CEPH_MDS_OP_GETFILELOCK:
2594 handle_client_file_readlock(mdr);
2595 break;
2596
2597 // funky.
2598 case CEPH_MDS_OP_CREATE:
2599 if (mdr->has_completed)
2600 handle_client_open(mdr); // already created.. just open
2601 else
2602 handle_client_openc(mdr);
2603 break;
2604
2605 case CEPH_MDS_OP_OPEN:
2606 handle_client_open(mdr);
2607 break;
2608
2609 // namespace.
2610 // no prior locks.
2611 case CEPH_MDS_OP_MKNOD:
2612 handle_client_mknod(mdr);
2613 break;
2614 case CEPH_MDS_OP_LINK:
2615 handle_client_link(mdr);
2616 break;
2617 case CEPH_MDS_OP_UNLINK:
2618 case CEPH_MDS_OP_RMDIR:
2619 handle_client_unlink(mdr);
2620 break;
2621 case CEPH_MDS_OP_RENAME:
2622 handle_client_rename(mdr);
2623 break;
2624 case CEPH_MDS_OP_MKDIR:
2625 handle_client_mkdir(mdr);
2626 break;
2627 case CEPH_MDS_OP_SYMLINK:
2628 handle_client_symlink(mdr);
2629 break;
2630
2631
2632 // snaps
2633 case CEPH_MDS_OP_LSSNAP:
2634 handle_client_lssnap(mdr);
2635 break;
2636 case CEPH_MDS_OP_MKSNAP:
2637 handle_client_mksnap(mdr);
2638 break;
2639 case CEPH_MDS_OP_RMSNAP:
2640 handle_client_rmsnap(mdr);
2641 break;
2642 case CEPH_MDS_OP_RENAMESNAP:
2643 handle_client_renamesnap(mdr);
2644 break;
2645
2646 default:
2647 dout(1) << " unknown client op " << req->get_op() << dendl;
f67539c2 2648 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
2649 }
2650}
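
// Note on the is_full gating above: when the metadata pool is flagged full,
// the listed ops (setxattr/rmxattr, create, symlink, mksnap, layout changes,
// and link/rename before any peer MDS is involved) are allowed only for
// clients whose caps grant MAY_FULL and otherwise get CEPHFS_ENOSPC, while
// the remaining ops are still permitted.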
2651
2652
2653// ---------------------------------------
f67539c2 2654// PEER REQUESTS
7c673cae 2655
f67539c2 2656void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
7c673cae 2657{
f67539c2 2658 dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2659 mds_rank_t from = mds_rank_t(m->get_source().num());
2660
f67539c2 2661 if (logger) logger->inc(l_mdss_handle_peer_request);
2662
2663 // reply?
2664 if (m->is_reply())
f67539c2 2665 return handle_peer_request_reply(m);
2666
 2667 // the purpose of rename notify is enforcing causal message ordering: making sure
 2668 // bystanders have received all messages from the rename srcdn's auth MDS.
2669 if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
2670 auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
7c673cae 2671 mds->send_message(reply, m->get_connection());
2672 return;
2673 }
2674
2675 CDentry *straydn = NULL;
11fdf7f2 2676 if (m->straybl.length() > 0) {
9f95a23c 2677 mdcache->decode_replica_stray(straydn, m->straybl, from);
2678 ceph_assert(straydn);
2679 m->straybl.clear();
2680 }
2681
2682 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2683 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2684 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2685 return;
2686 }
2687
f67539c2 2688 // am i a new peer?
2689 MDRequestRef mdr;
2690 if (mdcache->have_request(m->get_reqid())) {
2691 // existing?
2692 mdr = mdcache->request_get(m->get_reqid());
2693
2694 // is my request newer?
2695 if (mdr->attempt > m->get_attempt()) {
2696 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2697 << ", dropping " << *m << dendl;
2698 return;
2699 }
2700
2701 if (mdr->attempt < m->get_attempt()) {
2702 // mine is old, close it out
2703 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2704 << ", closing out" << dendl;
2705 mdcache->request_finish(mdr);
2706 mdr.reset();
2707 } else if (mdr->peer_to_mds != from) {
2708 dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
2709 return;
2710 }
2711
2712 // may get these while mdr->peer_request is non-null
2713 if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
2714 mds->locker->drop_locks(mdr.get());
2715 return;
2716 }
f67539c2 2717 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2718 if (m->is_abort()) {
2719 mdr->aborted = true;
f67539c2 2720 if (mdr->peer_request) {
9f95a23c 2721 // only abort on-going xlock, wrlock and auth pin
f67539c2 2722 ceph_assert(!mdr->peer_did_prepare());
2723 } else {
2724 mdcache->request_finish(mdr);
2725 }
7c673cae 2726 } else {
2727 if (m->inode_export.length() > 0)
2728 mdr->more()->inode_import = m->inode_export;
2729 // finish off request.
2730 mdcache->request_finish(mdr);
2731 }
2732 return;
2733 }
2734 }
2735 if (!mdr.get()) {
2736 // new?
2737 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2738 dout(10) << "missing peer request for " << m->get_reqid()
7c673cae 2739 << " OP_FINISH, must have lost race with a forward" << dendl;
2740 return;
2741 }
f67539c2 2742 mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
2743 mdr->set_op_stamp(m->op_stamp);
2744 }
f67539c2 2745 ceph_assert(mdr->peer_request == 0); // only one at a time, please!
2746
2747 if (straydn) {
2748 mdr->pin(straydn);
2749 mdr->straydn = straydn;
2750 }
2751
2752 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2753 mdr->locks.empty()) {
2754 dout(3) << "not active yet, waiting" << dendl;
2755 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2756 return;
2757 }
2758
f67539c2 2759 mdr->reset_peer_request(m);
7c673cae 2760
f67539c2 2761 dispatch_peer_request(mdr);
2762}
2763
f67539c2 2764void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
2765{
2766 mds_rank_t from = mds_rank_t(m->get_source().num());
2767
2768 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2769 metareqid_t r = m->get_reqid();
2770 if (!mdcache->have_uncommitted_leader(r, from)) {
2771 dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
7c673cae 2772 << from << " reqid " << r << dendl;
2773 return;
2774 }
2775 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2776 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2777 return;
2778 }
2779
f67539c2 2780 if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
7c673cae 2781 metareqid_t r = m->get_reqid();
f67539c2 2782 mdcache->committed_leader_peer(r, from);
2783 return;
2784 }
2785
2786 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2787 if (m->get_attempt() != mdr->attempt) {
f67539c2 2788 dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
7c673cae 2789 << m->get_attempt() << dendl;
2790 return;
2791 }
2792
2793 switch (m->get_op()) {
f67539c2 2794 case MMDSPeerRequest::OP_XLOCKACK:
7c673cae 2795 {
f67539c2 2796 // identify lock, leader request
2797 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2798 m->get_object_info());
f67539c2 2799 mdr->more()->peers.insert(from);
11fdf7f2 2800 lock->decode_locked_state(m->get_lock_data());
7c673cae 2801 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
9f95a23c 2802 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
2803 mdr->finish_locking(lock);
2804 lock->get_xlock(mdr, mdr->get_client());
2805
2806 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2807 mdr->more()->waiting_on_peer.erase(from);
2808 ceph_assert(mdr->more()->waiting_on_peer.empty());
2809 mdcache->dispatch_request(mdr);
2810 }
2811 break;
2812
f67539c2 2813 case MMDSPeerRequest::OP_WRLOCKACK:
7c673cae 2814 {
f67539c2 2815 // identify lock, leader request
2816 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2817 m->get_object_info());
f67539c2 2818 mdr->more()->peers.insert(from);
7c673cae 2819 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
9f95a23c 2820 auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2821 ceph_assert(it->is_remote_wrlock());
2822 ceph_assert(it->wrlock_target == from);
2823
2824 mdr->finish_locking(lock);
2825
2826 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2827 mdr->more()->waiting_on_peer.erase(from);
2828 ceph_assert(mdr->more()->waiting_on_peer.empty());
2829 mdcache->dispatch_request(mdr);
2830 }
2831 break;
2832
2833 case MMDSPeerRequest::OP_AUTHPINACK:
2834 handle_peer_auth_pin_ack(mdr, m);
2835 break;
2836
2837 case MMDSPeerRequest::OP_LINKPREPACK:
2838 handle_peer_link_prep_ack(mdr, m);
2839 break;
2840
2841 case MMDSPeerRequest::OP_RMDIRPREPACK:
2842 handle_peer_rmdir_prep_ack(mdr, m);
2843 break;
2844
2845 case MMDSPeerRequest::OP_RENAMEPREPACK:
2846 handle_peer_rename_prep_ack(mdr, m);
2847 break;
2848
2849 case MMDSPeerRequest::OP_RENAMENOTIFYACK:
2850 handle_peer_rename_notify_ack(mdr, m);
2851 break;
2852
2853 default:
2854 ceph_abort();
2855 }
2856}
2857
f67539c2 2858void Server::dispatch_peer_request(MDRequestRef& mdr)
7c673cae 2859{
f67539c2 2860 dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;
2861
2862 if (mdr->aborted) {
2863 dout(7) << " abort flag set, finishing" << dendl;
2864 mdcache->request_finish(mdr);
2865 return;
2866 }
2867
f67539c2 2868 if (logger) logger->inc(l_mdss_dispatch_peer_request);
7c673cae 2869
f67539c2 2870 int op = mdr->peer_request->get_op();
7c673cae 2871 switch (op) {
2872 case MMDSPeerRequest::OP_XLOCK:
2873 case MMDSPeerRequest::OP_WRLOCK:
2874 {
2875 // identify object
2876 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2877 mdr->peer_request->get_object_info());
2878
2879 if (!lock) {
2880 dout(10) << "don't have object, dropping" << dendl;
 2881 ceph_abort(); // can this happen if we auth pinned properly?
2882 }
f67539c2 2883 if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2884 dout(10) << "not auth for remote xlock attempt, dropping on "
2885 << *lock << " on " << *lock->get_parent() << dendl;
2886 } else {
2887 // use acquire_locks so that we get auth_pinning.
2888 MutationImpl::LockOpVec lov;
2889 for (const auto& p : mdr->locks) {
2890 if (p.is_xlock())
2891 lov.add_xlock(p.lock);
2892 else if (p.is_wrlock())
2893 lov.add_wrlock(p.lock);
2894 }
2895
2896 int replycode = 0;
2897 switch (op) {
f67539c2 2898 case MMDSPeerRequest::OP_XLOCK:
11fdf7f2 2899 lov.add_xlock(lock);
f67539c2 2900 replycode = MMDSPeerRequest::OP_XLOCKACK;
7c673cae 2901 break;
f67539c2 2902 case MMDSPeerRequest::OP_WRLOCK:
11fdf7f2 2903 lov.add_wrlock(lock);
f67539c2 2904 replycode = MMDSPeerRequest::OP_WRLOCKACK;
2905 break;
2906 }
2907
11fdf7f2 2908 if (!mds->locker->acquire_locks(mdr, lov))
2909 return;
2910
2911 // ack
f67539c2 2912 auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
2913 r->set_lock_type(lock->get_type());
2914 lock->get_parent()->set_object_info(r->get_object_info());
f67539c2 2915 if (replycode == MMDSPeerRequest::OP_XLOCKACK)
11fdf7f2 2916 lock->encode_locked_state(r->get_lock_data());
f67539c2 2917 mds->send_message(r, mdr->peer_request->get_connection());
7c673cae
FG
2918 }
2919
2920 // done.
f67539c2 2921 mdr->reset_peer_request();
7c673cae
FG
2922 }
2923 break;
2924
f67539c2
TL
2925 case MMDSPeerRequest::OP_UNXLOCK:
2926 case MMDSPeerRequest::OP_UNWRLOCK:
7c673cae 2927 {
f67539c2
TL
2928 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2929 mdr->peer_request->get_object_info());
11fdf7f2
TL
2930 ceph_assert(lock);
2931 auto it = mdr->locks.find(lock);
2932 ceph_assert(it != mdr->locks.end());
7c673cae
FG
2933 bool need_issue = false;
2934 switch (op) {
f67539c2 2935 case MMDSPeerRequest::OP_UNXLOCK:
11fdf7f2 2936 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
7c673cae 2937 break;
f67539c2 2938 case MMDSPeerRequest::OP_UNWRLOCK:
11fdf7f2 2939 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
7c673cae
FG
2940 break;
2941 }
2942 if (need_issue)
2943 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2944
2945 // done. no ack necessary.
f67539c2 2946 mdr->reset_peer_request();
7c673cae
FG
2947 }
2948 break;
2949
f67539c2
TL
2950 case MMDSPeerRequest::OP_AUTHPIN:
2951 handle_peer_auth_pin(mdr);
7c673cae
FG
2952 break;
2953
f67539c2
TL
2954 case MMDSPeerRequest::OP_LINKPREP:
2955 case MMDSPeerRequest::OP_UNLINKPREP:
2956 handle_peer_link_prep(mdr);
7c673cae
FG
2957 break;
2958
f67539c2
TL
2959 case MMDSPeerRequest::OP_RMDIRPREP:
2960 handle_peer_rmdir_prep(mdr);
7c673cae
FG
2961 break;
2962
f67539c2
TL
2963 case MMDSPeerRequest::OP_RENAMEPREP:
2964 handle_peer_rename_prep(mdr);
7c673cae
FG
2965 break;
2966
7c673cae
FG
2967 default:
2968 ceph_abort();
2969 }
2970}
2971
f67539c2 2972void Server::handle_peer_auth_pin(MDRequestRef& mdr)
7c673cae 2973{
f67539c2 2974 dout(10) << "handle_peer_auth_pin " << *mdr << dendl;
7c673cae
FG
2975
2976 // build list of objects
2977 list<MDSCacheObject*> objects;
2978 CInode *auth_pin_freeze = NULL;
f67539c2 2979 bool nonblocking = mdr->peer_request->is_nonblocking();
7c673cae 2980 bool fail = false, wouldblock = false, readonly = false;
f67539c2 2981 ref_t<MMDSPeerRequest> reply;
7c673cae
FG
2982
2983 if (mdcache->is_readonly()) {
2984 dout(10) << " read-only FS" << dendl;
2985 readonly = true;
2986 fail = true;
2987 }
2988
2989 if (!fail) {
f67539c2 2990 for (const auto &oi : mdr->peer_request->get_authpins()) {
11fdf7f2 2991 MDSCacheObject *object = mdcache->get_object(oi);
7c673cae 2992 if (!object) {
11fdf7f2 2993 dout(10) << " don't have " << oi << dendl;
7c673cae
FG
2994 fail = true;
2995 break;
2996 }
2997
2998 objects.push_back(object);
f67539c2 2999 if (oi == mdr->peer_request->get_authpin_freeze())
7c673cae
FG
3000 auth_pin_freeze = static_cast<CInode*>(object);
3001 }
3002 }
3003
3004 // can we auth pin them?
3005 if (!fail) {
9f95a23c
TL
3006 for (const auto& obj : objects) {
3007 if (!obj->is_auth()) {
3008 dout(10) << " not auth for " << *obj << dendl;
7c673cae
FG
3009 fail = true;
3010 break;
3011 }
9f95a23c 3012 if (mdr->is_auth_pinned(obj))
7c673cae 3013 continue;
9f95a23c
TL
3014 if (!mdr->can_auth_pin(obj)) {
3015 if (nonblocking) {
3016 dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
7c673cae
FG
3017 fail = true;
3018 wouldblock = true;
3019 break;
3020 }
3021 // wait
9f95a23c
TL
3022 dout(10) << " waiting for authpinnable on " << *obj << dendl;
3023 obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
7c673cae
FG
3024 mdr->drop_local_auth_pins();
3025
9f95a23c
TL
3026 mds->locker->notify_freeze_waiter(obj);
3027 goto blocked;
7c673cae
FG
3028 }
3029 }
3030 }
3031
9f95a23c 3032 if (!fail) {
7c673cae
FG
3033 /* freeze authpin wrong inode */
3034 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
3035 mdr->more()->rename_inode != auth_pin_freeze)
3036 mdr->unfreeze_auth_pin(true);
3037
f67539c2 3038 /* handle_peer_rename_prep() calls freeze_inode() to wait for all other operations
7c673cae
FG
3039 * on the source inode to complete. This happens after all locks for the rename
3040 * operation are acquired. But to acquire locks, we need to auth-pin the locks' parent
3041 * objects first. So there is an ABBA deadlock if someone auth pins the source inode
f67539c2 3042 * after locks are acquired and before Server::handle_peer_rename_prep() is called.
7c673cae
FG
3043 * The solution is to freeze the inode and prevent other MDRequests from getting new
3044 * auth pins.
3045 */
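    /* Illustrative sketch of that ABBA cycle (hypothetical ranks A and B,
     * not from the original source): A is the rename coordinator, B is an
     * unrelated request auth-pinning the source inode srci:
     *
     *   A: acquire_locks(mdr, lov)        // rename locks held
     *   B: other_mdr->auth_pin(srci)      // new pin lands after A's locks
     *   A: freeze_inode(srci)             // waits for B's pin to drop
     *   B: acquire_locks(other_mdr, ...)  // waits for A's locks -> deadlock
     *
     * The freeze_auth_pin() call below closes the window by refusing new
     * auth pins on the frozen inode.
     */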
3046 if (auth_pin_freeze) {
3047 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
3048 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
3049 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
3050 mds->mdlog->flush();
9f95a23c 3051 goto blocked;
7c673cae
FG
3052 }
3053 }
7c673cae
FG
3054 }
3055
f67539c2 3056 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
7c673cae 3057
9f95a23c
TL
3058 if (fail) {
3059 mdr->drop_local_auth_pins(); // just in case
3060 if (readonly)
3061 reply->mark_error_rofs();
3062 if (wouldblock)
3063 reply->mark_error_wouldblock();
3064 } else {
3065 // auth pin!
3066 for (const auto& obj : objects) {
3067 dout(10) << "auth_pinning " << *obj << dendl;
3068 mdr->auth_pin(obj);
3069 }
3070 // return list of my auth_pins (if any)
3071 for (const auto &p : mdr->object_states) {
3072 if (!p.second.auth_pinned)
3073 continue;
3074 MDSCacheObjectInfo info;
3075 p.first->set_object_info(info);
3076 reply->get_authpins().push_back(info);
3077 if (p.first == (MDSCacheObject*)auth_pin_freeze)
3078 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
3079 }
3080 }
7c673cae 3081
f67539c2 3082 mds->send_message_mds(reply, mdr->peer_to_mds);
7c673cae
FG
3083
3084 // clean up this request
f67539c2 3085 mdr->reset_peer_request();
7c673cae 3086 return;
9f95a23c
TL
3087
3088blocked:
f67539c2
TL
3089 if (mdr->peer_request->should_notify_blocking()) {
3090 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
9f95a23c 3091 reply->mark_req_blocked();
f67539c2
TL
3092 mds->send_message_mds(reply, mdr->peer_to_mds);
3093 mdr->peer_request->clear_notify_blocking();
9f95a23c
TL
3094 }
3095 return;
7c673cae
FG
3096}
3097
f67539c2 3098void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7c673cae 3099{
f67539c2 3100 dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
7c673cae
FG
3101 mds_rank_t from = mds_rank_t(ack->get_source().num());
3102
9f95a23c
TL
3103 if (ack->is_req_blocked()) {
3104 mdr->disable_lock_cache();
f67539c2 3105 // peer auth pin is blocked, drop locks to avoid deadlock
9f95a23c
TL
3106 mds->locker->drop_locks(mdr.get(), nullptr);
3107 return;
3108 }
3109
7c673cae
FG
3110 // added auth pins?
3111 set<MDSCacheObject*> pinned;
11fdf7f2
TL
3112 for (const auto &oi : ack->get_authpins()) {
3113 MDSCacheObject *object = mdcache->get_object(oi);
3114 ceph_assert(object); // we pinned it
7c673cae 3115 dout(10) << " remote has pinned " << *object << dendl;
9f95a23c 3116 mdr->set_remote_auth_pinned(object, from);
11fdf7f2 3117 if (oi == ack->get_authpin_freeze())
7c673cae
FG
3118 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
3119 pinned.insert(object);
3120 }
3121
3122 // removed frozen auth pin?
3123 if (mdr->more()->is_remote_frozen_authpin &&
3124 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
9f95a23c
TL
3125 auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
3126 ceph_assert(stat_p);
3127 if (stat_p->remote_auth_pinned == from) {
7c673cae
FG
3128 mdr->more()->is_remote_frozen_authpin = false;
3129 }
3130 }
3131
3132 // removed auth pins?
9f95a23c
TL
3133 for (auto& p : mdr->object_states) {
3134 if (p.second.remote_auth_pinned == MDS_RANK_NONE)
3135 continue;
3136 MDSCacheObject* object = p.first;
3137 if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
7c673cae 3138 dout(10) << " remote has unpinned " << *object << dendl;
9f95a23c 3139 mdr->_clear_remote_auth_pinned(p.second);
7c673cae
FG
3140 }
3141 }
3142
f67539c2
TL
3143 // note peer
3144 mdr->more()->peers.insert(from);
9f95a23c
TL
3145
3146 // clear from waiting list
f67539c2 3147 auto ret = mdr->more()->waiting_on_peer.erase(from);
9f95a23c
TL
3148 ceph_assert(ret);
3149
7c673cae 3150 if (ack->is_error_rofs()) {
f67539c2 3151 mdr->more()->peer_error = -CEPHFS_EROFS;
7c673cae 3152 } else if (ack->is_error_wouldblock()) {
f67539c2 3153 mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
7c673cae 3154 }
7c673cae
FG
3155
3156 // go again?
f67539c2 3157 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
3158 mdcache->dispatch_request(mdr);
3159 else
f67539c2 3160 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
3161}
3162
3163
3164// ---------------------------------------
3165// HELPERS
3166
3167
3168/**
3169 * check whether we are permitted to complete a request
3170 *
3171 * Check whether we have permission to perform the operation specified
3172 * by mask on the given inode, based on the capability in the mdr's
3173 * session.
3174 */
3175bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
3176{
3177 if (mdr->session) {
3178 int r = mdr->session->check_access(
3179 in, mask,
3180 mdr->client_request->get_caller_uid(),
3181 mdr->client_request->get_caller_gid(),
3182 &mdr->client_request->get_caller_gid_list(),
3183 mdr->client_request->head.args.setattr.uid,
3184 mdr->client_request->head.args.setattr.gid);
3185 if (r < 0) {
3186 respond_to_request(mdr, r);
3187 return false;
3188 }
3189 }
3190 return true;
3191}
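/* Usage sketch (pattern taken from the callers later in this file; the
 * MAY_READ mask is just an example): check_access() sends the error reply
 * itself, typically an EACCES-style code from Session::check_access(), so
 * callers only need to bail out:
 *
 *   if (!check_access(mdr, cur, MAY_READ))
 *     return;   // reply already sent
 */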
3192
3193/**
3194 * check whether fragment has reached maximum size
3195 *
3196 */
3197bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
3198{
3199 const auto size = in->get_frag_size();
11fdf7f2 3200 if (size >= g_conf()->mds_bal_fragment_size_max) {
f67539c2
TL
3201 dout(10) << "fragment " << *in << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (CEPHFS_ENOSPC)" << dendl;
3202 respond_to_request(mdr, -CEPHFS_ENOSPC);
7c673cae
FG
3203 return false;
3204 }
3205
3206 return true;
3207}
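/* Worked example (the default of mds_bal_fragment_size_max is an
 * assumption -- commonly 100000; check your configuration): a create into a
 * dirfrag whose get_frag_size() is already at that limit is rejected before
 * any state changes:
 *
 *   if (!check_fragment_space(mdr, dn->get_dir()))
 *     return;   // client received -CEPHFS_ENOSPC
 */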
3208
7c673cae
FG
3209CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3210{
f67539c2
TL
3211 string straydname;
3212 in->name_stray_dentry(straydname);
3213
7c673cae
FG
3214 CDentry *straydn = mdr->straydn;
3215 if (straydn) {
9f95a23c
TL
3216 ceph_assert(straydn->get_name() == straydname);
3217 return straydn;
7c673cae 3218 }
7c673cae
FG
3219 CDir *straydir = mdcache->get_stray_dir(in);
3220
3221 if (!mdr->client_request->is_replay() &&
3222 !check_fragment_space(mdr, straydir))
f67539c2
TL
3223 return nullptr;
3224
3225 straydn = straydir->lookup(straydname);
3226 if (!straydn) {
3227 if (straydir->is_frozen_dir()) {
3228 dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
3229 mds->locker->drop_locks(mdr.get());
3230 mdr->drop_local_auth_pins();
3231 straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3232 return nullptr;
3233 }
3234 straydn = straydir->add_null_dentry(straydname);
3235 straydn->mark_new();
3236 } else {
3237 ceph_assert(straydn->get_projected_linkage()->is_null());
3238 }
7c673cae 3239
f67539c2 3240 straydn->state_set(CDentry::STATE_STRAY);
7c673cae
FG
3241 mdr->straydn = straydn;
3242 mdr->pin(straydn);
f67539c2 3243
7c673cae
FG
3244 return straydn;
3245}
3246
3247/** prepare_new_inode
3248 *
3249 * create a new inode. set c/m/atime. hit dir pop.
3250 */
3251CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
f67539c2 3252 const file_layout_t *layout)
7c673cae
FG
3253{
3254 CInode *in = new CInode(mdcache);
f67539c2 3255 auto _inode = in->_get_inode();
7c673cae
FG
3256
3257 // Server::prepare_force_open_sessions() can re-open session in closing
3258 // state. In that corner case, session's prealloc_inos are being freed.
3259 // To simplify the code, we disallow using/refilling session's prealloc_ino
3260 // while session is opening.
92f5a8d4 3261 bool allow_prealloc_inos = mdr->session->is_open();
7c673cae
FG
3262
3263 // assign ino
f67539c2 3264 if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(useino))) {
7c673cae 3265 mds->sessionmap.mark_projected(mdr->session);
7c673cae 3266 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
f67539c2 3267 << " (" << mdr->session->info.prealloc_inos.size() << " left)"
7c673cae
FG
3268 << dendl;
3269 } else {
3270 mdr->alloc_ino =
f67539c2 3271 _inode->ino = mds->inotable->project_alloc_id(useino);
7c673cae
FG
3272 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3273 }
3274
f67539c2
TL
3275 if (useino && useino != _inode->ino) {
3276 dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
7c673cae
FG
3277 mds->clog->error() << mdr->client_request->get_source()
3278 << " specified ino " << useino
f67539c2 3279 << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
7c673cae
FG
3280 //ceph_abort(); // just for now.
3281 }
3282
3283 if (allow_prealloc_inos &&
11fdf7f2
TL
3284 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3285 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
7c673cae 3286 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
11fdf7f2 3287 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
7c673cae
FG
3288 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3289 mds->sessionmap.mark_projected(mdr->session);
3290 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3291 }
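    /* Arithmetic sketch, assuming the default mds_client_prealloc_inos of
     * 1000 (an assumption; check your config): refill triggers once the
     * session's projected pool drops below 1000/2 = 500, and 'need' tops it
     * back up to exactly 1000. */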
3292
f67539c2
TL
3293 _inode->version = 1;
3294 _inode->xattr_version = 1;
3295 _inode->nlink = 1; // FIXME
7c673cae 3296
f67539c2 3297 _inode->mode = mode;
7c673cae 3298
92f5a8d4 3299 // FIPS zeroization audit 20191117: this memset is not security related.
f67539c2
TL
3300 memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
3301 if (_inode->is_dir()) {
3302 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae 3303 } else if (layout) {
f67539c2 3304 _inode->layout = *layout;
7c673cae 3305 } else {
f67539c2 3306 _inode->layout = mdcache->default_file_layout;
7c673cae
FG
3307 }
3308
f67539c2
TL
3309 _inode->truncate_size = -1ull; // not truncated, yet!
3310 _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
7c673cae
FG
3311
3312 CInode *diri = dir->get_inode();
3313
f67539c2 3314 dout(10) << oct << " dir mode 0" << diri->get_inode()->mode << " new mode 0" << mode << dec << dendl;
7c673cae 3315
f67539c2 3316 if (diri->get_inode()->mode & S_ISGID) {
7c673cae 3317 dout(10) << " dir is sticky" << dendl;
f67539c2 3318 _inode->gid = diri->get_inode()->gid;
7c673cae
FG
3319 if (S_ISDIR(mode)) {
3320 dout(10) << " new dir also sticky" << dendl;
f67539c2 3321 _inode->mode |= S_ISGID;
7c673cae
FG
3322 }
3323 } else
f67539c2 3324 _inode->gid = mdr->client_request->get_caller_gid();
7c673cae 3325
f67539c2 3326 _inode->uid = mdr->client_request->get_caller_uid();
7c673cae 3327
f67539c2 3328 _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
7c673cae
FG
3329 mdr->get_op_stamp();
3330
f67539c2 3331 _inode->change_attr = 0;
7c673cae 3332
9f95a23c 3333 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 3334 if (req->get_data().length()) {
11fdf7f2 3335 auto p = req->get_data().cbegin();
7c673cae
FG
3336
3337 // xattrs on new inode?
f67539c2
TL
3338 auto _xattrs = CInode::allocate_xattr_map();
3339 decode_noshare(*_xattrs, p);
3340 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
3341 in->reset_xattrs(std::move(_xattrs));
7c673cae
FG
3342 }
3343
3344 if (!mds->mdsmap->get_inline_data_enabled() ||
11fdf7f2 3345 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
f67539c2 3346 _inode->inline_data.version = CEPH_INLINE_NONE;
7c673cae
FG
3347
3348 mdcache->add_inode(in); // add
3349 dout(10) << "prepare_new_inode " << *in << dendl;
3350 return in;
3351}
3352
3353void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3354{
3355 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3356 << " inotablev " << mds->inotable->get_projected_version()
3357 << dendl;
3358 blob->set_ino_alloc(mdr->alloc_ino,
3359 mdr->used_prealloc_ino,
3360 mdr->prealloc_inos,
3361 mdr->client_request->get_source(),
3362 mds->sessionmap.get_projected(),
3363 mds->inotable->get_projected_version());
3364}
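/* Note on the three ino fields journaled above (all set in
 * prepare_new_inode()): used_prealloc_ino is an ino consumed from the
 * session's existing pool via take_ino(), alloc_ino is a single id freshly
 * projected from the InoTable, and prealloc_inos is a projected batch that
 * refills the session. Journaling all three lets replay redo exactly the
 * projections made for this request. */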
3365
3366void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3367{
3368 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3369 << " / " << mdr->prealloc_inos
3370 << " / " << mdr->used_prealloc_ino << dendl;
3371
3372 if (mdr->alloc_ino) {
3373 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3374 }
3375 if (mdr->prealloc_inos.size()) {
11fdf7f2 3376 ceph_assert(session);
7c673cae 3377 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
f67539c2 3378 session->free_prealloc_inos.insert(mdr->prealloc_inos);
7c673cae 3379 session->info.prealloc_inos.insert(mdr->prealloc_inos);
81eedcae 3380 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
7c673cae
FG
3381 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3382 }
3383 if (mdr->used_prealloc_ino) {
11fdf7f2 3384 ceph_assert(session);
f67539c2 3385 session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
7c673cae
FG
3386 mds->sessionmap.mark_dirty(session);
3387 }
3388}
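/* Prealloc lifecycle implied by the two functions above (members as seen in
 * this file; the step ordering is a reading of the code, not a quote):
 *
 *   prepare_new_inode(): project   -> session->pending_prealloc_inos
 *   apply (here):        commit    -> info.prealloc_inos + free_prealloc_inos
 *   used_prealloc_ino:   consumed  -> erased from info.prealloc_inos
 */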
3389
3390class C_MDS_TryFindInode : public ServerContext {
3391 MDRequestRef mdr;
3392public:
3393 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3394 void finish(int r) override {
f67539c2 3395 if (r == -CEPHFS_ESTALE) // :( find_ino_peers failed
7c673cae
FG
3396 server->respond_to_request(mdr, r);
3397 else
3398 server->dispatch_client_request(mdr);
3399 }
3400};
3401
7c673cae
FG
3402/* If this returns null, the request has been handled
3403 * as appropriate: forwarded on, or the client's been replied to */
9f95a23c 3404CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
7c673cae 3405 bool want_auth,
9f95a23c 3406 bool no_want_auth)
7c673cae 3407{
9f95a23c 3408 const filepath& refpath = mdr->get_filepath();
7c673cae
FG
3409 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3410
9f95a23c
TL
3411 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3412 return mdr->in[0];
7c673cae
FG
3413
3414 // traverse
f67539c2 3415 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
9f95a23c
TL
3416 int flags = 0;
3417 if (refpath.is_last_snap()) {
3418 if (!no_want_auth)
3419 want_auth = true;
3420 } else {
f91f0fd5
TL
3421 if (!no_want_auth && forward_all_requests_to_auth)
3422 want_auth = true;
9f95a23c
TL
3423 flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
3424 }
3425 if (want_auth)
3426 flags |= MDS_TRAVERSE_WANT_AUTH;
3427 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
7c673cae 3428 if (r > 0)
9f95a23c 3429 return nullptr; // delayed
7c673cae 3430 if (r < 0) { // error
f67539c2 3431 if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
9f95a23c
TL
3432 if (mdr->client_request &&
3433 mdr->client_request->get_dentry_wanted())
3434 mdr->tracedn = mdr->dn[0].back();
7c673cae 3435 respond_to_request(mdr, r);
f67539c2
TL
3436 } else if (r == -CEPHFS_ESTALE) {
3437 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
11fdf7f2 3438 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
7c673cae
FG
3439 mdcache->find_ino_peers(refpath.get_ino(), c);
3440 } else {
3441 dout(10) << "FAIL on error " << r << dendl;
3442 respond_to_request(mdr, r);
3443 }
9f95a23c 3444 return nullptr;
7c673cae 3445 }
9f95a23c 3446 CInode *ref = mdr->in[0];
7c673cae
FG
3447 dout(10) << "ref is " << *ref << dendl;
3448
7c673cae 3449 if (want_auth) {
7c673cae
FG
3450 // auth_pin?
3451 // do NOT proceed if freezing, as cap release may defer in that case, and
3452 // we could deadlock when we try to lock @ref.
3453 // if we're already auth_pinned, continue; the release has already been processed.
3454 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3455 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3456 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
9f95a23c
TL
3457 ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
3458 if (mdr->is_any_remote_auth_pin())
224ce89b 3459 mds->locker->notify_freeze_waiter(ref);
7c673cae
FG
3460 return nullptr;
3461 }
7c673cae
FG
3462 mdr->auth_pin(ref);
3463 }
3464
7c673cae
FG
3465 // set and pin ref
3466 mdr->pin(ref);
3467 return ref;
3468}
3469
3470
3471/** rdlock_path_xlock_dentry
3472 * traverse path to the directory that could/would contain dentry.
3473 * make sure i am auth for that dentry, forward as necessary.
3474 * create null dentry in place (or use existing if okexist).
3475 * get rdlocks on traversed dentries, xlock on new dentry.
3476 */
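/* For a create this resolves to the flag set assembled below:
 *
 *   MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
 *   MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
 *   MDS_TRAVERSE_WANT_AUTH   | MDS_TRAVERSE_RDLOCK_AUTHLOCK
 *
 * i.e. rdlocks along the traversed path plus an xlock on the (possibly
 * null) final dentry, with auth enforced; MDS_TRAVERSE_CHECK_LOCKCACHE and
 * MDS_TRAVERSE_WANT_DIRLAYOUT are added conditionally.
 */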
9f95a23c
TL
3477CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
3478 bool create, bool okexist, bool want_layout)
7c673cae 3479{
9f95a23c 3480 const filepath& refpath = mdr->get_filepath();
7c673cae
FG
3481 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3482
9f95a23c
TL
3483 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3484 return mdr->dn[0].back();
3485
3486 // figure parent dir vs dname
3487 if (refpath.depth() == 0) {
3488 dout(7) << "invalid path (zero length)" << dendl;
f67539c2 3489 respond_to_request(mdr, -CEPHFS_EINVAL);
9f95a23c
TL
3490 return nullptr;
3491 }
3492
3493 if (refpath.is_last_snap()) {
f67539c2 3494 respond_to_request(mdr, -CEPHFS_EROFS);
9f95a23c
TL
3495 return nullptr;
3496 }
7c673cae 3497
9f95a23c
TL
3498 if (refpath.is_last_dot_or_dotdot()) {
3499 dout(7) << "invalid path (last dot or dot_dot)" << dendl;
3500 if (create)
f67539c2 3501 respond_to_request(mdr, -CEPHFS_EEXIST);
9f95a23c 3502 else
f67539c2 3503 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
9f95a23c
TL
3504 return nullptr;
3505 }
7c673cae 3506
9f95a23c 3507 // traverse to parent dir
f67539c2 3508 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
9f95a23c
TL
3509 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
3510 MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
3511 MDS_TRAVERSE_WANT_AUTH;
3512 if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
3513 flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
3514 if (create)
3515 flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
3516 if (want_layout)
3517 flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
3518 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3519 if (r > 0)
3520 return nullptr; // delayed
3521 if (r < 0) {
f67539c2
TL
3522 if (r == -CEPHFS_ESTALE) {
3523 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
9f95a23c
TL
3524 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3525 return nullptr;
3526 }
3527 respond_to_request(mdr, r);
3528 return nullptr;
3529 }
7c673cae 3530
9f95a23c
TL
3531 CDentry *dn = mdr->dn[0].back();
3532 CDir *dir = dn->get_dir();
7c673cae 3533 CInode *diri = dir->get_inode();
9f95a23c 3534
7c673cae
FG
3535 if (!mdr->reqid.name.is_mds()) {
3536 if (diri->is_system() && !diri->is_root()) {
f67539c2 3537 respond_to_request(mdr, -CEPHFS_EROFS);
9f95a23c 3538 return nullptr;
7c673cae
FG
3539 }
3540 }
9f95a23c 3541
7c673cae 3542 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
f67539c2 3543 respond_to_request(mdr, -CEPHFS_ENOENT);
9f95a23c 3544 return nullptr;
7c673cae
FG
3545 }
3546
9f95a23c
TL
3547 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3548 if (dnl->is_null()) {
3549 if (!create && okexist) {
f67539c2 3550 respond_to_request(mdr, -CEPHFS_ENOENT);
9f95a23c 3551 return nullptr;
7c673cae
FG
3552 }
3553
9f95a23c
TL
3554 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3555 dn->first = std::max(dn->first, next_snap);
7c673cae 3556 } else {
9f95a23c 3557 if (!okexist) {
f67539c2 3558 respond_to_request(mdr, -CEPHFS_EEXIST);
9f95a23c
TL
3559 return nullptr;
3560 }
3561 mdr->in[0] = dnl->get_inode();
7c673cae
FG
3562 }
3563
7c673cae
FG
3564 return dn;
3565}
3566
9f95a23c
TL
3567/** rdlock_two_paths_xlock_destdn
3568 * traverse two paths and lock the two paths in proper order.
3569 * The order of taking locks is:
3570 * 1. Lock directory inodes or dentries according to which trees they
3571 * are under. Lock objects under fs root before objects under mdsdir.
3572 * 2. Lock directory inodes or dentries according to their depth, in
3573 * ascending order.
3574 * 3. Lock directory inodes or dentries according to inode numbers or
3575 * dentries' parent inode numbers, in ascending order.
3576 * 4. Lock dentries in the same directory in order of their keys.
3577 * 5. Lock non-directory inodes according to inode numbers, in ascending
3578 * order.
3579 */
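/* Worked example (hypothetical dirs /a and /b; assumes xlock_srcdn == true,
 * both parents locally auth, and compare_paths() returning 0, i.e. same
 * tree and depth): renaming src /a/x to dest /b/y with ino(a) < ino(b)
 * takes, per rule 3, the srcdir side first:
 *
 *   wrlock a->filelock, a->nestlock;  xlock srcdn(x)->lock
 *   wrlock b->filelock, b->nestlock;  xlock destdn(y)->lock
 *
 * Rule 4 only matters when both dentries share a directory, and rule 5
 * orders the inode locks taken later in the operation.
 */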
3580std::pair<CDentry*, CDentry*>
3581Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
3582{
7c673cae 3583
9f95a23c
TL
3584 const filepath& refpath = mdr->get_filepath();
3585 const filepath& refpath2 = mdr->get_filepath2();
7c673cae 3586
9f95a23c 3587 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
7c673cae 3588
9f95a23c
TL
3589 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3590 return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
7c673cae 3591
9f95a23c 3592 if (refpath.depth() != 1 || refpath2.depth() != 1) {
f67539c2 3593 respond_to_request(mdr, -CEPHFS_EINVAL);
9f95a23c
TL
3594 return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
3595 }
3596
3597 if (refpath.is_last_snap() || refpath2.is_last_snap()) {
f67539c2 3598 respond_to_request(mdr, -CEPHFS_EROFS);
9f95a23c
TL
3599 return std::make_pair(nullptr, nullptr);
3600 }
3601
3602 // traverse to parent dir
f67539c2 3603 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
9f95a23c
TL
3604 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
3605 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3606 if (r != 0) {
f67539c2
TL
3607 if (r == -CEPHFS_ESTALE) {
3608 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
9f95a23c
TL
3609 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3610 } else if (r < 0) {
3611 respond_to_request(mdr, r);
3612 }
3613 return std::make_pair(nullptr, nullptr);
3614 }
3615
3616 flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
3617 r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
3618 if (r != 0) {
f67539c2
TL
3619 if (r == -CEPHFS_ESTALE) {
3620 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
9f95a23c
TL
3621 mdcache->find_ino_peers(refpath2.get_ino(), new C_MDS_TryFindInode(this, mdr));
3622 } else if (r < 0) {
3623 respond_to_request(mdr, r);
3624 }
3625 return std::make_pair(nullptr, nullptr);
3626 }
3627
3628 CDentry *srcdn = mdr->dn[1].back();
3629 CDir *srcdir = srcdn->get_dir();
3630 CDentry *destdn = mdr->dn[0].back();
3631 CDir *destdir = destdn->get_dir();
3632
3633 if (!mdr->reqid.name.is_mds()) {
3634 if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
3635 (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
f67539c2 3636 respond_to_request(mdr, -CEPHFS_EROFS);
9f95a23c
TL
3637 return std::make_pair(nullptr, nullptr);
3638 }
3639 }
3640
3641 if (!destdir->get_inode()->is_base() &&
3642 destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
f67539c2 3643 respond_to_request(mdr, -CEPHFS_ENOENT);
9f95a23c
TL
3644 return std::make_pair(nullptr, nullptr);
3645 }
3646
3647 MutationImpl::LockOpVec lov;
3648 if (srcdir->get_inode() == destdir->get_inode()) {
3649 lov.add_wrlock(&destdir->inode->filelock);
3650 lov.add_wrlock(&destdir->inode->nestlock);
3651 if (xlock_srcdn && srcdir != destdir) {
3652 mds_rank_t srcdir_auth = srcdir->authority().first;
3653 if (srcdir_auth != mds->get_nodeid()) {
3654 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3655 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3656 }
3657 }
3658
3659 if (srcdn->get_name() > destdn->get_name())
3660 lov.add_xlock(&destdn->lock);
3661
3662 if (xlock_srcdn)
3663 lov.add_xlock(&srcdn->lock);
3664 else
3665 lov.add_rdlock(&srcdn->lock);
3666
3667 if (srcdn->get_name() < destdn->get_name())
3668 lov.add_xlock(&destdn->lock);
3669 } else {
3670 int cmp = mdr->compare_paths();
3671 bool lock_destdir_first =
3672 (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
3673
3674 if (lock_destdir_first) {
3675 lov.add_wrlock(&destdir->inode->filelock);
3676 lov.add_wrlock(&destdir->inode->nestlock);
3677 lov.add_xlock(&destdn->lock);
3678 }
3679
3680 if (xlock_srcdn) {
3681 mds_rank_t srcdir_auth = srcdir->authority().first;
3682 if (srcdir_auth == mds->get_nodeid()) {
3683 lov.add_wrlock(&srcdir->inode->filelock);
3684 lov.add_wrlock(&srcdir->inode->nestlock);
3685 } else {
3686 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3687 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3688 }
3689 lov.add_xlock(&srcdn->lock);
3690 } else {
3691 lov.add_rdlock(&srcdn->lock);
3692 }
3693
3694 if (!lock_destdir_first) {
3695 lov.add_wrlock(&destdir->inode->filelock);
3696 lov.add_wrlock(&destdir->inode->nestlock);
3697 lov.add_xlock(&destdn->lock);
3698 }
3699 }
3700
3701 CInode *auth_pin_freeze = nullptr;
3702 // XXX any better way to do this?
3703 if (xlock_srcdn && !srcdn->is_auth()) {
3704 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
3705 auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
3706 }
3707 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
3708 return std::make_pair(nullptr, nullptr);
3709
3710 if (srcdn->get_projected_linkage()->is_null()) {
f67539c2 3711 respond_to_request(mdr, -CEPHFS_ENOENT);
9f95a23c
TL
3712 return std::make_pair(nullptr, nullptr);
3713 }
3714
3715 if (destdn->get_projected_linkage()->is_null()) {
3716 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3717 destdn->first = std::max(destdn->first, next_snap);
3718 }
3719
3720 mdr->locking_state |= MutationImpl::PATH_LOCKED;
3721
3722 return std::make_pair(destdn, srcdn);
3723}
3724
3725/**
3726 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3727 *
3728 * @param diri base inode
3729 * @param fg the exact frag we want
7c673cae
FG
3730 * @param mdr request
3731 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3732 */
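/* Case summary of the branches below (a reading of the code, not new
 * behavior):
 *
 *   frag open, not dirfrag auth   -> request_forward() to frag auth,  NULL
 *   frag closed, inode not auth   -> request_forward() to inode auth, NULL
 *   frag closed, inode frozen     -> wait on WAIT_UNFREEZE,           NULL
 *   otherwise                     -> return the frag, opening it via
 *                                    get_or_open_dirfrag() if needed
 */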
3733CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3734{
3735 CDir *dir = diri->get_dirfrag(fg);
3736
9f95a23c
TL
3737 if (dir) {
3738 // am i auth for the dirfrag?
3739 if (!dir->is_auth()) {
3740 mds_rank_t auth = dir->authority().first;
3741 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3742 << ", fw to mds." << auth << dendl;
3743 mdcache->request_forward(mdr, auth);
3744 return nullptr;
3745 }
3746 } else {
3747 // not open and inode not mine?
3748 if (!diri->is_auth()) {
3749 mds_rank_t inauth = diri->authority().first;
3750 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3751 mdcache->request_forward(mdr, inauth);
3752 return nullptr;
3753 }
7c673cae 3754
9f95a23c
TL
3755 // not open and inode frozen?
3756 if (diri->is_frozen()) {
3757 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3758 ceph_assert(diri->get_parent_dir());
3759 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3760 return nullptr;
3761 }
7c673cae 3762
9f95a23c 3763 // invent?
7c673cae 3764 dir = diri->get_or_open_dirfrag(mdcache, fg);
7c673cae
FG
3765 }
3766
3767 return dir;
3768}
3769
3770
3771// ===============================================================================
3772// STAT
3773
3774void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3775{
9f95a23c 3776 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
3777
3778 if (req->get_filepath().depth() == 0 && is_lookup) {
3779 // refpath can't be empty for lookup but it can for
3780 // getattr (we do getattr with empty refpath for mount of '/')
f67539c2 3781 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
3782 return;
3783 }
3784
28e407b8
AA
3785 bool want_auth = false;
3786 int mask = req->head.args.getattr.mask;
3787 if (mask & CEPH_STAT_RSTAT)
3788 want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
3789
f91f0fd5 3790 if (!mdr->is_batch_head() && mdr->can_batch()) {
f67539c2 3791 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
f91f0fd5
TL
3792 int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
3793 (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
3794 &mdr->dn[0], &mdr->in[0]);
3795 if (r > 0)
3796 return; // delayed
9f95a23c 3797
f91f0fd5
TL
3798 if (r < 0) {
3799 // fall-thru. let rdlock_path_pin_ref() check again.
3800 } else if (is_lookup) {
3801 CDentry* dn = mdr->dn[0].back();
3802 mdr->pin(dn);
3803 auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
9f95a23c 3804 if (em.second) {
f91f0fd5 3805 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
9f95a23c 3806 } else {
f91f0fd5 3807 dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
9f95a23c
TL
3808 em.first->second->add_request(mdr);
3809 return;
3810 }
3811 } else {
f91f0fd5
TL
3812 CInode *in = mdr->in[0];
3813 mdr->pin(in);
3814 auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
9f95a23c 3815 if (em.second) {
f91f0fd5 3816 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
9f95a23c 3817 } else {
f91f0fd5 3818 dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
9f95a23c
TL
3819 em.first->second->add_request(mdr);
3820 return;
3821 }
3822 }
9f95a23c 3823 }
7c673cae 3824
f91f0fd5
TL
3825 CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
3826 if (!ref)
3827 return;
3828
3829 mdr->getattr_caps = mask;
3830
7c673cae
FG
3831 /*
3832 * if client currently holds the EXCL cap on a field, do not rdlock
3833 * it; client's stat() will result in valid info if _either_ EXCL
3834 * cap is held or MDS rdlocks and reads the value here.
3835 *
3836 * handling this case here is easier than weakening rdlock
3837 * semantics... that would cause problems elsewhere.
3838 */
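  /* Example: a client holding CEPH_CAP_AUTH_EXCL has authoritative cached
   * uid/gid/mode, so authlock is deliberately left unlocked in the mask
   * checks below; its stat() stays correct because the EXCL holder is the
   * only possible writer of those fields. */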
3839 client_t client = mdr->get_client();
3840 int issued = 0;
3841 Capability *cap = ref->get_client_cap(client);
3842 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3843 mdr->snapid <= cap->client_follows))
3844 issued = cap->issued();
3845
9f95a23c
TL
3846 // FIXME
3847 MutationImpl::LockOpVec lov;
94b18763 3848 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
11fdf7f2 3849 lov.add_rdlock(&ref->linklock);
94b18763 3850 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
11fdf7f2 3851 lov.add_rdlock(&ref->authlock);
94b18763 3852 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
11fdf7f2 3853 lov.add_rdlock(&ref->xattrlock);
94b18763
FG
3854 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3855 // Don't wait on unstable filelock if client is allowed to read file size.
3856 // This can reduce the response time of getattr in the case that multiple
3857 // clients do stat(2) and there are writers.
3858 // The downside of this optimization is that mds may not issue Fs caps along
3859 // with getattr reply. Client may need to send more getattr requests.
11fdf7f2
TL
3860 if (mdr->is_rdlocked(&ref->filelock)) {
3861 lov.add_rdlock(&ref->filelock);
94b18763
FG
3862 } else if (ref->filelock.is_stable() ||
3863 ref->filelock.get_num_wrlocks() > 0 ||
3864 !ref->filelock.can_read(mdr->get_client())) {
11fdf7f2 3865 lov.add_rdlock(&ref->filelock);
9f95a23c 3866 mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
94b18763
FG
3867 }
3868 }
7c673cae 3869
11fdf7f2 3870 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
3871 return;
3872
3873 if (!check_access(mdr, ref, MAY_READ))
3874 return;
3875
28e407b8
AA
3876 utime_t now = ceph_clock_now();
3877 mdr->set_mds_stamp(now);
3878
7c673cae
FG
3879 // note which caps are requested, so we return at least a snapshot
3880 // value for them. (currently this matters for xattrs and inline data)
3881 mdr->getattr_caps = mask;
3882
11fdf7f2 3883 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
7c673cae
FG
3884
3885 // reply
3886 dout(10) << "reply to stat on " << *req << dendl;
3887 mdr->tracei = ref;
3888 if (is_lookup)
3889 mdr->tracedn = mdr->dn[0].back();
3890 respond_to_request(mdr, 0);
3891}
3892
3893struct C_MDS_LookupIno2 : public ServerContext {
3894 MDRequestRef mdr;
3895 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3896 void finish(int r) override {
3897 server->_lookup_ino_2(mdr, r);
3898 }
3899};
3900
7c673cae
FG
3901/*
3902 * filepath: ino
3903 */
3904void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3905 bool want_parent, bool want_dentry)
3906{
9f95a23c 3907 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2
TL
3908
3909 if ((uint64_t)req->head.args.lookupino.snapid > 0)
3910 return _lookup_snap_ino(mdr);
7c673cae
FG
3911
3912 inodeno_t ino = req->get_filepath().get_ino();
b3b6e05e
TL
3913 auto _ino = ino.val;
3914
3915 /* It's been observed [1] that a client may look up a private ~mdsdir inode.
3916 * I do not have an explanation for how that happened organically but this
3917 * check will ensure that the client can no longer do that.
3918 *
3919 * [1] https://tracker.ceph.com/issues/49922
3920 */
3921 if (MDS_IS_PRIVATE_INO(_ino)) {
3922 respond_to_request(mdr, -CEPHFS_ESTALE);
3923 return;
3924 }
3925
7c673cae
FG
3926 CInode *in = mdcache->get_inode(ino);
3927 if (in && in->state_test(CInode::STATE_PURGING)) {
f67539c2 3928 respond_to_request(mdr, -CEPHFS_ESTALE);
7c673cae
FG
3929 return;
3930 }
3931 if (!in) {
3932 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3933 return;
3934 }
3935
7c673cae
FG
3936 // check for nothing (not read or write); this still applies the
3937 // path check.
3938 if (!check_access(mdr, in, 0))
3939 return;
3940
3941 CDentry *dn = in->get_projected_parent_dn();
3942 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3943
11fdf7f2 3944 MutationImpl::LockOpVec lov;
7c673cae
FG
3945 if (dn && (want_parent || want_dentry)) {
3946 mdr->pin(dn);
11fdf7f2 3947 lov.add_rdlock(&dn->lock);
7c673cae
FG
3948 }
3949
11fdf7f2 3950 unsigned mask = req->head.args.lookupino.mask;
7c673cae
FG
3951 if (mask) {
3952 Capability *cap = in->get_client_cap(mdr->get_client());
3953 int issued = 0;
3954 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3955 issued = cap->issued();
9f95a23c 3956 // FIXME
7c673cae
FG
3957 // permission bits, ACL/security xattrs
3958 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
11fdf7f2 3959 lov.add_rdlock(&in->authlock);
7c673cae 3960 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
11fdf7f2 3961 lov.add_rdlock(&in->xattrlock);
7c673cae
FG
3962
3963 mdr->getattr_caps = mask;
3964 }
3965
11fdf7f2
TL
3966 if (!lov.empty()) {
3967 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
3968 return;
3969
d2e6a577
FG
3970 if (diri != NULL) {
3971 // need read access to directory inode
3972 if (!check_access(mdr, diri, MAY_READ))
3973 return;
3974 }
7c673cae
FG
3975 }
3976
3977 if (want_parent) {
3978 if (in->is_base()) {
f67539c2 3979 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
3980 return;
3981 }
3982 if (!diri || diri->is_stray()) {
f67539c2 3983 respond_to_request(mdr, -CEPHFS_ESTALE);
7c673cae
FG
3984 return;
3985 }
3986 dout(10) << "reply to lookup_parent " << *in << dendl;
3987 mdr->tracei = diri;
3988 respond_to_request(mdr, 0);
3989 } else {
3990 if (want_dentry) {
3991 inodeno_t dirino = req->get_filepath2().get_ino();
3992 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
f67539c2 3993 respond_to_request(mdr, -CEPHFS_ENOENT);
7c673cae
FG
3994 return;
3995 }
3996 dout(10) << "reply to lookup_name " << *in << dendl;
3997 } else
3998 dout(10) << "reply to lookup_ino " << *in << dendl;
3999
4000 mdr->tracei = in;
4001 if (want_dentry)
4002 mdr->tracedn = dn;
4003 respond_to_request(mdr, 0);
4004 }
4005}
4006
11fdf7f2
TL
4007void Server::_lookup_snap_ino(MDRequestRef& mdr)
4008{
9f95a23c 4009 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2
TL
4010
4011 vinodeno_t vino;
4012 vino.ino = req->get_filepath().get_ino();
4013 vino.snapid = (__u64)req->head.args.lookupino.snapid;
4014 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
4015 __u32 hash = req->head.args.lookupino.hash;
4016
4017 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
4018
4019 CInode *in = mdcache->lookup_snap_inode(vino);
4020 if (!in) {
4021 in = mdcache->get_inode(vino.ino);
4022 if (in) {
4023 if (in->state_test(CInode::STATE_PURGING) ||
4024 !in->has_snap_data(vino.snapid)) {
4025 if (in->is_dir() || !parent_ino) {
f67539c2 4026 respond_to_request(mdr, -CEPHFS_ESTALE);
11fdf7f2
TL
4027 return;
4028 }
4029 in = NULL;
4030 }
4031 }
4032 }
4033
4034 if (in) {
4035 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
4036 mdr->snapid = vino.snapid;
4037 mdr->tracei = in;
4038 respond_to_request(mdr, 0);
4039 return;
4040 }
4041
4042 CInode *diri = NULL;
4043 if (parent_ino) {
4044 diri = mdcache->get_inode(parent_ino);
4045 if (!diri) {
b3b6e05e 4046 mdcache->open_ino(parent_ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
11fdf7f2
TL
4047 return;
4048 }
4049
4050 if (!diri->is_dir()) {
f67539c2 4051 respond_to_request(mdr, -CEPHFS_EINVAL);
11fdf7f2
TL
4052 return;
4053 }
4054
4055 MutationImpl::LockOpVec lov;
4056 lov.add_rdlock(&diri->dirfragtreelock);
4057 if (!mds->locker->acquire_locks(mdr, lov))
4058 return;
4059
4060 frag_t frag = diri->dirfragtree[hash];
4061 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
4062 if (!dir)
4063 return;
4064
4065 if (!dir->is_complete()) {
4066 if (dir->is_frozen()) {
4067 mds->locker->drop_locks(mdr.get());
4068 mdr->drop_local_auth_pins();
4069 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4070 return;
4071 }
4072 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4073 return;
4074 }
4075
f67539c2 4076 respond_to_request(mdr, -CEPHFS_ESTALE);
11fdf7f2 4077 } else {
b3b6e05e 4078 mdcache->open_ino(vino.ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
11fdf7f2
TL
4079 }
4080}
4081
7c673cae
FG
4082void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4083{
4084 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4085 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4086
4087 // `r` is a rank if >=0, else an error code
4088 if (r >= 0) {
4089 mds_rank_t dest_rank(r);
4090 if (dest_rank == mds->get_nodeid())
4091 dispatch_client_request(mdr);
4092 else
4093 mdcache->request_forward(mdr, dest_rank);
4094 return;
4095 }
4096
4097 // give up
f67539c2
TL
4098 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
4099 r = -CEPHFS_ESTALE;
7c673cae
FG
4100 respond_to_request(mdr, r);
4101}
4102
4103
4104/* This function takes responsibility for the passed mdr*/
4105void Server::handle_client_open(MDRequestRef& mdr)
4106{
9f95a23c 4107 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
4108 dout(7) << "open on " << req->get_filepath() << dendl;
4109
4110 int flags = req->head.args.open.flags;
4111 int cmode = ceph_flags_to_mode(flags);
4112 if (cmode < 0) {
f67539c2 4113 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4114 return;
4115 }
4116
181888fb
FG
4117 bool need_auth = !file_mode_is_readonly(cmode) ||
4118 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
7c673cae
FG
4119
4120 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4121 dout(7) << "read-only FS" << dendl;
f67539c2 4122 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
4123 return;
4124 }
4125
9f95a23c 4126 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
7c673cae
FG
4127 if (!cur)
4128 return;
4129
4130 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
11fdf7f2 4131 ceph_assert(!need_auth);
9f95a23c
TL
4132 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4133 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4134 if (!cur)
4135 return;
4136 }
4137
f67539c2 4138 if (!cur->is_file()) {
7c673cae
FG
4139 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4140 cmode = CEPH_FILE_MODE_PIN;
4141 // if the inode is a symlink and the client wants to follow it, ignore the O_TRUNC flag.
f67539c2 4142 if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
7c673cae
FG
4143 flags &= ~CEPH_O_TRUNC;
4144 }
4145
4146 dout(10) << "open flags = " << flags
4147 << ", filemode = " << cmode
4148 << ", need_auth = " << need_auth
4149 << dendl;
4150
4151 // regular file?
4152 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4153 dout(7) << "not a file or dir " << *cur << dendl;
f67539c2 4154 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
7c673cae
FG
4155 return;
4156 }*/
f67539c2 4157 if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
7c673cae 4158 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
f67539c2 4159 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4160 return;
4161 }
4162
f67539c2 4163 if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
7c673cae 4164 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
f67539c2
TL
4165 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4166 respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
7c673cae
FG
4167 return;
4168 }
4169
f67539c2 4170 if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
11fdf7f2 4171 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
7c673cae 4172 dout(7) << "old client cannot open inline data file " << *cur << dendl;
f67539c2 4173 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
4174 return;
4175 }
4176
4177 // snapped data is read only
4178 if (mdr->snapid != CEPH_NOSNAP &&
4179 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4180 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
f67539c2 4181 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
4182 return;
4183 }
4184
9f95a23c
TL
4185 MutationImpl::LockOpVec lov;
4186
7c673cae
FG
4187 unsigned mask = req->head.args.open.mask;
4188 if (mask) {
4189 Capability *cap = cur->get_client_cap(mdr->get_client());
4190 int issued = 0;
4191 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4192 issued = cap->issued();
4193 // permission bits, ACL/security xattrs
4194 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
11fdf7f2 4195 lov.add_rdlock(&cur->authlock);
7c673cae 4196 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
11fdf7f2 4197 lov.add_rdlock(&cur->xattrlock);
7c673cae
FG
4198
4199 mdr->getattr_caps = mask;
4200 }
4201
4202 // O_TRUNC
4203 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
11fdf7f2 4204 ceph_assert(cur->is_auth());
7c673cae 4205
11fdf7f2
TL
4206 lov.add_xlock(&cur->filelock);
4207 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
4208 return;
4209
4210 if (!check_access(mdr, cur, MAY_WRITE))
4211 return;
4212
4213 // wait for pending truncate?
f67539c2 4214 const auto& pi = cur->get_projected_inode();
7c673cae
FG
4215 if (pi->is_truncating()) {
4216 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4217 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4218 mds->locker->drop_locks(mdr.get());
4219 mdr->drop_local_auth_pins();
4220 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4221 return;
4222 }
4223
4224 do_open_truncate(mdr, cmode);
4225 return;
4226 }
4227
4228 // sync filelock if snapped.
4229 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4230 // and that data itself is flushed so that we can read the snapped data off disk.
4231 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
11fdf7f2 4232 lov.add_rdlock(&cur->filelock);
7c673cae
FG
4233 }
4234
11fdf7f2 4235 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
4236 return;
4237
4238 mask = MAY_READ;
4239 if (cmode & CEPH_FILE_MODE_WR)
4240 mask |= MAY_WRITE;
4241 if (!check_access(mdr, cur, mask))
4242 return;
4243
28e407b8
AA
4244 utime_t now = ceph_clock_now();
4245 mdr->set_mds_stamp(now);
4246
7c673cae
FG
4247 if (cur->is_file() || cur->is_dir()) {
4248 if (mdr->snapid == CEPH_NOSNAP) {
4249 // register new cap
9f95a23c 4250 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
7c673cae
FG
4251 if (cap)
4252 dout(12) << "open issued caps " << ccap_string(cap->pending())
4253 << " for " << req->get_source()
4254 << " on " << *cur << dendl;
4255 } else {
4256 int caps = ceph_caps_for_mode(cmode);
4257 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4258 << " for " << req->get_source()
4259 << " snapid " << mdr->snapid
4260 << " on " << *cur << dendl;
4261 mdr->snap_caps = caps;
4262 }
4263 }
4264
4265 // increase max_size?
4266 if (cmode & CEPH_FILE_MODE_WR)
4267 mds->locker->check_inode_max_size(cur);
4268
4269 // make sure this inode gets into the journal
4270 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
11fdf7f2 4271 mdcache->open_file_table.should_log_open(cur)) {
7c673cae
FG
4272 EOpen *le = new EOpen(mds->mdlog);
4273 mdlog->start_entry(le);
4274 le->add_clean_inode(cur);
7c673cae
FG
4275 mdlog->submit_entry(le);
4276 }
4277
4278 // hit pop
4279 if (cmode & CEPH_FILE_MODE_WR)
11fdf7f2 4280 mds->balancer->hit_inode(cur, META_POP_IWR);
7c673cae 4281 else
11fdf7f2 4282 mds->balancer->hit_inode(cur, META_POP_IRD,
7c673cae
FG
4283 mdr->client_request->get_source().num());
4284
4285 CDentry *dn = 0;
4286 if (req->get_dentry_wanted()) {
11fdf7f2 4287 ceph_assert(mdr->dn[0].size());
7c673cae
FG
4288 dn = mdr->dn[0].back();
4289 }
4290
4291 mdr->tracei = cur;
4292 mdr->tracedn = dn;
4293 respond_to_request(mdr, 0);
4294}
4295
4296class C_MDS_openc_finish : public ServerLogContext {
4297 CDentry *dn;
4298 CInode *newi;
7c673cae 4299public:
11fdf7f2
TL
4300 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4301 ServerLogContext(s, r), dn(d), newi(ni) {}
7c673cae 4302 void finish(int r) override {
11fdf7f2 4303 ceph_assert(r == 0);
7c673cae
FG
4304
4305 dn->pop_projected_linkage();
4306
4307 // dirty inode, dn, dir
f67539c2 4308 newi->mark_dirty(mdr->ls);
28e407b8 4309 newi->mark_dirty_parent(mdr->ls, true);
7c673cae
FG
4310
4311 mdr->apply();
4312
4313 get_mds()->locker->share_inode_max_size(newi);
4314
4315 MDRequestRef null_ref;
4316 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4317
11fdf7f2 4318 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
7c673cae
FG
4319
4320 server->respond_to_request(mdr, 0);
4321
11fdf7f2 4322 ceph_assert(g_conf()->mds_kill_openc_at != 1);
7c673cae
FG
4323 }
4324};
4325
4326/* This function takes responsibility for the passed mdr*/
4327void Server::handle_client_openc(MDRequestRef& mdr)
4328{
9f95a23c 4329 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
4330 client_t client = mdr->get_client();
4331
4332 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4333
4334 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4335 if (cmode < 0) {
f67539c2 4336 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4337 return;
4338 }
4339
c07f9fc5 4340 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
9f95a23c
TL
4341 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
4342 if (!dn)
4343 return;
c07f9fc5 4344
9f95a23c
TL
4345 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4346 if (!excl && !dnl->is_null()) {
4347 // it existed.
4348 mds->locker->xlock_downgrade(&dn->lock, mdr.get());
4349
4350 MutationImpl::LockOpVec lov;
4351 lov.add_rdlock(&dnl->get_inode()->snaplock);
4352 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae 4353 return;
7c673cae 4354
9f95a23c 4355 handle_client_open(mdr);
7c673cae
FG
4356 return;
4357 }
9f95a23c
TL
4358
4359 ceph_assert(dnl->is_null());
4360
f67539c2
TL
4361 if (req->get_alternate_name().size() > alternate_name_max) {
4362 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
4363 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
4364 return;
4365 }
4366 dn->set_alternate_name(req->get_alternate_name());
4367
7c673cae
FG
4368 // set layout
4369 file_layout_t layout;
9f95a23c
TL
4370 if (mdr->dir_layout != file_layout_t())
4371 layout = mdr->dir_layout;
7c673cae
FG
4372 else
4373 layout = mdcache->default_file_layout;
4374
4375 // What kind of client caps are required to complete this operation
4376 uint64_t access = MAY_WRITE;
4377
4378 const auto default_layout = layout;
4379
4380 // fill in any special params from client
4381 if (req->head.args.open.stripe_unit)
4382 layout.stripe_unit = req->head.args.open.stripe_unit;
4383 if (req->head.args.open.stripe_count)
4384 layout.stripe_count = req->head.args.open.stripe_count;
4385 if (req->head.args.open.object_size)
4386 layout.object_size = req->head.args.open.object_size;
4387 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4388 (__s32)req->head.args.open.pool >= 0) {
4389 layout.pool_id = req->head.args.open.pool;
4390
4391 // make sure we have as new a map as the client
4392 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4393 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4394 return;
4395 }
4396 }
4397
4398 // If client doesn't have capability to modify layout pools, then
4399 // only permit this request if the requested pool matches what the
4400 // file would have inherited anyway from its parent.
4401 if (default_layout != layout) {
4402 access |= MAY_SET_VXATTR;
4403 }
4404
4405 if (!layout.is_valid()) {
4406 dout(10) << " invalid initial file layout" << dendl;
f67539c2 4407 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4408 return;
4409 }
4410 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4411 dout(10) << " invalid data pool " << layout.pool_id << dendl;
f67539c2 4412 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4413 return;
4414 }
4415
c07f9fc5 4416 // created null dn.
7c673cae
FG
4417 CDir *dir = dn->get_dir();
4418 CInode *diri = dir->get_inode();
7c673cae
FG
4419 if (!check_access(mdr, diri, access))
4420 return;
7c673cae
FG
4421 if (!check_fragment_space(mdr, dir))
4422 return;
4423
9f95a23c
TL
4424 if (mdr->dn[0].size() == 1)
4425 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
7c673cae 4426
7c673cae 4427 // create inode.
f67539c2
TL
4428 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4429 req->head.args.open.mode | S_IFREG, &layout);
4430 ceph_assert(newi);
7c673cae
FG
4431
4432 // it's a file.
f67539c2 4433 dn->push_projected_linkage(newi);
7c673cae 4434
f67539c2
TL
4435 auto _inode = newi->_get_inode();
4436 _inode->version = dn->pre_dirty();
7c673cae 4437 if (layout.pool_id != mdcache->default_file_layout.pool_id)
f67539c2
TL
4438 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
4439 _inode->update_backtrace();
4440 _inode->rstat.rfiles = 1;
4441 _inode->accounted_rstat = _inode->rstat;
a8e16298
TL
4442
4443 SnapRealm *realm = diri->find_snaprealm();
11fdf7f2
TL
4444 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4445 ceph_assert(follows >= realm->get_newest_seq());
a8e16298
TL
4446
4447 ceph_assert(dn->first == follows+1);
f67539c2 4448 newi->first = dn->first;
a8e16298
TL
4449
4450 // do the open
f67539c2
TL
4451 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
4452 newi->authlock.set_state(LOCK_EXCL);
4453 newi->xattrlock.set_state(LOCK_EXCL);
a8e16298
TL
4454
4455 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
f67539c2
TL
4456 _inode->client_ranges[client].range.first = 0;
4457 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
4458 _inode->client_ranges[client].follows = follows;
4459 newi->mark_clientwriteable();
a8e16298 4460 cap->mark_clientwriteable();
7c673cae 4461 }
7c673cae
FG
4462
4463 // prepare finisher
4464 mdr->ls = mdlog->get_current_segment();
4465 EUpdate *le = new EUpdate(mdlog, "openc");
4466 mdlog->start_entry(le);
4467 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4468 journal_allocated_inos(mdr, &le->metablob);
f67539c2
TL
4469 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4470 le->metablob.add_primary_dentry(dn, newi, true, true, true);
7c673cae 4471
7c673cae 4472 // make sure this inode gets into the journal
f67539c2 4473 le->metablob.add_opened_ino(newi->ino());
7c673cae 4474
f67539c2 4475 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
7c673cae 4476
9f95a23c
TL
4477 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4478 openc_response_t ocresp;
4479
4480 dout(10) << "adding created_ino and delegated_inos" << dendl;
f67539c2 4481 ocresp.created_ino = _inode->ino;
9f95a23c
TL
4482
4483 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4484 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4485 unsigned frac = 100 / delegate_inos_pct;
4486 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4487 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4488 }
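  // Worked example with illustrative values: delegate_inos_pct = 50 and
  // mds_client_prealloc_inos = 1000 give frac = 2, so once the session
  // holds fewer than 1000/2/2 = 250 delegated inos it is handed another
  // 1000/2 = 500.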
4489
4490 encode(ocresp, mdr->reply_extra_bl);
4491 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
7c673cae
FG
4492 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4493 // add the file created flag onto the reply if create_flags features is supported
f67539c2 4494 encode(newi->ino(), mdr->reply_extra_bl);
7c673cae
FG
4495 }
4496
f67539c2 4497 journal_and_reply(mdr, newi, dn, le, fin);
7c673cae
FG
4498
4499 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4500 // have overshot the split size (multiple opencs in flight), so here is
4501 // an early chance to split the dir if this openc makes it oversized.
4502 mds->balancer->maybe_fragment(dir, false);
4503}
4504
4505
4506
4507void Server::handle_client_readdir(MDRequestRef& mdr)
4508{
9f95a23c 4509 const cref_t<MClientRequest> &req = mdr->client_request;
adb31ebb 4510 Session *session = mds->get_session(req);
7c673cae 4511 client_t client = req->get_source().num();
11fdf7f2 4512 MutationImpl::LockOpVec lov;
9f95a23c 4513 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
7c673cae
FG
4514 if (!diri) return;
4515
4516 // it's a directory, right?
4517 if (!diri->is_dir()) {
4518 // not a dir
f67539c2
TL
4519 dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
4520 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
4521 return;
4522 }
4523
adb31ebb
TL
4524 auto num_caps = session->get_num_caps();
4525 auto session_cap_acquisition = session->get_cap_acquisition();
4526
4527 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
4528 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
4529 << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
4530 if (logger)
4531 logger->inc(l_mdss_cap_acquisition_throttle);
4532
4533 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
4534 return;
4535 }
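  // Illustrative numbers: with max_caps_per_client = 1000000 and
  // max_caps_throttle_ratio = 0.8, a session already holding more than
  // 800000 caps that is still acquiring at or above
  // cap_acquisition_throttle has this readdir retried after
  // caps_throttle_retry_request_timeout rather than growing further.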
4536
11fdf7f2
TL
4537 lov.add_rdlock(&diri->filelock);
4538 lov.add_rdlock(&diri->dirfragtreelock);
7c673cae 4539
11fdf7f2 4540 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
4541 return;
4542
4543 if (!check_access(mdr, diri, MAY_READ))
4544 return;
4545
4546 // which frag?
4547 frag_t fg = (__u32)req->head.args.readdir.frag;
4548 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4549 string offset_str = req->get_path2();
4550
4551 __u32 offset_hash = 0;
4552 if (!offset_str.empty())
4553 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4554 else
4555 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4556
4557 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4558 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4559
4560 // does the frag exist?
4561 if (diri->dirfragtree[fg.value()] != fg) {
4562 frag_t newfg;
4563 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4564 if (fg.contains((unsigned)offset_hash)) {
4565 newfg = diri->dirfragtree[offset_hash];
4566 } else {
4567 // client actually wants next frag
4568 newfg = diri->dirfragtree[fg.value()];
4569 }
4570 } else {
4571 offset_str.clear();
4572 newfg = diri->dirfragtree[fg.value()];
4573 }
4574 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4575 fg = newfg;
4576 }
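  // dirfragtree[] maps a 32-bit name hash to the frag that currently
  // contains it, so the mismatch above means fg was split or merged since
  // the client obtained it; e.g. if 0* split into 00*/01*, a stale
  // readdir on 0* is redirected to whichever child now covers offset_hash.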
4577
4578 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4579 if (!dir) return;
4580
4581 // ok!
4582 dout(10) << "handle_client_readdir on " << *dir << dendl;
11fdf7f2 4583 ceph_assert(dir->is_auth());
7c673cae
FG
4584
4585 if (!dir->is_complete()) {
4586 if (dir->is_frozen()) {
4587 dout(7) << "dir is frozen " << *dir << dendl;
4588 mds->locker->drop_locks(mdr.get());
4589 mdr->drop_local_auth_pins();
4590 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4591 return;
4592 }
4593 // fetch
4594 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4595 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4596 return;
4597 }
4598
4599#ifdef MDS_VERIFY_FRAGSTAT
4600 dir->verify_fragstat();
4601#endif
4602
4603 utime_t now = ceph_clock_now();
4604 mdr->set_mds_stamp(now);
4605
4606 snapid_t snapid = mdr->snapid;
4607 dout(10) << "snapid " << snapid << dendl;
4608
4609 SnapRealm *realm = diri->find_snaprealm();
4610
4611 unsigned max = req->head.args.readdir.max_entries;
4612 if (!max)
4613 max = dir->get_num_any(); // whatever, something big.
4614 unsigned max_bytes = req->head.args.readdir.max_bytes;
4615 if (!max_bytes)
4616 // make sure at least one item can be encoded
11fdf7f2 4617 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
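  // The 512 KiB floor plus mds_max_xattr_pairs_size guarantees that even
  // a single entry carrying maximal xattrs can be encoded, so a zero
  // max_bytes from the client can never produce an empty reply.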
7c673cae
FG
4618
4619 // start final blob
4620 bufferlist dirbl;
11fdf7f2
TL
4621 DirStat ds;
4622 ds.frag = dir->get_frag();
4623 ds.auth = dir->get_dir_auth().first;
f91f0fd5 4624 if (dir->is_auth() && !forward_all_requests_to_auth)
11fdf7f2
TL
4625 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4626
4627 dir->encode_dirstat(dirbl, mdr->session->info, ds);
7c673cae
FG
4628
4629 // count bytes available.
4630 // this isn't perfect, but we should capture the main variable/unbounded size items!
4631 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4632 int bytes_left = max_bytes - front_bytes;
4633 bytes_left -= realm->get_snap_trace().length();
4634
4635 // build dir contents
4636 bufferlist dnbl;
4637 __u32 numfiles = 0;
4638 bool start = !offset_hash && offset_str.empty();
7c673cae
FG
4639 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4640 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
181888fb
FG
4641 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4642 bool end = (it == dir->end());
4643 for (; !end && numfiles < max; end = (it == dir->end())) {
7c673cae
FG
4644 CDentry *dn = it->second;
4645 ++it;
4646
4647 if (dn->state_test(CDentry::STATE_PURGING))
4648 continue;
4649
4650 bool dnp = dn->use_projected(client, mdr);
4651 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4652
4653 if (dnl->is_null())
4654 continue;
4655
4656 if (dn->last < snapid || dn->first > snapid) {
4657 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4658 continue;
4659 }
4660
4661 if (!start) {
4662 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4663 if (!(offset_key < dn->key()))
4664 continue;
4665 }
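      // Resume point: lower_bound(skip_key) placed the iterator at the
      // first dentry >= (snapid, offset_str, offset_hash); this comparison
      // then drops any dentry at or before the client's last-returned key,
      // so a continued readdir never repeats an entry.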
4666
4667 CInode *in = dnl->get_inode();
4668
4669 if (in && in->ino() == CEPH_INO_CEPH)
4670 continue;
4671
4672 // remote link?
4673 // better for the MDS to do the work if we think the client will stat any of these files.
4674 if (dnl->is_remote() && !in) {
4675 in = mdcache->get_inode(dnl->get_remote_ino());
4676 if (in) {
4677 dn->link_remote(dnl, in);
4678 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4679 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4680 continue;
4681 } else {
4682 // touch everything I _do_ have
94b18763
FG
4683 for (auto &p : *dir) {
4684 if (!p.second->get_linkage()->is_null())
4685 mdcache->lru.lru_touch(p.second);
4686 }
7c673cae
FG
4687
4688 // already issued caps and leases, reply immediately.
4689 if (dnbl.length() > 0) {
4690 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4691 dout(10) << " open remote dentry after caps were issued, stopping at "
4692 << dnbl.length() << " < " << bytes_left << dendl;
4693 break;
4694 }
4695
4696 mds->locker->drop_locks(mdr.get());
4697 mdr->drop_local_auth_pins();
4698 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4699 return;
4700 }
4701 }
11fdf7f2 4702 ceph_assert(in);
7c673cae 4703
94b18763 4704 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
7c673cae
FG
4705 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4706 break;
4707 }
4708
4709 unsigned start_len = dnbl.length();
4710
4711 // dentry
4712 dout(12) << "including dn " << *dn << dendl;
11fdf7f2 4713 encode(dn->get_name(), dnbl);
9f95a23c
TL
4714 int lease_mask = dnl->is_primary() ? CEPH_LEASE_PRIMARY_LINK : 0;
4715 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, dnbl);
7c673cae
FG
4716
4717 // inode
4718 dout(12) << "including inode " << *in << dendl;
4719 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4720 if (r < 0) {
4721 // chop off dn->name, lease
4722 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4723 bufferlist keep;
4724 keep.substr_of(dnbl, 0, start_len);
4725 dnbl.swap(keep);
4726 break;
4727 }
11fdf7f2 4728 ceph_assert(r >= 0);
7c673cae
FG
4729 numfiles++;
4730
4731 // touch dn
4732 mdcache->lru.lru_touch(dn);
4733 }
4734
adb31ebb
TL
4735 session->touch_readdir_cap(numfiles);
4736
7c673cae
FG
4737 __u16 flags = 0;
4738 if (end) {
4739 flags = CEPH_READDIR_FRAG_END;
4740 if (start)
4741 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4742 }
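  // CEPH_READDIR_FRAG_END marks the end of this fragment, while
  // CEPH_READDIR_FRAG_COMPLETE is only set when the sweep also began at
  // the start (start && end), i.e. the client saw every entry; presumably
  // that is what lets it cache the listing as complete.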
4743 // clients only understand the END and COMPLETE flags?
4744 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4745 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4746 }
4747
4748 // finish final blob
11fdf7f2
TL
4749 encode(numfiles, dirbl);
4750 encode(flags, dirbl);
7c673cae
FG
4751 dirbl.claim_append(dnbl);
4752
4753 // yay, reply
4754 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4755 << " bytes=" << dirbl.length()
4756 << " start=" << (int)start
4757 << " end=" << (int)end
4758 << dendl;
4759 mdr->reply_extra_bl = dirbl;
4760
4761 // bump popularity. NOTE: this doesn't quite capture it.
522d829b 4762 mds->balancer->hit_dir(dir, META_POP_READDIR, -1, numfiles);
7c673cae
FG
4763
4764 // reply
4765 mdr->tracei = diri;
4766 respond_to_request(mdr, 0);
4767}
4768
4769
4770
4771// ===============================================================================
4772// INODE UPDATES
4773
4774
4775/*
4776 * finisher for basic inode updates
4777 */
4778class C_MDS_inode_update_finish : public ServerLogContext {
4779 CInode *in;
adb31ebb 4780 bool truncating_smaller, changed_ranges, adjust_realm;
7c673cae
FG
4781public:
4782 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
adb31ebb 4783 bool sm=false, bool cr=false, bool ar=false) :
11fdf7f2 4784 ServerLogContext(s, r), in(i),
adb31ebb 4785 truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
7c673cae 4786 void finish(int r) override {
11fdf7f2 4787 ceph_assert(r == 0);
7c673cae 4788
adb31ebb
TL
4789 int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);
4790
7c673cae 4791 // apply
7c673cae
FG
4792 mdr->apply();
4793
11fdf7f2
TL
4794 MDSRank *mds = get_mds();
4795
7c673cae 4796 // notify any clients
f67539c2 4797 if (truncating_smaller && in->get_inode()->is_truncating()) {
11fdf7f2
TL
4798 mds->locker->issue_truncate(in);
4799 mds->mdcache->truncate_inode(in, mdr->ls);
4800 }
4801
adb31ebb
TL
4802 if (adjust_realm) {
4803 mds->mdcache->send_snap_update(in, 0, snap_op);
4804 mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
7c673cae
FG
4805 }
4806
11fdf7f2 4807 get_mds()->balancer->hit_inode(in, META_POP_IWR);
7c673cae
FG
4808
4809 server->respond_to_request(mdr, 0);
4810
4811 if (changed_ranges)
4812 get_mds()->locker->share_inode_max_size(in);
4813 }
4814};
4815
4816void Server::handle_client_file_setlock(MDRequestRef& mdr)
4817{
9f95a23c 4818 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 4819 MutationImpl::LockOpVec lov;
7c673cae
FG
4820
4821 // get the inode to operate on, and set up any locks needed for that
9f95a23c 4822 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4823 if (!cur)
4824 return;
4825
11fdf7f2 4826 lov.add_xlock(&cur->flocklock);
7c673cae
FG
4827 /* acquire_locks will return true if it gets the locks. If it fails,
4828 it will redeliver this request at a later date, so drop the request.
4829 */
11fdf7f2 4830 if (!mds->locker->acquire_locks(mdr, lov)) {
7c673cae
FG
4831 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4832 return;
4833 }
4834
4835 // copy the lock change into a ceph_filelock so we can store/apply it
4836 ceph_filelock set_lock;
4837 set_lock.start = req->head.args.filelock_change.start;
4838 set_lock.length = req->head.args.filelock_change.length;
4839 set_lock.client = req->get_orig_source().num();
4840 set_lock.owner = req->head.args.filelock_change.owner;
4841 set_lock.pid = req->head.args.filelock_change.pid;
4842 set_lock.type = req->head.args.filelock_change.type;
4843 bool will_wait = req->head.args.filelock_change.wait;
4844
4845 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4846
4847 ceph_lock_state_t *lock_state = NULL;
4848 bool interrupt = false;
4849
4850 // get the appropriate lock state
4851 switch (req->head.args.filelock_change.rule) {
4852 case CEPH_LOCK_FLOCK_INTR:
4853 interrupt = true;
4854 // fall-thru
4855 case CEPH_LOCK_FLOCK:
4856 lock_state = cur->get_flock_lock_state();
4857 break;
4858
4859 case CEPH_LOCK_FCNTL_INTR:
4860 interrupt = true;
4861 // fall-thru
4862 case CEPH_LOCK_FCNTL:
4863 lock_state = cur->get_fcntl_lock_state();
4864 break;
4865
4866 default:
4867 dout(10) << "got unknown lock type " << set_lock.type
4868 << ", dropping request!" << dendl;
f67539c2 4869 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
7c673cae
FG
4870 return;
4871 }
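  // The *_INTR rules cancel a blocked lock request: they set `interrupt`,
  // which below limits an unlock to removing the waiting entry instead of
  // releasing a lock that is actually held.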
4872
4873 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4874 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4875 list<ceph_filelock> activated_locks;
11fdf7f2 4876 MDSContext::vec waiters;
7c673cae
FG
4877 if (lock_state->is_waiting(set_lock)) {
4878 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4879 lock_state->remove_waiting(set_lock);
4880 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4881 } else if (!interrupt) {
4882 dout(10) << " unlock attempt on " << set_lock << dendl;
4883 lock_state->remove_lock(set_lock, activated_locks);
4884 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4885 }
4886 mds->queue_waiters(waiters);
4887
4888 respond_to_request(mdr, 0);
4889 } else {
4890 dout(10) << " lock attempt on " << set_lock << dendl;
4891 bool deadlock = false;
4892 if (mdr->more()->flock_was_waiting &&
4893 !lock_state->is_waiting(set_lock)) {
4894 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
f67539c2 4895 respond_to_request(mdr, -CEPHFS_EINTR);
7c673cae
FG
4896 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4897 dout(10) << " it failed on this attempt" << dendl;
4898 // couldn't set lock right now
4899 if (deadlock) {
f67539c2 4900 respond_to_request(mdr, -CEPHFS_EDEADLK);
7c673cae 4901 } else if (!will_wait) {
f67539c2 4902 respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
7c673cae
FG
4903 } else {
4904 dout(10) << " added to waiting list" << dendl;
11fdf7f2 4905 ceph_assert(lock_state->is_waiting(set_lock));
7c673cae
FG
4906 mdr->more()->flock_was_waiting = true;
4907 mds->locker->drop_locks(mdr.get());
4908 mdr->drop_local_auth_pins();
1adf2230
AA
4909 mdr->mark_event("failed to add lock, waiting");
4910 mdr->mark_nowarn();
7c673cae
FG
4911 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4912 }
4913 } else
4914 respond_to_request(mdr, 0);
4915 }
4916 dout(10) << " state after lock change: " << *lock_state << dendl;
4917}
4918
4919void Server::handle_client_file_readlock(MDRequestRef& mdr)
4920{
9f95a23c 4921 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 4922 MutationImpl::LockOpVec lov;
7c673cae
FG
4923
4924 // get the inode to operate on, and set up any locks needed for that
9f95a23c 4925 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4926 if (!cur)
4927 return;
4928
4929 /* acquire_locks will return true if it gets the locks. If it fails,
4930 it will redeliver this request at a later date, so drop the request.
4931 */
11fdf7f2
TL
4932 lov.add_rdlock(&cur->flocklock);
4933 if (!mds->locker->acquire_locks(mdr, lov)) {
7c673cae
FG
4934 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4935 return;
4936 }
4937
4938 // copy the lock change into a ceph_filelock so we can store/apply it
4939 ceph_filelock checking_lock;
4940 checking_lock.start = req->head.args.filelock_change.start;
4941 checking_lock.length = req->head.args.filelock_change.length;
4942 checking_lock.client = req->get_orig_source().num();
4943 checking_lock.owner = req->head.args.filelock_change.owner;
4944 checking_lock.pid = req->head.args.filelock_change.pid;
4945 checking_lock.type = req->head.args.filelock_change.type;
4946
4947 // get the appropriate lock state
4948 ceph_lock_state_t *lock_state = NULL;
4949 switch (req->head.args.filelock_change.rule) {
4950 case CEPH_LOCK_FLOCK:
4951 lock_state = cur->get_flock_lock_state();
4952 break;
4953
4954 case CEPH_LOCK_FCNTL:
4955 lock_state = cur->get_fcntl_lock_state();
4956 break;
4957
4958 default:
4959 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
f67539c2 4960 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4961 return;
4962 }
4963 lock_state->look_for_lock(checking_lock);
4964
4965 bufferlist lock_bl;
11fdf7f2 4966 encode(checking_lock, lock_bl);
7c673cae
FG
4967
4968 mdr->reply_extra_bl = lock_bl;
4969 respond_to_request(mdr, 0);
4970}
4971
4972void Server::handle_client_setattr(MDRequestRef& mdr)
4973{
9f95a23c 4974 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 4975 MutationImpl::LockOpVec lov;
9f95a23c 4976 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4977 if (!cur) return;
4978
4979 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 4980 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
4981 return;
4982 }
4983 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
f67539c2 4984 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
4985 return;
4986 }
4987
4988 __u32 mask = req->head.args.setattr.mask;
4989 __u32 access_mask = MAY_WRITE;
4990
4991 // xlock inode
4992 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
11fdf7f2 4993 lov.add_xlock(&cur->authlock);
7c673cae 4994 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
11fdf7f2 4995 lov.add_xlock(&cur->filelock);
7c673cae 4996 if (mask & CEPH_SETATTR_CTIME)
11fdf7f2 4997 lov.add_wrlock(&cur->versionlock);
7c673cae 4998
11fdf7f2 4999 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5000 return;
5001
f67539c2 5002 if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
7c673cae
FG
5003 access_mask |= MAY_CHOWN;
5004
f67539c2 5005 if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
7c673cae
FG
5006 access_mask |= MAY_CHGRP;
5007
5008 if (!check_access(mdr, cur, access_mask))
5009 return;
5010
5011 // trunc from bigger -> smaller?
f67539c2 5012 const auto& pip = cur->get_projected_inode();
7c673cae 5013
94b18763 5014 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
7c673cae 5015
f67539c2 5016 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
7c673cae 5017 if (is_full && req->head.args.setattr.size > old_size) {
f67539c2
TL
5018 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
5019 respond_to_request(mdr, -CEPHFS_ENOSPC);
7c673cae
FG
5020 return;
5021 }
5022
5023 bool truncating_smaller = false;
5024 if (mask & CEPH_SETATTR_SIZE) {
5025 truncating_smaller = req->head.args.setattr.size < old_size;
94b18763
FG
5026 if (truncating_smaller && pip->is_truncating()) {
5027 dout(10) << " waiting for pending truncate from " << pip->truncate_from
5028 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
7c673cae
FG
5029 mds->locker->drop_locks(mdr.get());
5030 mdr->drop_local_auth_pins();
5031 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
5032 return;
5033 }
5034 }
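  // Only one truncate can be in flight per inode: a second shrink drops
  // its locks and auth pins and parks on WAIT_TRUNC until the pending
  // truncate from truncate_from down to truncate_size commits.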
5035
5036 bool changed_ranges = false;
5037
5038 // project update
5039 mdr->ls = mdlog->get_current_segment();
5040 EUpdate *le = new EUpdate(mdlog, "setattr");
5041 mdlog->start_entry(le);
5042
f67539c2 5043 auto pi = cur->project_inode(mdr);
7c673cae
FG
5044
5045 if (mask & CEPH_SETATTR_UID)
f67539c2 5046 pi.inode->uid = req->head.args.setattr.uid;
7c673cae 5047 if (mask & CEPH_SETATTR_GID)
f67539c2 5048 pi.inode->gid = req->head.args.setattr.gid;
7c673cae
FG
5049
5050 if (mask & CEPH_SETATTR_MODE)
f67539c2 5051 pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
7c673cae 5052 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
f67539c2
TL
5053 S_ISREG(pi.inode->mode) &&
5054 (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
5055 pi.inode->mode &= ~(S_ISUID|S_ISGID);
7c673cae
FG
5056 }
5057
5058 if (mask & CEPH_SETATTR_MTIME)
f67539c2 5059 pi.inode->mtime = req->head.args.setattr.mtime;
7c673cae 5060 if (mask & CEPH_SETATTR_ATIME)
f67539c2 5061 pi.inode->atime = req->head.args.setattr.atime;
7c673cae 5062 if (mask & CEPH_SETATTR_BTIME)
f67539c2 5063 pi.inode->btime = req->head.args.setattr.btime;
7c673cae 5064 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
f67539c2 5065 pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
7c673cae
FG
5066 if (mask & CEPH_SETATTR_SIZE) {
5067 if (truncating_smaller) {
f67539c2 5068 pi.inode->truncate(old_size, req->head.args.setattr.size);
7c673cae
FG
5069 le->metablob.add_truncate_start(cur->ino());
5070 } else {
f67539c2
TL
5071 pi.inode->size = req->head.args.setattr.size;
5072 pi.inode->rstat.rbytes = pi.inode->size;
7c673cae 5073 }
f67539c2 5074 pi.inode->mtime = mdr->get_op_stamp();
7c673cae
FG
5075
5076 // adjust client's max_size?
f67539c2 5077 if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
f91f0fd5 5078 dout(10) << " client_ranges " << cur->get_previous_projected_inode()->client_ranges
f67539c2 5079 << " -> " << pi.inode->client_ranges << dendl;
7c673cae
FG
5080 changed_ranges = true;
5081 }
5082 }
5083
f67539c2
TL
5084 pi.inode->version = cur->pre_dirty();
5085 pi.inode->ctime = mdr->get_op_stamp();
5086 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5087 pi.inode->rstat.rctime = mdr->get_op_stamp();
5088 pi.inode->change_attr++;
7c673cae
FG
5089
5090 // log + wait
5091 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5092 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5093 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5094
5095 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5096 truncating_smaller, changed_ranges));
5097
5098 // flush immediately if there are readers/writers waiting
11fdf7f2 5099 if (mdr->is_xlocked(&cur->filelock) &&
7c673cae
FG
5100 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5101 mds->mdlog->flush();
5102}
5103
5104/* Takes responsibility for mdr */
5105void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5106{
5107 CInode *in = mdr->in[0];
5108 client_t client = mdr->get_client();
11fdf7f2 5109 ceph_assert(in);
7c673cae
FG
5110
5111 dout(10) << "do_open_truncate " << *in << dendl;
5112
5113 SnapRealm *realm = in->find_snaprealm();
9f95a23c 5114 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
7c673cae
FG
5115
5116 mdr->ls = mdlog->get_current_segment();
5117 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5118 mdlog->start_entry(le);
5119
5120 // prepare
f67539c2
TL
5121 auto pi = in->project_inode(mdr);
5122 pi.inode->version = in->pre_dirty();
5123 pi.inode->mtime = pi.inode->ctime = mdr->get_op_stamp();
5124 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5125 pi.inode->rstat.rctime = mdr->get_op_stamp();
5126 pi.inode->change_attr++;
5127
5128 uint64_t old_size = std::max<uint64_t>(pi.inode->size, mdr->client_request->head.args.open.old_size);
7c673cae 5129 if (old_size > 0) {
f67539c2 5130 pi.inode->truncate(old_size, 0);
7c673cae
FG
5131 le->metablob.add_truncate_start(in->ino());
5132 }
5133
5134 bool changed_ranges = false;
a8e16298 5135 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
f67539c2
TL
5136 pi.inode->client_ranges[client].range.first = 0;
5137 pi.inode->client_ranges[client].range.last = pi.inode->get_layout_size_increment();
5138 pi.inode->client_ranges[client].follows = realm->get_newest_seq();
7c673cae 5139 changed_ranges = true;
f91f0fd5 5140 in->mark_clientwriteable();
a8e16298 5141 cap->mark_clientwriteable();
7c673cae
FG
5142 }
5143
5144 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5145
5146 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5147 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5148
5149 // make sure ino gets into the journal
5150 le->metablob.add_opened_ino(in->ino());
7c673cae
FG
5151
5152 mdr->o_trunc = true;
5153
5154 CDentry *dn = 0;
5155 if (mdr->client_request->get_dentry_wanted()) {
11fdf7f2 5156 ceph_assert(mdr->dn[0].size());
7c673cae
FG
5157 dn = mdr->dn[0].back();
5158 }
5159
5160 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5161 changed_ranges));
5162 // Although the `open` part can give an early reply, the truncation won't
5163 // happen until our EUpdate is persistent; to give the client a prompt
5164 // response we must also flush that event.
5165 mdlog->flush();
5166}
5167
5168
5169/* This function cleans up the passed mdr */
5170void Server::handle_client_setlayout(MDRequestRef& mdr)
5171{
9f95a23c
TL
5172 const cref_t<MClientRequest> &req = mdr->client_request;
5173 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
5174 if (!cur) return;
5175
5176 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 5177 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
5178 return;
5179 }
5180 if (!cur->is_file()) {
f67539c2 5181 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5182 return;
5183 }
5184 if (cur->get_projected_inode()->size ||
5185 cur->get_projected_inode()->truncate_seq > 1) {
f67539c2 5186 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
5187 return;
5188 }
5189
5190 // validate layout
5191 file_layout_t layout = cur->get_projected_inode()->layout;
5192 // save existing layout for later
5193 const auto old_layout = layout;
5194
5195 int access = MAY_WRITE;
5196
5197 if (req->head.args.setlayout.layout.fl_object_size > 0)
5198 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5199 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5200 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5201 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5202 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5203 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5204 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5205
5206 // make sure we have as new a map as the client
5207 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5208 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5209 return;
5210 }
5211 }
5212
5213 // Don't permit layout modifications without 'p' caps
5214 if (layout != old_layout) {
5215 access |= MAY_SET_VXATTR;
5216 }
5217
5218 if (!layout.is_valid()) {
5219 dout(10) << "bad layout" << dendl;
f67539c2 5220 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5221 return;
5222 }
5223 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5224 dout(10) << " invalid data pool " << layout.pool_id << dendl;
f67539c2 5225 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5226 return;
5227 }
5228
9f95a23c 5229 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5230 lov.add_xlock(&cur->filelock);
5231 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5232 return;
5233
5234 if (!check_access(mdr, cur, access))
5235 return;
5236
5237 // project update
f67539c2
TL
5238 auto pi = cur->project_inode(mdr);
5239 pi.inode->layout = layout;
7c673cae 5240 // add the old pool to the inode
f67539c2
TL
5241 pi.inode->add_old_pool(old_layout.pool_id);
5242 pi.inode->version = cur->pre_dirty();
5243 pi.inode->ctime = mdr->get_op_stamp();
5244 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5245 pi.inode->rstat.rctime = mdr->get_op_stamp();
5246 pi.inode->change_attr++;
7c673cae
FG
5247
5248 // log + wait
5249 mdr->ls = mdlog->get_current_segment();
5250 EUpdate *le = new EUpdate(mdlog, "setlayout");
5251 mdlog->start_entry(le);
5252 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5253 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5254 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5255
5256 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5257}
5258
9f95a23c 5259bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
7c673cae 5260{
9f95a23c
TL
5261 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5262 return true;
5263
11fdf7f2 5264 MutationImpl::LockOpVec lov;
9f95a23c
TL
5265 lov.add_xlock(&in->policylock);
5266 if (xlock_snaplock)
5267 lov.add_xlock(&in->snaplock);
5268 else
5269 lov.add_rdlock(&in->snaplock);
5270 if (!mds->locker->acquire_locks(mdr, lov))
5271 return false;
7c673cae 5272
9f95a23c
TL
5273 if (want_layout && in->get_projected_inode()->has_layout()) {
5274 mdr->dir_layout = in->get_projected_inode()->layout;
5275 want_layout = false;
5276 }
5277 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5278 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5279 return false;
7c673cae
FG
5280 }
5281
9f95a23c
TL
5282 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5283 return true;
5284}
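// policylock serializes layout/quota/pin policy changes; snaplock is
// normally only rdlocked here, and callers about to create or modify a
// snaprealm (the quota and subvolume vxattrs) pass xlock_snaplock to
// take it exclusively instead.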
5285
5286CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5287{
5288 CInode *in = mdcache->get_inode(ino);
5289 if (!in || in->state_test(CInode::STATE_PURGING)) {
f67539c2 5290 respond_to_request(mdr, -CEPHFS_ESTALE);
9f95a23c
TL
5291 return nullptr;
5292 }
5293 if (!in->is_auth()) {
5294 mdcache->request_forward(mdr, in->authority().first);
5295 return nullptr;
5296 }
5297
5298 return in;
5299}
5300
5301void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5302{
5303 const cref_t<MClientRequest> &req = mdr->client_request;
5304
5305 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5306 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5307 if (!cur)
5308 return;
5309
7c673cae 5310 if (!cur->is_dir()) {
f67539c2 5311 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
5312 return;
5313 }
5314
9f95a23c 5315 if (!xlock_policylock(mdr, cur, true))
7c673cae
FG
5316 return;
5317
5318 // validate layout
f67539c2 5319 const auto& old_pi = cur->get_projected_inode();
7c673cae
FG
5320 file_layout_t layout;
5321 if (old_pi->has_layout())
5322 layout = old_pi->layout;
9f95a23c
TL
5323 else if (mdr->dir_layout != file_layout_t())
5324 layout = mdr->dir_layout;
7c673cae
FG
5325 else
5326 layout = mdcache->default_file_layout;
5327
5328 // Level of access required to complete
5329 int access = MAY_WRITE;
5330
5331 const auto old_layout = layout;
5332
5333 if (req->head.args.setlayout.layout.fl_object_size > 0)
5334 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5335 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5336 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5337 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5338 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5339 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5340 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5341 // make sure we have as new a map as the client
5342 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5343 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5344 return;
5345 }
5346 }
5347
5348 if (layout != old_layout) {
5349 access |= MAY_SET_VXATTR;
5350 }
5351
5352 if (!layout.is_valid()) {
5353 dout(10) << "bad layout" << dendl;
f67539c2 5354 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5355 return;
5356 }
5357 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5358 dout(10) << " invalid data pool " << layout.pool_id << dendl;
f67539c2 5359 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5360 return;
5361 }
5362
5363 if (!check_access(mdr, cur, access))
5364 return;
5365
f67539c2
TL
5366 auto pi = cur->project_inode(mdr);
5367 pi.inode->layout = layout;
5368 pi.inode->version = cur->pre_dirty();
7c673cae
FG
5369
5370 // log + wait
5371 mdr->ls = mdlog->get_current_segment();
5372 EUpdate *le = new EUpdate(mdlog, "setlayout");
5373 mdlog->start_entry(le);
5374 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5375 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5376 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5377
b32b8144 5378 mdr->no_early_reply = true;
7c673cae
FG
5379 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5380}
5381
5382// XATTRS
5383
5384int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5385 file_layout_t *layout, bool validate)
5386{
5387 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
5388 try {
5389 if (name == "layout") {
5390 string::iterator begin = value.begin();
5391 string::iterator end = value.end();
5392 keys_and_values<string::iterator> p; // create instance of parser
5393 std::map<string, string> m; // map to receive results
5394 if (!qi::parse(begin, end, p, m)) { // returns true if successful
f67539c2 5395 return -CEPHFS_EINVAL;
7c673cae
FG
5396 }
5397 string left(begin, end);
5398 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5399 if (begin != end)
f67539c2 5400 return -CEPHFS_EINVAL;
7c673cae
FG
5401 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5402 // Skip validation on each attr, we do it once at the end (avoid
5403 // rejecting intermediate states if the overall result is ok)
5404 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
5405 osdmap, layout, false);
5406 if (r < 0)
5407 return r;
5408 }
5409 } else if (name == "layout.object_size") {
5410 layout->object_size = boost::lexical_cast<unsigned>(value);
5411 } else if (name == "layout.stripe_unit") {
5412 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5413 } else if (name == "layout.stripe_count") {
5414 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5415 } else if (name == "layout.pool") {
5416 try {
5417 layout->pool_id = boost::lexical_cast<unsigned>(value);
5418 } catch (boost::bad_lexical_cast const&) {
5419 int64_t pool = osdmap.lookup_pg_pool_name(value);
5420 if (pool < 0) {
5421 dout(10) << " unknown pool " << value << dendl;
f67539c2 5422 return -CEPHFS_ENOENT;
7c673cae
FG
5423 }
5424 layout->pool_id = pool;
5425 }
5426 } else if (name == "layout.pool_namespace") {
5427 layout->pool_ns = value;
5428 } else {
5429 dout(10) << " unknown layout vxattr " << name << dendl;
f67539c2 5430 return -CEPHFS_EINVAL;
7c673cae
FG
5431 }
5432 } catch (boost::bad_lexical_cast const&) {
5433 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
f67539c2 5434 return -CEPHFS_EINVAL;
7c673cae
FG
5435 }
5436
5437 if (validate && !layout->is_valid()) {
5438 dout(10) << "bad layout" << dendl;
f67539c2 5439 return -CEPHFS_EINVAL;
7c673cae
FG
5440 }
5441 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5442 dout(10) << " invalid data pool " << layout->pool_id << dendl;
f67539c2 5443 return -CEPHFS_EINVAL;
7c673cae
FG
5444 }
5445 return 0;
5446}
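// Usage sketch (mirrors documented CephFS vxattrs; the command below is
// client-side, not code from this file): `setfattr -n
// ceph.file.layout.stripe_unit -v 1048576 file` arrives here as name
// "layout.stripe_unit" and value "1048576", the "ceph.file." prefix
// having been stripped by handle_set_vxattr() before the call.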
5447
5448int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5449{
5450 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5451 try {
5452 if (name == "quota") {
5453 string::iterator begin = value.begin();
5454 string::iterator end = value.end();
11fdf7f2
TL
5455 if (begin == end) {
5456 // keep quota unchanged. (for create_quota_realm())
5457 return 0;
5458 }
7c673cae
FG
5459 keys_and_values<string::iterator> p; // create instance of parser
5460 std::map<string, string> m; // map to receive results
5461 if (!qi::parse(begin, end, p, m)) { // returns true if successful
f67539c2 5462 return -CEPHFS_EINVAL;
7c673cae
FG
5463 }
5464 string left(begin, end);
5465 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5466 if (begin != end)
f67539c2 5467 return -CEPHFS_EINVAL;
7c673cae
FG
5468 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5469 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5470 if (r < 0)
5471 return r;
5472 }
5473 } else if (name == "quota.max_bytes") {
5474 int64_t q = boost::lexical_cast<int64_t>(value);
5475 if (q < 0)
f67539c2 5476 return -CEPHFS_EINVAL;
7c673cae
FG
5477 quota->max_bytes = q;
5478 } else if (name == "quota.max_files") {
5479 int64_t q = boost::lexical_cast<int64_t>(value);
5480 if (q < 0)
f67539c2 5481 return -CEPHFS_EINVAL;
7c673cae
FG
5482 quota->max_files = q;
5483 } else {
5484 dout(10) << " unknown quota vxattr " << name << dendl;
f67539c2 5485 return -CEPHFS_EINVAL;
7c673cae
FG
5486 }
5487 } catch (boost::bad_lexical_cast const&) {
5488 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
f67539c2 5489 return -CEPHFS_EINVAL;
7c673cae
FG
5490 }
5491
5492 if (!quota->is_valid()) {
5493 dout(10) << "bad quota" << dendl;
f67539c2 5494 return -CEPHFS_EINVAL;
7c673cae
FG
5495 }
5496 return 0;
5497}
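// Usage sketch (documented quota vxattrs; the command is client-side):
// `setfattr -n ceph.quota.max_bytes -v 100000000 /some/dir` reaches this
// parser as name "quota.max_bytes" and value "100000000"; a value of 0
// is accepted and means the limit is not enforced.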
5498
11fdf7f2
TL
5499void Server::create_quota_realm(CInode *in)
5500{
5501 dout(10) << __func__ << " " << *in << dendl;
5502
9f95a23c 5503 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
11fdf7f2
TL
5504 req->set_filepath(filepath(in->ino()));
5505 req->set_string2("ceph.quota");
5506 // empty vxattr value
5507 req->set_tid(mds->issue_tid());
5508
5509 mds->send_message_mds(req, in->authority().first);
5510}
5511
7c673cae
FG
5512/*
5513 * Verify that the file layout attribute carried by the client
5514 * is well-formatted.
5515 * Return 0 on success, otherwise this function takes
5516 * responsibility for the passed mdr.
5517 */
5518int Server::check_layout_vxattr(MDRequestRef& mdr,
5519 string name,
5520 string value,
5521 file_layout_t *layout)
5522{
9f95a23c 5523 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
5524 epoch_t epoch;
5525 int r;
5526
5527 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5528 r = parse_layout_vxattr(name, value, osdmap, layout);
5529 epoch = osdmap.get_epoch();
5530 });
5531
f67539c2 5532 if (r == -CEPHFS_ENOENT) {
7c673cae
FG
5533
5534 // we don't have the specified pool, make sure our map
5535 // is newer than or as new as the client.
5536 epoch_t req_epoch = req->get_osdmap_epoch();
5537
5538 if (req_epoch > epoch) {
5539
5540 // well, our map is older. consult mds.
f67539c2 5541 auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
7c673cae 5542
f67539c2
TL
5543 mds->objecter->wait_for_map(req_epoch, lambdafy(fin));
5544 return r;
7c673cae
FG
5545 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5546
5547 // For compatibility with clients running old code, we still need to
5548 // get the latest map. One day, when COMPACT_VERSION of MClientRequest
5549 // is >= 3, we can remove this code.
5550 mdr->waited_for_osdmap = true;
f67539c2
TL
5551 mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5552 mds, new C_MDS_RetryRequest(mdcache, mdr))));
7c673cae
FG
5553 return r;
5554 }
5555 }
5556
5557 if (r < 0) {
5558
f67539c2
TL
5559 if (r == -CEPHFS_ENOENT)
5560 r = -CEPHFS_EINVAL;
7c673cae
FG
5561
5562 respond_to_request(mdr, r);
5563 return r;
5564 }
5565
5566 // all is well
5567 return 0;
5568}
5569
9f95a23c 5570void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
7c673cae 5571{
9f95a23c 5572 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
5573 string name(req->get_path2());
5574 bufferlist bl = req->get_data();
5575 string value (bl.c_str(), bl.length());
5576 dout(10) << "handle_set_vxattr " << name
5577 << " val " << value.length()
5578 << " bytes on " << *cur
5579 << dendl;
5580
94b18763 5581 CInode::mempool_inode *pip = nullptr;
7c673cae
FG
5582 string rest;
5583
5584 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5585 return;
5586 }
5587
adb31ebb 5588 bool adjust_realm = false;
7c673cae
FG
5589 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5590 if (!cur->is_dir()) {
f67539c2 5591 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5592 return;
5593 }
5594
9f95a23c
TL
5595 if (!xlock_policylock(mdr, cur, true))
5596 return;
5597
7c673cae
FG
5598 file_layout_t layout;
5599 if (cur->get_projected_inode()->has_layout())
5600 layout = cur->get_projected_inode()->layout;
9f95a23c
TL
5601 else if (mdr->dir_layout != file_layout_t())
5602 layout = mdr->dir_layout;
7c673cae
FG
5603 else
5604 layout = mdcache->default_file_layout;
5605
5606 rest = name.substr(name.find("layout"));
5607 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5608 return;
5609
f67539c2
TL
5610 auto pi = cur->project_inode(mdr);
5611 pi.inode->layout = layout;
b32b8144 5612 mdr->no_early_reply = true;
f67539c2 5613 pip = pi.inode.get();
7c673cae
FG
5614 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5615 if (!cur->is_file()) {
f67539c2 5616 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5617 return;
5618 }
5619 if (cur->get_projected_inode()->size ||
5620 cur->get_projected_inode()->truncate_seq > 1) {
f67539c2 5621 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
5622 return;
5623 }
5624 file_layout_t layout = cur->get_projected_inode()->layout;
5625 rest = name.substr(name.find("layout"));
5626 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5627 return;
5628
9f95a23c 5629 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5630 lov.add_xlock(&cur->filelock);
5631 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5632 return;
5633
f67539c2
TL
5634 auto pi = cur->project_inode(mdr);
5635 int64_t old_pool = pi.inode->layout.pool_id;
5636 pi.inode->add_old_pool(old_pool);
5637 pi.inode->layout = layout;
5638 pip = pi.inode.get();
7c673cae 5639 } else if (name.compare(0, 10, "ceph.quota") == 0) {
f67539c2
TL
5640 if (!cur->is_dir()) {
5641 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5642 return;
5643 }
5644
5645 quota_info_t quota = cur->get_projected_inode()->quota;
5646
5647 rest = name.substr(name.find("quota"));
5648 int r = parse_quota_vxattr(rest, value, &quota);
5649 if (r < 0) {
5650 respond_to_request(mdr, r);
5651 return;
5652 }
5653
9f95a23c 5654 if (quota.is_enable() && !cur->get_projected_srnode())
adb31ebb
TL
5655 adjust_realm = true;
5656
5657 if (!xlock_policylock(mdr, cur, false, adjust_realm))
5658 return;
11fdf7f2 5659
adb31ebb
TL
5660 if (cur->get_projected_inode()->quota == quota) {
5661 respond_to_request(mdr, 0);
7c673cae 5662 return;
adb31ebb 5663 }
7c673cae 5664
f67539c2
TL
5665 auto pi = cur->project_inode(mdr, false, adjust_realm);
5666 pi.inode->quota = quota;
94b18763 5667
adb31ebb
TL
5668 if (adjust_realm)
5669 pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
5670
b32b8144 5671 mdr->no_early_reply = true;
f67539c2 5672 pip = pi.inode.get();
28e407b8
AA
5673
5674 client_t exclude_ct = mdr->get_client();
a8e16298 5675 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
adb31ebb
TL
5676 } else if (name == "ceph.dir.subvolume"sv) {
5677 if (!cur->is_dir()) {
f67539c2 5678 respond_to_request(mdr, -CEPHFS_EINVAL);
adb31ebb
TL
5679 return;
5680 }
5681
5682 bool val;
5683 try {
5684 val = boost::lexical_cast<bool>(value);
5685 } catch (boost::bad_lexical_cast const&) {
5686 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
f67539c2 5687 respond_to_request(mdr, -CEPHFS_EINVAL);
adb31ebb
TL
5688 return;
5689 }
5690
b3b6e05e
TL
5691 /* First verify it's not already a subvolume, using a lighter-weight
5692 * rdlock.
5693 */
5694 if (!mdr->more()->rdonly_checks) {
5695 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
5696 MutationImpl::LockOpVec lov;
5697 lov.add_rdlock(&cur->snaplock);
5698 if (!mds->locker->acquire_locks(mdr, lov))
5699 return;
5700 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5701 }
5702 SnapRealm *realm = cur->find_snaprealm();
5703 const auto srnode = cur->get_projected_srnode();
5704 if (val == (srnode && srnode->is_subvolume())) {
5705 dout(20) << "already marked subvolume" << dendl;
5706 respond_to_request(mdr, 0);
5707 return;
5708 }
5709 mdr->more()->rdonly_checks = true;
5710 }
5711
5712 if ((mdr->locking_state & MutationImpl::ALL_LOCKED) && !mdr->is_xlocked(&cur->snaplock)) {
5713 /* drop the rdlock and acquire xlocks */
5714 dout(20) << "dropping rdlocks" << dendl;
5715 mds->locker->drop_locks(mdr.get());
5716 if (!xlock_policylock(mdr, cur, false, true))
5717 return;
5718 }
adb31ebb 5719
b3b6e05e 5720 /* repeat rdonly checks in case changed between rdlock -> xlock */
adb31ebb
TL
5721 SnapRealm *realm = cur->find_snaprealm();
5722 if (val) {
5723 inodeno_t subvol_ino = realm->get_subvolume_ino();
5724 // can't create subvolume inside another subvolume
5725 if (subvol_ino && subvol_ino != cur->ino()) {
f67539c2 5726 respond_to_request(mdr, -CEPHFS_EINVAL);
adb31ebb
TL
5727 return;
5728 }
5729 }
5730
5731 const auto srnode = cur->get_projected_srnode();
5732 if (val == (srnode && srnode->is_subvolume())) {
5733 respond_to_request(mdr, 0);
5734 return;
5735 }
5736
f67539c2 5737 auto pi = cur->project_inode(mdr, false, true);
adb31ebb
TL
5738 if (!srnode)
5739 pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
5740 if (val)
5741 pi.snapnode->mark_subvolume();
5742 else
5743 pi.snapnode->clear_subvolume();
5744
5745 mdr->no_early_reply = true;
f67539c2 5746 pip = pi.inode.get();
adb31ebb 5747 adjust_realm = true;
f6b5b4d7 5748 } else if (name == "ceph.dir.pin"sv) {
7c673cae 5749 if (!cur->is_dir() || cur->is_root()) {
f67539c2 5750 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5751 return;
5752 }
5753
5754 mds_rank_t rank;
5755 try {
5756 rank = boost::lexical_cast<mds_rank_t>(value);
5757 if (rank < 0) rank = MDS_RANK_NONE;
5758 } catch (boost::bad_lexical_cast const&) {
5759 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
f67539c2 5760 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5761 return;
5762 }
5763
9f95a23c 5764 if (!xlock_policylock(mdr, cur))
7c673cae
FG
5765 return;
5766
f67539c2 5767 auto pi = cur->project_inode(mdr);
7c673cae 5768 cur->set_export_pin(rank);
f67539c2 5769 pip = pi.inode.get();
f6b5b4d7
TL
5770 } else if (name == "ceph.dir.pin.random"sv) {
5771 if (!cur->is_dir() || cur->is_root()) {
f67539c2 5772 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5773 return;
5774 }
5775
5776 double val;
5777 try {
5778 val = boost::lexical_cast<double>(value);
5779 } catch (boost::bad_lexical_cast const&) {
5780 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
f67539c2 5781 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5782 return;
5783 }
5784
5785 if (val < 0.0 || 1.0 < val) {
f67539c2 5786 respond_to_request(mdr, -CEPHFS_EDOM);
f6b5b4d7
TL
5787 return;
5788 } else if (mdcache->export_ephemeral_random_max < val) {
f67539c2 5789 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5790 return;
5791 }
5792
5793 if (!xlock_policylock(mdr, cur))
5794 return;
5795
f67539c2 5796 auto pi = cur->project_inode(mdr);
f6b5b4d7 5797 cur->setxattr_ephemeral_rand(val);
f67539c2 5798 pip = pi.inode.get();
f6b5b4d7
TL
5799 } else if (name == "ceph.dir.pin.distributed"sv) {
5800 if (!cur->is_dir() || cur->is_root()) {
f67539c2 5801 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5802 return;
5803 }
5804
5805 bool val;
5806 try {
5807 val = boost::lexical_cast<bool>(value);
5808 } catch (boost::bad_lexical_cast const&) {
5809 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
f67539c2 5810 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5811 return;
5812 }
5813
5814 if (!xlock_policylock(mdr, cur))
5815 return;
5816
f67539c2 5817 auto pi = cur->project_inode(mdr);
f6b5b4d7 5818 cur->setxattr_ephemeral_dist(val);
f67539c2 5819 pip = pi.inode.get();
7c673cae
FG
5820 } else {
5821 dout(10) << " unknown vxattr " << name << dendl;
f67539c2 5822 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5823 return;
5824 }
5825
94b18763 5826 pip->change_attr++;
91327a77
AA
5827 pip->ctime = mdr->get_op_stamp();
5828 if (mdr->get_op_stamp() > pip->rstat.rctime)
5829 pip->rstat.rctime = mdr->get_op_stamp();
94b18763 5830 pip->version = cur->pre_dirty();
7c673cae 5831 if (cur->is_file())
94b18763 5832 pip->update_backtrace();
7c673cae
FG
5833
5834 // log + wait
5835 mdr->ls = mdlog->get_current_segment();
5836 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
5837 mdlog->start_entry(le);
5838 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5839 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5840 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5841
11fdf7f2 5842 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
adb31ebb 5843 false, false, adjust_realm));
7c673cae
FG
5844 return;
5845}
5846
9f95a23c 5847void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
7c673cae 5848{
9f95a23c 5849 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
5850 string name(req->get_path2());
5851
5852 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
5853
5854 if (name == "ceph.dir.layout") {
5855 if (!cur->is_dir()) {
f67539c2 5856 respond_to_request(mdr, -CEPHFS_ENODATA);
7c673cae
FG
5857 return;
5858 }
5859 if (cur->is_root()) {
5860 dout(10) << "can't remove layout policy on the root directory" << dendl;
f67539c2 5861 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5862 return;
5863 }
5864
5865 if (!cur->get_projected_inode()->has_layout()) {
f67539c2 5866 respond_to_request(mdr, -CEPHFS_ENODATA);
7c673cae
FG
5867 return;
5868 }
5869
9f95a23c 5870 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5871 lov.add_xlock(&cur->policylock);
5872 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5873 return;
5874
f67539c2
TL
5875 auto pi = cur->project_inode(mdr);
5876 pi.inode->clear_layout();
5877 pi.inode->version = cur->pre_dirty();
7c673cae
FG
5878
5879 // log + wait
5880 mdr->ls = mdlog->get_current_segment();
5881 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
5882 mdlog->start_entry(le);
5883 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5884 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5885 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5886
b32b8144 5887 mdr->no_early_reply = true;
7c673cae
FG
5888 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5889 return;
5890 } else if (name == "ceph.dir.layout.pool_namespace"
5891 || name == "ceph.file.layout.pool_namespace") {
5892 // Namespace is the only layout field that has a meaningful
5893 // null/none value (an empty string means the default layout). Removing
5894 // it is equivalent to a setxattr with an empty string: pass the empty
5895 // payload of the rmxattr request through to do this.
9f95a23c 5896 handle_set_vxattr(mdr, cur);
7c673cae
FG
5897 return;
5898 }
5899
f67539c2 5900 respond_to_request(mdr, -CEPHFS_ENODATA);
7c673cae
FG
5901}
5902
f67539c2
TL
5903const Server::XattrHandler Server::xattr_handlers[] = {
5904 {
5905 xattr_name: Server::DEFAULT_HANDLER,
5906 description: "default xattr handler",
5907 validate: &Server::default_xattr_validate,
5908 setxattr: &Server::default_setxattr_handler,
5909 removexattr: &Server::default_removexattr_handler,
5910 },
5911 {
5912 xattr_name: "ceph.mirror.info",
5913 description: "mirror info xattr handler",
5914 validate: &Server::mirror_info_xattr_validate,
5915 setxattr: &Server::mirror_info_setxattr_handler,
5916 removexattr: &Server::mirror_info_removexattr_handler
5917 },
5918};
7c673cae 5919
f67539c2
TL
5920const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
5921 const XattrHandler *default_xattr_handler = nullptr;
7c673cae 5922
f67539c2
TL
5923 for (auto &handler : xattr_handlers) {
5924 if (handler.xattr_name == Server::DEFAULT_HANDLER) {
5925 ceph_assert(default_xattr_handler == nullptr);
5926 default_xattr_handler = &handler;
5927 }
5928 if (handler.xattr_name == xattr_name) {
5929 dout(20) << "handler=" << handler.description << dendl;
5930 return &handler;
5931 }
5932 }
7c673cae 5933
f67539c2
TL
5934 ceph_assert(default_xattr_handler != nullptr);
5935 dout(20) << "handler=" << default_xattr_handler->description << dendl;
5936 return default_xattr_handler;
5937}
7c673cae 5938
f67539c2
TL
5939int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
5940 const std::string &xattr_name, int op, int flags) {
5941 if (op == CEPH_MDS_OP_SETXATTR) {
5942 if (xattrs) {
5943 if ((flags & CEPH_XATTR_CREATE) && xattrs->count(mempool::mds_co::string(xattr_name))) {
5944 dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
5945 return -CEPHFS_EEXIST;
5946 }
5947 }
5948 if ((flags & CEPH_XATTR_REPLACE) && !(xattrs && xattrs->count(mempool::mds_co::string(xattr_name)))) {
5949 dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
5950 return -CEPHFS_ENODATA;
5951 }
5952
5953 return 0;
7c673cae 5954 }
f67539c2
TL
5955
5956 if (op == CEPH_MDS_OP_RMXATTR) {
5957 if (!xattrs || xattrs->count(mempool::mds_co::string(xattr_name)) == 0) {
5958 dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
5959 return -CEPHFS_ENODATA;
5960 }
5961
5962 return 0;
5963 }
5964
5965 derr << ": unhandled validation for: " << xattr_name << dendl;
5966 return -CEPHFS_EINVAL;
5967}
5968
5969void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
5970 const bufferlist &xattr_value) {
5971 size_t len = xattr_value.length();
5972 bufferptr b = buffer::create(len);
5973 if (len) {
5974 xattr_value.begin().copy(len, b.c_str());
5975 }
5976 auto em = xattrs->emplace(std::piecewise_construct,
5977 std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
5978 std::forward_as_tuple(b));
5979 if (!em.second) {
5980 em.first->second = b;
5981 }
5982}
5983
5984void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
5985 xattrs->erase(mempool::mds_co::string(xattr_name));
5986}
5987
5988int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
5989 XattrOp *xattr_op) {
5990 return xattr_validate(cur, xattrs, xattr_op->xattr_name, xattr_op->op, xattr_op->flags);
5991}
5992
5993void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
5994 const XattrOp &xattr_op) {
5995 xattr_set(xattrs, xattr_op.xattr_name, xattr_op.xattr_value);
5996}
5997
5998void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
5999 const XattrOp &xattr_op) {
6000 xattr_rm(xattrs, xattr_op.xattr_name);
6001}
6002
6003// mirror info xattr handlers
6004const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX = "^cluster_id=([a-f0-9]{8}-" \
6005 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6006 "[a-f0-9]{4}-[a-f0-9]{12})" \
6007 " fs_id=(\\d+)$";
6008const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
6009const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
6010int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
6011 std::string &cluster_id, std::string &fs_id) {
6012 dout(20) << "parsing name=" << name << ", value=" << value << dendl;
6013
6014 static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
6015 std::smatch match;
6016
6017 std::regex_search(value, match, regex);
6018 if (match.size() != 3) {
6019 derr << "mirror info parse error" << dendl;
6020 return -CEPHFS_EINVAL;
6021 }
6022
6023 cluster_id = match[1];
6024 fs_id = match[2];
6025 dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
6026 return 0;
6027}
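// The only accepted value shape is a single line of the form
//   cluster_id=<uuid> fs_id=<decimal id>
// (uuid in the canonical 8-4-4-4-12 lowercase hex form); anything that does
// not match MIRROR_INFO_REGEX exactly is rejected with CEPHFS_EINVAL.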
6028
6029int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6030 XattrOp *xattr_op) {
6031 if (!cur->is_root()) {
6032 return -CEPHFS_EINVAL;
6033 }
6034
6035 int v1 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, xattr_op->op, xattr_op->flags);
6036 int v2 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, xattr_op->op, xattr_op->flags);
6037 if (v1 != v2) {
6038 derr << "inconsistent mirror info state (" << v1 << "," << v2 << ")" << dendl;
6039 return -CEPHFS_EINVAL;
6040 }
6041
6042 if (v1 < 0) {
6043 return v1;
6044 }
6045
6046 if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
6047 return 0;
6048 }
6049
6050 std::string cluster_id;
6051 std::string fs_id;
6052 int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
6053 cluster_id, fs_id);
6054 if (r < 0) {
6055 return r;
6056 }
6057
6058 xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
6059 return 0;
6060}
6061
6062void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6063 const XattrOp &xattr_op) {
6064 auto mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
6065
6066 bufferlist bl;
6067 bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
6068 xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
6069
6070 bl.clear();
6071 bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
6072 xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
6073}
6074
6075void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6076 const XattrOp &xattr_op) {
6077 xattr_rm(xattrs, Server::MirrorXattrInfo::CLUSTER_ID);
6078 xattr_rm(xattrs, Server::MirrorXattrInfo::FS_ID);
6079}
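// Note the asymmetry: clients see one "ceph.mirror.info" xattr, but it is
// persisted as two keys (ceph.mirror.info.cluster_id and
// ceph.mirror.info.fs_id). That is why validate checks both keys for a
// consistent state and why set/remove fan out to two xattr_set()/xattr_rm()
// calls.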
7c673cae
FG
6080
6081void Server::handle_client_setxattr(MDRequestRef& mdr)
6082{
9f95a23c 6083 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 6084 string name(req->get_path2());
7c673cae 6085
f67539c2
TL
6086 // is a ceph virtual xattr?
6087 if (is_ceph_vxattr(name)) {
9f95a23c
TL
6088 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6089 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6090 if (!cur)
6091 return;
6092
6093 handle_set_vxattr(mdr, cur);
6094 return;
6095 }
6096
f67539c2
TL
6097 if (!is_allowed_ceph_xattr(name)) {
6098 respond_to_request(mdr, -CEPHFS_EINVAL);
6099 return;
6100 }
6101
9f95a23c 6102 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
6103 if (!cur)
6104 return;
6105
6106 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 6107 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
6108 return;
6109 }
6110
6111 int flags = req->head.args.setxattr.flags;
6112
9f95a23c 6113 MutationImpl::LockOpVec lov;
11fdf7f2
TL
6114 lov.add_xlock(&cur->xattrlock);
6115 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
6116 return;
6117
6118 if (!check_access(mdr, cur, MAY_WRITE))
6119 return;
6120
7c673cae
FG
6121 size_t len = req->get_data().length();
6122 size_t inc = len + name.length();
6123
f67539c2
TL
6124 auto handler = Server::get_xattr_or_default_handler(name);
6125 const auto& pxattrs = cur->get_projected_xattrs();
6126 if (pxattrs) {
6127 // check xattrs kv pairs size
6128 size_t cur_xattrs_size = 0;
6129 for (const auto& p : *pxattrs) {
6130 if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
6131 continue;
6132 }
6133 cur_xattrs_size += p.first.length() + p.second.length();
7c673cae 6134 }
7c673cae 6135
f67539c2
TL
6136 if (cur_xattrs_size + inc > g_conf()->mds_max_xattr_pairs_size) {
6137 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6138 << cur_xattrs_size << ", inc " << inc << dendl;
6139 respond_to_request(mdr, -CEPHFS_ENOSPC);
6140 return;
6141 }
7c673cae
FG
6142 }
6143
f67539c2
TL
6144 XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
6145 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6146 if (r < 0) {
6147 respond_to_request(mdr, r);
7c673cae
FG
6148 return;
6149 }
6150
6151 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
6152
6153 // project update
f67539c2
TL
6154 auto pi = cur->project_inode(mdr, true);
6155 pi.inode->version = cur->pre_dirty();
6156 pi.inode->ctime = mdr->get_op_stamp();
6157 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6158 pi.inode->rstat.rctime = mdr->get_op_stamp();
6159 if (name == "encryption.ctx"sv)
6160 pi.inode->fscrypt = true;
6161 pi.inode->change_attr++;
6162 pi.inode->xattr_version++;
6163
94b18763 6164 if ((flags & CEPH_XATTR_REMOVE)) {
f67539c2 6165 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
94b18763 6166 } else {
f67539c2 6167 std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
7c673cae
FG
6168 }
6169
6170 // log + wait
6171 mdr->ls = mdlog->get_current_segment();
6172 EUpdate *le = new EUpdate(mdlog, "setxattr");
6173 mdlog->start_entry(le);
6174 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6175 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6176 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6177
6178 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6179}
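// setxattr flow in brief: "ceph.*" virtual xattrs short-circuit to
// handle_set_vxattr(); everything else xlocks xattrlock, charges
// key-length + value-length for every pair (minus a pair being replaced
// under CEPH_XATTR_REPLACE) against mds_max_xattr_pairs_size, runs the
// per-name validate hook, projects the updated xattr map and journals it
// as an EUpdate before replying.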
6180
6181void Server::handle_client_removexattr(MDRequestRef& mdr)
6182{
9f95a23c 6183 const cref_t<MClientRequest> &req = mdr->client_request;
94b18763 6184 std::string name(req->get_path2());
11fdf7f2 6185
f67539c2
TL
6186 // is a ceph virtual xattr?
6187 if (is_ceph_vxattr(name)) {
9f95a23c
TL
6188 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6189 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6190 if (!cur)
6191 return;
6192
6193 handle_remove_vxattr(mdr, cur);
6194 return;
6195 }
6196
f67539c2
TL
6197 if (!is_allowed_ceph_xattr(name)) {
6198 respond_to_request(mdr, -CEPHFS_EINVAL);
6199 return;
6200 }
6201
9f95a23c 6202 CInode* cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
6203 if (!cur)
6204 return;
6205
6206 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 6207 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
6208 return;
6209 }
6210
9f95a23c 6211 MutationImpl::LockOpVec lov;
11fdf7f2
TL
6212 lov.add_xlock(&cur->xattrlock);
6213 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
6214 return;
6215
f67539c2
TL
6216
6217 auto handler = Server::get_xattr_or_default_handler(name);
6218 bufferlist bl;
6219 XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
6220
6221 const auto& pxattrs = cur->get_projected_xattrs();
6222 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6223 if (r < 0) {
6224 respond_to_request(mdr, r);
7c673cae
FG
6225 return;
6226 }
6227
6228 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
6229
6230 // project update
f67539c2
TL
6231 auto pi = cur->project_inode(mdr, true);
6232 pi.inode->version = cur->pre_dirty();
6233 pi.inode->ctime = mdr->get_op_stamp();
6234 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6235 pi.inode->rstat.rctime = mdr->get_op_stamp();
6236 pi.inode->change_attr++;
6237 pi.inode->xattr_version++;
6238 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
7c673cae
FG
6239
6240 // log + wait
6241 mdr->ls = mdlog->get_current_segment();
6242 EUpdate *le = new EUpdate(mdlog, "removexattr");
6243 mdlog->start_entry(le);
6244 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6245 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6246 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6247
6248 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6249}
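// removexattr mirrors the setxattr path minus the size accounting: same
// vxattr shortcut, same xattrlock xlock, same validate/remove hooks, same
// EUpdate journaling, just with op CEPH_MDS_OP_RMXATTR and an empty value.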
6250
6251
6252// =================================================================
6253// DIRECTORY and NAMESPACE OPS
6254
6255
6256// ------------------------------------------------
6257
6258// MKNOD
6259
6260class C_MDS_mknod_finish : public ServerLogContext {
6261 CDentry *dn;
6262 CInode *newi;
6263public:
6264 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
6265 ServerLogContext(s, r), dn(d), newi(ni) {}
6266 void finish(int r) override {
11fdf7f2 6267 ceph_assert(r == 0);
7c673cae
FG
6268
6269 // link the inode
6270 dn->pop_projected_linkage();
6271
6272 // be a bit hacky with the inode version, here.. we decrement it
6273 // just to keep mark_dirty() happy. (we didn't bother projecting
6274 // a new version of the inode since it's just been created)
f67539c2 6275 newi->mark_dirty(mdr->ls);
28e407b8 6276 newi->mark_dirty_parent(mdr->ls, true);
7c673cae
FG
6277
6278 // mkdir?
f67539c2 6279 if (newi->is_dir()) {
7c673cae 6280 CDir *dir = newi->get_dirfrag(frag_t());
11fdf7f2 6281 ceph_assert(dir);
f67539c2 6282 dir->mark_dirty(mdr->ls);
7c673cae
FG
6283 dir->mark_new(mdr->ls);
6284 }
6285
6286 mdr->apply();
6287
6288 MDRequestRef null_ref;
6289 get_mds()->mdcache->send_dentry_link(dn, null_ref);
6290
f67539c2 6291 if (newi->is_file()) {
7c673cae 6292 get_mds()->locker->share_inode_max_size(newi);
f67539c2 6293 } else if (newi->is_dir()) {
f6b5b4d7 6294 // We do this now so that the linkages on the new directory are stable.
f67539c2 6295 newi->maybe_ephemeral_rand();
f6b5b4d7 6296 }
7c673cae
FG
6297
6298 // hit pop
11fdf7f2 6299 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
7c673cae
FG
6300
6301 // reply
6302 server->respond_to_request(mdr, 0);
6303 }
6304};
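// C_MDS_mknod_finish is the journal-commit callback shared by mknod, mkdir
// and symlink (see the journal_and_reply() calls below): once the EUpdate
// is durable it pops the projected linkage, marks the new inode (and, for
// mkdir, the fresh dirfrag) dirty, applies the mutation, tells replicas via
// send_dentry_link(), bumps balancer popularity, and only then replies.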
6305
6306
6307void Server::handle_client_mknod(MDRequestRef& mdr)
6308{
9f95a23c 6309 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 6310 client_t client = mdr->get_client();
9f95a23c
TL
6311
6312 unsigned mode = req->head.args.mknod.mode;
6313 if ((mode & S_IFMT) == 0)
6314 mode |= S_IFREG;
6315
6316 mdr->disable_lock_cache();
6317 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
6318 if (!dn)
7c673cae
FG
6319 return;
6320
9f95a23c
TL
6321 CDir *dir = dn->get_dir();
6322 CInode *diri = dir->get_inode();
7c673cae
FG
6323 if (!check_access(mdr, diri, MAY_WRITE))
6324 return;
7c673cae
FG
6325 if (!check_fragment_space(mdr, dn->get_dir()))
6326 return;
6327
f67539c2
TL
6328 ceph_assert(dn->get_projected_linkage()->is_null());
6329 if (req->get_alternate_name().size() > alternate_name_max) {
6330 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6331 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6332 return;
6333 }
6334 dn->set_alternate_name(req->get_alternate_name());
6335
7c673cae
FG
6336 // set layout
6337 file_layout_t layout;
9f95a23c
TL
6338 if (mdr->dir_layout != file_layout_t())
6339 layout = mdr->dir_layout;
7c673cae
FG
6340 else
6341 layout = mdcache->default_file_layout;
6342
11fdf7f2
TL
6343 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6344 ceph_assert(newi);
7c673cae
FG
6345
6346 dn->push_projected_linkage(newi);
6347
f67539c2
TL
6348 auto _inode = newi->_get_inode();
6349 _inode->version = dn->pre_dirty();
6350 _inode->rdev = req->head.args.mknod.rdev;
6351 _inode->rstat.rfiles = 1;
6352 _inode->accounted_rstat = _inode->rstat;
7c673cae 6353 if (layout.pool_id != mdcache->default_file_layout.pool_id)
f67539c2
TL
6354 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
6355 _inode->update_backtrace();
7c673cae 6356
11fdf7f2
TL
6357 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6358 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6359 ceph_assert(follows >= realm->get_newest_seq());
6360
7c673cae
FG
6361 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6362 // want to write to it (e.g., if they are reexporting NFS)
f67539c2 6363 if (S_ISREG(_inode->mode)) {
7c673cae
FG
6364 // issue a cap on the file
6365 int cmode = CEPH_FILE_MODE_RDWR;
9f95a23c 6366 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
7c673cae
FG
6367 if (cap) {
6368 cap->set_wanted(0);
6369
6370 // put locks in excl mode
6371 newi->filelock.set_state(LOCK_EXCL);
6372 newi->authlock.set_state(LOCK_EXCL);
6373 newi->xattrlock.set_state(LOCK_EXCL);
a8e16298
TL
6374
6375 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
f67539c2
TL
6376 _inode->client_ranges[client].range.first = 0;
6377 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
6378 _inode->client_ranges[client].follows = follows;
f91f0fd5 6379 newi->mark_clientwriteable();
a8e16298 6380 cap->mark_clientwriteable();
7c673cae
FG
6381 }
6382 }
6383
11fdf7f2 6384 ceph_assert(dn->first == follows + 1);
7c673cae
FG
6385 newi->first = dn->first;
6386
f67539c2 6387 dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
7c673cae
FG
6388
6389 // prepare finisher
6390 mdr->ls = mdlog->get_current_segment();
6391 EUpdate *le = new EUpdate(mdlog, "mknod");
6392 mdlog->start_entry(le);
6393 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6394 journal_allocated_inos(mdr, &le->metablob);
6395
6396 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6397 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6398 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6399
6400 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
92f5a8d4 6401 mds->balancer->maybe_fragment(dn->get_dir(), false);
7c673cae
FG
6402}
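// Note the fast path above for S_ISREG: the client gets RDWR caps and the
// file locks start in LOCK_EXCL, with client_ranges primed to one stripe
// unit, so an NFS-style create-then-write needs no extra cap round trip.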
6403
6404
6405
6406// MKDIR
6407/* This function takes responsibility for the passed mdr*/
6408void Server::handle_client_mkdir(MDRequestRef& mdr)
6409{
9f95a23c 6410 const cref_t<MClientRequest> &req = mdr->client_request;
91327a77 6411
9f95a23c
TL
6412 mdr->disable_lock_cache();
6413 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6414 if (!dn)
7c673cae 6415 return;
9f95a23c 6416
7c673cae
FG
6417 CDir *dir = dn->get_dir();
6418 CInode *diri = dir->get_inode();
7c673cae
FG
6419
6420 // mkdir check access
6421 if (!check_access(mdr, diri, MAY_WRITE))
6422 return;
6423
6424 if (!check_fragment_space(mdr, dir))
6425 return;
6426
f67539c2
TL
6427 ceph_assert(dn->get_projected_linkage()->is_null());
6428 if (req->get_alternate_name().size() > alternate_name_max) {
6429 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6430 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6431 return;
6432 }
6433 dn->set_alternate_name(req->get_alternate_name());
6434
7c673cae 6435 // new inode
7c673cae
FG
6436 unsigned mode = req->head.args.mkdir.mode;
6437 mode &= ~S_IFMT;
6438 mode |= S_IFDIR;
9f95a23c 6439 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
11fdf7f2 6440 ceph_assert(newi);
7c673cae
FG
6441
6442 // it's a directory.
6443 dn->push_projected_linkage(newi);
6444
f67539c2
TL
6445 auto _inode = newi->_get_inode();
6446 _inode->version = dn->pre_dirty();
6447 _inode->rstat.rsubdirs = 1;
6448 _inode->accounted_rstat = _inode->rstat;
6449 _inode->update_backtrace();
7c673cae 6450
11fdf7f2
TL
6451 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6452 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6453 ceph_assert(follows >= realm->get_newest_seq());
6454
7c673cae 6455 dout(12) << " follows " << follows << dendl;
11fdf7f2 6456 ceph_assert(dn->first == follows + 1);
7c673cae
FG
6457 newi->first = dn->first;
6458
6459 // ...and that new dir is empty.
6460 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
6461 newdir->state_set(CDir::STATE_CREATING);
6462 newdir->mark_complete();
f67539c2 6463 newdir->_get_fnode()->version = newdir->pre_dirty();
7c673cae
FG
6464
6465 // prepare finisher
6466 mdr->ls = mdlog->get_current_segment();
6467 EUpdate *le = new EUpdate(mdlog, "mkdir");
6468 mdlog->start_entry(le);
6469 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6470 journal_allocated_inos(mdr, &le->metablob);
6471 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6472 le->metablob.add_primary_dentry(dn, newi, true, true);
6473 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
6474
6475 // issue a cap on the directory
6476 int cmode = CEPH_FILE_MODE_RDWR;
9f95a23c 6477 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
7c673cae
FG
6478 if (cap) {
6479 cap->set_wanted(0);
6480
6481 // put locks in excl mode
6482 newi->filelock.set_state(LOCK_EXCL);
6483 newi->authlock.set_state(LOCK_EXCL);
6484 newi->xattrlock.set_state(LOCK_EXCL);
6485 }
6486
6487 // make sure this inode gets into the journal
6488 le->metablob.add_opened_ino(newi->ino());
7c673cae
FG
6489
6490 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
81eedcae
TL
6491
6492 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6493 // have overshot the split size (multiple mkdir in flight), so here is
6494 // an early chance to split the dir if this mkdir makes it oversized.
6495 mds->balancer->maybe_fragment(dir, false);
7c673cae
FG
6496}
6497
6498
6499// SYMLINK
6500
6501void Server::handle_client_symlink(MDRequestRef& mdr)
6502{
f67539c2
TL
6503 const auto& req = mdr->client_request;
6504
9f95a23c
TL
6505 mdr->disable_lock_cache();
6506 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6507 if (!dn)
7c673cae 6508 return;
9f95a23c 6509
7c673cae
FG
6510 CDir *dir = dn->get_dir();
6511 CInode *diri = dir->get_inode();
7c673cae
FG
6512
6513 if (!check_access(mdr, diri, MAY_WRITE))
9f95a23c 6514 return;
7c673cae
FG
6515 if (!check_fragment_space(mdr, dir))
6516 return;
6517
f67539c2
TL
6518 ceph_assert(dn->get_projected_linkage()->is_null());
6519 if (req->get_alternate_name().size() > alternate_name_max) {
6520 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6521 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
     return; // reply sent; bail out as the mknod/mkdir/link paths do
6522 }
6523 dn->set_alternate_name(req->get_alternate_name());
9f95a23c 6524
7c673cae 6525 unsigned mode = S_IFLNK | 0777;
9f95a23c 6526 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
11fdf7f2 6527 ceph_assert(newi);
7c673cae
FG
6528
6529 // it's a symlink
6530 dn->push_projected_linkage(newi);
6531
11fdf7f2 6532 newi->symlink = req->get_path2();
f67539c2
TL
6533 auto _inode = newi->_get_inode();
6534 _inode->version = dn->pre_dirty();
6535 _inode->size = newi->symlink.length();
6536 _inode->rstat.rbytes = _inode->size;
6537 _inode->rstat.rfiles = 1;
6538 _inode->accounted_rstat = _inode->rstat;
6539 _inode->update_backtrace();
7c673cae
FG
6540
6541 newi->first = dn->first;
6542
6543 // prepare finisher
6544 mdr->ls = mdlog->get_current_segment();
6545 EUpdate *le = new EUpdate(mdlog, "symlink");
6546 mdlog->start_entry(le);
6547 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6548 journal_allocated_inos(mdr, &le->metablob);
6549 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6550 le->metablob.add_primary_dentry(dn, newi, true, true);
6551
6552 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
92f5a8d4 6553 mds->balancer->maybe_fragment(dir, false);
7c673cae
FG
6554}
6555
6556
6557
6558
6559
6560// LINK
6561
6562void Server::handle_client_link(MDRequestRef& mdr)
6563{
9f95a23c 6564 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
6565
6566 dout(7) << "handle_client_link " << req->get_filepath()
6567 << " to " << req->get_filepath2()
6568 << dendl;
6569
9f95a23c 6570 mdr->disable_lock_cache();
7c673cae 6571
9f95a23c
TL
6572 CDentry *destdn;
6573 CInode *targeti;
6574
6575 if (req->get_filepath2().depth() == 0) {
6576 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
6577 if (!targeti) {
f67539c2 6578 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
9f95a23c
TL
6579 mdcache->find_ino_peers(req->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr));
6580 return;
6581 }
6582 mdr->pin(targeti);
6583
6584 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
6585 CDentry *pdn = targeti->get_projected_parent_dn();
6586 if (!pdn) {
6587 dout(7) << "target has no parent dn, failing..." << dendl;
f67539c2 6588 respond_to_request(mdr, -CEPHFS_EINVAL);
9f95a23c
TL
6589 return;
6590 }
6591 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
6592 return;
6593 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
6594 }
6595
6596 destdn = rdlock_path_xlock_dentry(mdr, false);
6597 if (!destdn)
6598 return;
9f95a23c
TL
6599 } else {
6600 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
6601 destdn = ret.first;
6602 if (!destdn)
6603 return;
6604
6605 if (!destdn->get_projected_linkage()->is_null()) {
f67539c2 6606 respond_to_request(mdr, -CEPHFS_EEXIST);
9f95a23c
TL
6607 return;
6608 }
6609
6610 targeti = ret.second->get_projected_linkage()->get_inode();
6611 }
6612
f67539c2
TL
6613 ceph_assert(destdn->get_projected_linkage()->is_null());
6614 if (req->get_alternate_name().size() > alternate_name_max) {
6615 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6616 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6617 return;
6618 }
6619 destdn->set_alternate_name(req->get_alternate_name());
6620
9f95a23c
TL
6621 if (targeti->is_dir()) {
6622 dout(7) << "target is a dir, failing..." << dendl;
f67539c2 6623 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
6624 return;
6625 }
6626
9f95a23c
TL
6627 CDir *dir = destdn->get_dir();
6628 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
7c673cae 6629 dout(7) << "target is " << *targeti << dendl;
9f95a23c
TL
6630
6631 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6632 MutationImpl::LockOpVec lov;
6633 lov.add_xlock(&targeti->snaplock);
6634 lov.add_xlock(&targeti->linklock);
6635
6636 if (!mds->locker->acquire_locks(mdr, lov))
181888fb 6637 return;
7c673cae 6638
9f95a23c
TL
6639 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6640 }
7c673cae 6641
9f95a23c
TL
6642 if (targeti->get_projected_inode()->nlink == 0) {
6643 dout(7) << "target has no link, failing..." << dendl;
f67539c2 6644 respond_to_request(mdr, -CEPHFS_ENOENT);
9f95a23c 6645 }
7c673cae
FG
6646
6647 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6648 if (!check_access(mdr, targeti, MAY_WRITE))
6649 return;
6650
6651 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
6652 return;
6653
6654 if (!check_fragment_space(mdr, dir))
6655 return;
6656 }
6657
adb31ebb
TL
6658 CInode* target_pin = targeti->get_projected_parent_dir()->inode;
6659 SnapRealm *target_realm = target_pin->find_snaprealm();
6660 if (target_pin != dir->inode &&
6661 target_realm->get_subvolume_ino() !=
6662 dir->inode->find_snaprealm()->get_subvolume_ino()) {
6663 dout(7) << "target is in different subvolume, failing..." << dendl;
f67539c2 6664 respond_to_request(mdr, -CEPHFS_EXDEV);
adb31ebb
TL
6665 return;
6666 }
6667
7c673cae 6668 // go!
11fdf7f2 6669 ceph_assert(g_conf()->mds_kill_link_at != 1);
7c673cae
FG
6670
6671 // local or remote?
6672 if (targeti->is_auth())
adb31ebb 6673 _link_local(mdr, destdn, targeti, target_realm);
7c673cae 6674 else
9f95a23c 6675 _link_remote(mdr, true, destdn, targeti);
92f5a8d4 6676 mds->balancer->maybe_fragment(dir, false);
7c673cae
FG
6677}
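// link has two shapes: if the target inode is auth here, the nlink++ can be
// journaled locally (_link_local); otherwise the target's auth MDS must
// journal it first, so we run the two-phase leader/peer protocol in
// _link_remote() below.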
6678
6679
6680class C_MDS_link_local_finish : public ServerLogContext {
6681 CDentry *dn;
6682 CInode *targeti;
6683 version_t dnpv;
6684 version_t tipv;
11fdf7f2 6685 bool adjust_realm;
7c673cae
FG
6686public:
6687 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
11fdf7f2 6688 version_t dnpv_, version_t tipv_, bool ar) :
7c673cae 6689 ServerLogContext(s, r), dn(d), targeti(ti),
11fdf7f2 6690 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
7c673cae 6691 void finish(int r) override {
11fdf7f2
TL
6692 ceph_assert(r == 0);
6693 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
7c673cae
FG
6694 }
6695};
6696
6697
adb31ebb 6698void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
7c673cae
FG
6699{
6700 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
6701
6702 mdr->ls = mdlog->get_current_segment();
6703
6704 // predirty NEW dentry
6705 version_t dnpv = dn->pre_dirty();
6706 version_t tipv = targeti->pre_dirty();
6707
6708 // project inode update
f67539c2
TL
6709 auto pi = targeti->project_inode(mdr);
6710 pi.inode->nlink++;
6711 pi.inode->ctime = mdr->get_op_stamp();
6712 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6713 pi.inode->rstat.rctime = mdr->get_op_stamp();
6714 pi.inode->change_attr++;
6715 pi.inode->version = tipv;
7c673cae 6716
11fdf7f2 6717 bool adjust_realm = false;
adb31ebb 6718 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
11fdf7f2
TL
6719 sr_t *newsnap = targeti->project_snaprealm();
6720 targeti->mark_snaprealm_global(newsnap);
adb31ebb 6721 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
11fdf7f2
TL
6722 adjust_realm = true;
6723 }
6724
7c673cae
FG
6725 // log + wait
6726 EUpdate *le = new EUpdate(mdlog, "link_local");
6727 mdlog->start_entry(le);
6728 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6729 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
6730 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
6731 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6732 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
6733
6734 // do this after predirty_*, to avoid funky extra dnl arg
6735 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6736
11fdf7f2
TL
6737 journal_and_reply(mdr, targeti, dn, le,
6738 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
7c673cae
FG
6739}
6740
6741void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
11fdf7f2 6742 version_t dnpv, version_t tipv, bool adjust_realm)
7c673cae
FG
6743{
6744 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
6745
6746 // link and unlock the NEW dentry
31f18b77
FG
6747 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6748 if (!dnl->get_inode())
6749 dn->link_remote(dnl, targeti);
7c673cae
FG
6750 dn->mark_dirty(dnpv, mdr->ls);
6751
6752 // target inode
7c673cae
FG
6753 mdr->apply();
6754
6755 MDRequestRef null_ref;
6756 mdcache->send_dentry_link(dn, null_ref);
6757
11fdf7f2
TL
6758 if (adjust_realm) {
6759 int op = CEPH_SNAP_OP_SPLIT;
6760 mds->mdcache->send_snap_update(targeti, 0, op);
6761 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6762 }
6763
7c673cae 6764 // bump target popularity
11fdf7f2
TL
6765 mds->balancer->hit_inode(targeti, META_POP_IWR);
6766 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7c673cae
FG
6767
6768 // reply
6769 respond_to_request(mdr, 0);
6770}
6771
6772
6773// link / unlink remote
6774
6775class C_MDS_link_remote_finish : public ServerLogContext {
6776 bool inc;
6777 CDentry *dn;
6778 CInode *targeti;
6779 version_t dpv;
6780public:
6781 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
6782 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
6783 dpv(d->get_projected_version()) {}
6784 void finish(int r) override {
11fdf7f2 6785 ceph_assert(r == 0);
7c673cae
FG
6786 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
6787 }
6788};
6789
6790void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
6791{
6792 dout(10) << "_link_remote "
6793 << (inc ? "link ":"unlink ")
6794 << *dn << " to " << *targeti << dendl;
6795
6796 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6797 mds_rank_t linkauth = targeti->authority().first;
6798 if (mdr->more()->witnessed.count(linkauth) == 0) {
6799 if (mds->is_cluster_degraded() &&
6800 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
6801 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
f67539c2 6802 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
6803 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
6804 return;
6805 }
6806
6807 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
6808 int op;
6809 if (inc)
f67539c2 6810 op = MMDSPeerRequest::OP_LINKPREP;
7c673cae 6811 else
f67539c2
TL
6812 op = MMDSPeerRequest::OP_UNLINKPREP;
6813 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, op);
7c673cae
FG
6814 targeti->set_object_info(req->get_object_info());
6815 req->op_stamp = mdr->get_op_stamp();
11fdf7f2
TL
6816 if (auto& desti_srnode = mdr->more()->desti_srnode)
6817 encode(*desti_srnode, req->desti_snapbl);
7c673cae
FG
6818 mds->send_message_mds(req, linkauth);
6819
f67539c2
TL
6820 ceph_assert(mdr->more()->waiting_on_peer.count(linkauth) == 0);
6821 mdr->more()->waiting_on_peer.insert(linkauth);
7c673cae
FG
6822 return;
6823 }
6824 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
6825
11fdf7f2
TL
6826 ceph_assert(g_conf()->mds_kill_link_at != 2);
6827
6828 if (auto& desti_srnode = mdr->more()->desti_srnode) {
6829 delete desti_srnode;
6830 desti_srnode = NULL;
6831 }
7c673cae
FG
6832
6833 mdr->set_mds_stamp(ceph_clock_now());
6834
6835 // add to event
6836 mdr->ls = mdlog->get_current_segment();
6837 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
6838 mdlog->start_entry(le);
6839 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6840 if (!mdr->more()->witnessed.empty()) {
f67539c2 6841 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7c673cae 6842 le->reqid = mdr->reqid;
f67539c2
TL
6843 le->had_peers = true;
6844 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7c673cae
FG
6845 }
6846
6847 if (inc) {
6848 dn->pre_dirty();
6849 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
6850 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6851 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6852 } else {
6853 dn->pre_dirty();
6854 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
6855 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6856 le->metablob.add_null_dentry(dn, true);
31f18b77 6857 dn->push_projected_linkage();
7c673cae
FG
6858 }
6859
9f95a23c
TL
6860 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
6861 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
7c673cae
FG
6862}
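// Leader side of the two-phase remote link/unlink: an MMDSPeerRequest
// (OP_LINKPREP/OP_UNLINKPREP) asks the target's auth MDS to journal the
// nlink change, and the leader parks in waiting_on_peer. Once the ack is
// handled (handle_peer_link_prep_ack below) the leader journals its own
// EUpdate with had_peers set, so a crash between the two journal writes can
// be resolved later from the uncommitted leader/peer tables.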
6863
6864void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
6865 CDentry *dn, CInode *targeti,
6866 version_t dpv)
6867{
6868 dout(10) << "_link_remote_finish "
6869 << (inc ? "link ":"unlink ")
6870 << *dn << " to " << *targeti << dendl;
6871
11fdf7f2 6872 ceph_assert(g_conf()->mds_kill_link_at != 3);
7c673cae
FG
6873
6874 if (!mdr->more()->witnessed.empty())
f67539c2 6875 mdcache->logged_leader_update(mdr->reqid);
7c673cae
FG
6876
6877 if (inc) {
6878 // link the new dentry
31f18b77
FG
6879 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6880 if (!dnl->get_inode())
6881 dn->link_remote(dnl, targeti);
7c673cae
FG
6882 dn->mark_dirty(dpv, mdr->ls);
6883 } else {
6884 // unlink main dentry
6885 dn->get_dir()->unlink_inode(dn);
31f18b77 6886 dn->pop_projected_linkage();
7c673cae
FG
6887 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
6888 }
6889
6890 mdr->apply();
6891
6892 MDRequestRef null_ref;
6893 if (inc)
6894 mdcache->send_dentry_link(dn, null_ref);
6895 else
6896 mdcache->send_dentry_unlink(dn, NULL, null_ref);
6897
6898 // bump target popularity
11fdf7f2
TL
6899 mds->balancer->hit_inode(targeti, META_POP_IWR);
6900 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7c673cae
FG
6901
6902 // reply
6903 respond_to_request(mdr, 0);
6904
6905 if (!inc)
6906 // removing a new dn?
6907 dn->get_dir()->try_remove_unlinked_dn(dn);
6908}
6909
6910
6911// remote linking/unlinking
6912
f67539c2 6913class C_MDS_PeerLinkPrep : public ServerLogContext {
7c673cae 6914 CInode *targeti;
11fdf7f2 6915 bool adjust_realm;
7c673cae 6916public:
f67539c2 6917 C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
11fdf7f2 6918 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
7c673cae 6919 void finish(int r) override {
11fdf7f2 6920 ceph_assert(r == 0);
f67539c2 6921 server->_logged_peer_link(mdr, targeti, adjust_realm);
7c673cae
FG
6922 }
6923};
6924
f67539c2 6925class C_MDS_PeerLinkCommit : public ServerContext {
7c673cae
FG
6926 MDRequestRef mdr;
6927 CInode *targeti;
6928public:
f67539c2 6929 C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
7c673cae
FG
6930 ServerContext(s), mdr(r), targeti(t) { }
6931 void finish(int r) override {
f67539c2 6932 server->_commit_peer_link(mdr, r, targeti);
7c673cae
FG
6933 }
6934};
6935
f67539c2 6936void Server::handle_peer_link_prep(MDRequestRef& mdr)
7c673cae 6937{
f67539c2
TL
6938 dout(10) << "handle_peer_link_prep " << *mdr
6939 << " on " << mdr->peer_request->get_object_info()
7c673cae
FG
6940 << dendl;
6941
11fdf7f2 6942 ceph_assert(g_conf()->mds_kill_link_at != 4);
7c673cae 6943
f67539c2 6944 CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
11fdf7f2 6945 ceph_assert(targeti);
7c673cae
FG
6946 dout(10) << "targeti " << *targeti << dendl;
6947 CDentry *dn = targeti->get_parent_dn();
6948 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2 6949 ceph_assert(dnl->is_primary());
7c673cae 6950
f67539c2 6951 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7c673cae
FG
6952
6953 mdr->auth_pin(targeti);
6954
f67539c2 6955 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
11fdf7f2 6956 ceph_assert(g_conf()->mds_kill_link_at != 5);
7c673cae
FG
6957
6958 // journal it
6959 mdr->ls = mdlog->get_current_segment();
f67539c2
TL
6960 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
6961 EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
7c673cae
FG
6962 mdlog->start_entry(le);
6963
f67539c2 6964 auto pi = dnl->get_inode()->project_inode(mdr);
7c673cae
FG
6965
6966 // update journaled target inode
6967 bool inc;
11fdf7f2
TL
6968 bool adjust_realm = false;
6969 bool realm_projected = false;
f67539c2 6970 if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
7c673cae 6971 inc = true;
f67539c2 6972 pi.inode->nlink++;
adb31ebb
TL
6973
6974 CDentry *target_pdn = targeti->get_projected_parent_dn();
6975 SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
6976 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
11fdf7f2
TL
6977 sr_t *newsnap = targeti->project_snaprealm();
6978 targeti->mark_snaprealm_global(newsnap);
adb31ebb 6979 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
11fdf7f2
TL
6980 adjust_realm = true;
6981 realm_projected = true;
6982 }
7c673cae
FG
6983 } else {
6984 inc = false;
f67539c2 6985 pi.inode->nlink--;
11fdf7f2 6986 if (targeti->is_projected_snaprealm_global()) {
f67539c2
TL
6987 ceph_assert(mdr->peer_request->desti_snapbl.length());
6988 auto p = mdr->peer_request->desti_snapbl.cbegin();
11fdf7f2
TL
6989
6990 sr_t *newsnap = targeti->project_snaprealm();
6991 decode(*newsnap, p);
6992
f67539c2 6993 if (pi.inode->nlink == 0)
11fdf7f2
TL
6994 ceph_assert(!newsnap->is_parent_global());
6995
6996 realm_projected = true;
6997 } else {
f67539c2 6998 ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
11fdf7f2 6999 }
7c673cae
FG
7000 }
7001
7002 link_rollback rollback;
7003 rollback.reqid = mdr->reqid;
7004 rollback.ino = targeti->ino();
f67539c2
TL
7005 rollback.old_ctime = targeti->get_inode()->ctime; // we hold versionlock xlock; no concurrent projections
7006 const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
7c673cae
FG
7007 rollback.old_dir_mtime = pf->fragstat.mtime;
7008 rollback.old_dir_rctime = pf->rstat.rctime;
7009 rollback.was_inc = inc;
11fdf7f2
TL
7010 if (realm_projected) {
7011 if (targeti->snaprealm) {
7012 encode(true, rollback.snapbl);
7013 targeti->encode_snap_blob(rollback.snapbl);
7014 } else {
7015 encode(false, rollback.snapbl);
7016 }
7017 }
7018 encode(rollback, le->rollback);
7c673cae
FG
7019 mdr->more()->rollback_bl = le->rollback;
7020
f67539c2
TL
7021 pi.inode->ctime = mdr->get_op_stamp();
7022 pi.inode->version = targeti->pre_dirty();
7c673cae 7023
f67539c2 7024 dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
7c673cae
FG
7025
7026 // commit case
7027 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
7028 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
f67539c2 7029 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7c673cae
FG
7030
7031 // set up commit waiter
f67539c2 7032 mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
7c673cae 7033
f67539c2
TL
7034 mdr->more()->peer_update_journaled = true;
7035 submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
7c673cae
FG
7036 mdr, __func__);
7037 mdlog->flush();
7038}
7039
f67539c2 7040void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
7c673cae 7041{
f67539c2 7042 dout(10) << "_logged_peer_link " << *mdr
7c673cae
FG
7043 << " " << *targeti << dendl;
7044
11fdf7f2 7045 ceph_assert(g_conf()->mds_kill_link_at != 6);
7c673cae
FG
7046
7047 // update the target
7c673cae
FG
7048 mdr->apply();
7049
7050 // hit pop
11fdf7f2 7051 mds->balancer->hit_inode(targeti, META_POP_IWR);
7c673cae
FG
7052
7053 // done.
f67539c2 7054 mdr->reset_peer_request();
7c673cae 7055
11fdf7f2
TL
7056 if (adjust_realm) {
7057 int op = CEPH_SNAP_OP_SPLIT;
7058 mds->mdcache->send_snap_update(targeti, 0, op);
7059 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7060 }
7061
7c673cae
FG
7062 // ack
7063 if (!mdr->aborted) {
f67539c2
TL
7064 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_LINKPREPACK);
7065 mds->send_message_mds(reply, mdr->peer_to_mds);
7c673cae
FG
7066 } else {
7067 dout(10) << " abort flag set, finishing" << dendl;
7068 mdcache->request_finish(mdr);
7069 }
7070}
7071
7072
f67539c2
TL
7073struct C_MDS_CommittedPeer : public ServerLogContext {
7074 C_MDS_CommittedPeer(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
7c673cae 7075 void finish(int r) override {
f67539c2 7076 server->_committed_peer(mdr);
7c673cae
FG
7077 }
7078};
7079
f67539c2 7080void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
7c673cae 7081{
f67539c2 7082 dout(10) << "_commit_peer_link " << *mdr
7c673cae
FG
7083 << " r=" << r
7084 << " " << *targeti << dendl;
7085
11fdf7f2 7086 ceph_assert(g_conf()->mds_kill_link_at != 7);
7c673cae
FG
7087
7088 if (r == 0) {
7089 // drop our pins, etc.
7090 mdr->cleanup();
7091
7092 // write a commit to the journal
f67539c2
TL
7093 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid, mdr->peer_to_mds,
7094 EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
7c673cae 7095 mdlog->start_entry(le);
f67539c2 7096 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7c673cae
FG
7097 mdlog->flush();
7098 } else {
f67539c2 7099 do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7c673cae
FG
7100 }
7101}
7102
f67539c2 7103void Server::_committed_peer(MDRequestRef& mdr)
7c673cae 7104{
f67539c2 7105 dout(10) << "_committed_peer " << *mdr << dendl;
7c673cae 7106
11fdf7f2 7107 ceph_assert(g_conf()->mds_kill_link_at != 8);
7c673cae 7108
f67539c2
TL
7109 bool assert_exist = mdr->more()->peer_update_journaled;
7110 mdcache->finish_uncommitted_peer(mdr->reqid, assert_exist);
7111 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_COMMITTED);
7112 mds->send_message_mds(req, mdr->peer_to_mds);
7c673cae
FG
7113 mdcache->request_finish(mdr);
7114}
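// _committed_peer closes the loop on the peer side: once the peer commit is
// journaled it sends OP_COMMITTED so the leader can retire its
// uncommitted-update entry for this reqid, then finishes the request.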
7115
7116struct C_MDS_LoggedLinkRollback : public ServerLogContext {
7117 MutationRef mut;
9f95a23c 7118 map<client_t,ref_t<MClientSnap>> splits;
11fdf7f2 7119 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
9f95a23c 7120 map<client_t,ref_t<MClientSnap>>&& _splits) :
11fdf7f2
TL
7121 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
7122 }
7c673cae 7123 void finish(int r) override {
11fdf7f2 7124 server->_link_rollback_finish(mut, mdr, splits);
7c673cae
FG
7125 }
7126};
7127
f67539c2 7128void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7c673cae
FG
7129{
7130 link_rollback rollback;
11fdf7f2
TL
7131 auto p = rbl.cbegin();
7132 decode(rollback, p);
7c673cae
FG
7133
7134 dout(10) << "do_link_rollback on " << rollback.reqid
7135 << (rollback.was_inc ? " inc":" dec")
7136 << " ino " << rollback.ino
7137 << dendl;
7138
11fdf7f2 7139 ceph_assert(g_conf()->mds_kill_link_at != 9);
7c673cae 7140
f67539c2 7141 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
11fdf7f2 7142 ceph_assert(mdr || mds->is_resolve());
7c673cae
FG
7143
7144 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7145 mut->ls = mds->mdlog->get_current_segment();
7146
7147 CInode *in = mdcache->get_inode(rollback.ino);
11fdf7f2 7148 ceph_assert(in);
7c673cae 7149 dout(10) << " target is " << *in << dendl;
f67539c2 7150 ceph_assert(!in->is_projected()); // live peer request hold versionlock xlock.
7c673cae 7151
f67539c2
TL
7152 auto pi = in->project_inode(mut);
7153 pi.inode->version = in->pre_dirty();
7c673cae
FG
7154
7155 // parent dir rctime
7156 CDir *parent = in->get_projected_parent_dn()->get_dir();
f67539c2 7157 auto pf = parent->project_fnode(mut);
7c673cae 7158 pf->version = parent->pre_dirty();
f67539c2 7159 if (pf->fragstat.mtime == pi.inode->ctime) {
7c673cae 7160 pf->fragstat.mtime = rollback.old_dir_mtime;
f67539c2 7161 if (pf->rstat.rctime == pi.inode->ctime)
7c673cae
FG
7162 pf->rstat.rctime = rollback.old_dir_rctime;
7163 mut->add_updated_lock(&parent->get_inode()->filelock);
7164 mut->add_updated_lock(&parent->get_inode()->nestlock);
7165 }
7166
7167 // inode
f67539c2 7168 pi.inode->ctime = rollback.old_ctime;
7c673cae 7169 if (rollback.was_inc)
f67539c2 7170 pi.inode->nlink--;
7c673cae 7171 else
f67539c2 7172 pi.inode->nlink++;
7c673cae 7173
9f95a23c 7174 map<client_t,ref_t<MClientSnap>> splits;
11fdf7f2
TL
7175 if (rollback.snapbl.length() && in->snaprealm) {
7176 bool hadrealm;
7177 auto p = rollback.snapbl.cbegin();
7178 decode(hadrealm, p);
7179 if (hadrealm) {
7180 if (!mds->is_resolve()) {
7181 sr_t *new_srnode = new sr_t();
7182 decode(*new_srnode, p);
7183 in->project_snaprealm(new_srnode);
7184 } else {
7185 decode(in->snaprealm->srnode, p);
7186 }
7187 } else {
7188 SnapRealm *realm = parent->get_inode()->find_snaprealm();
7189 if (!mds->is_resolve())
7190 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
7191 in->project_snaprealm(NULL);
7192 }
7193 }
7194
7c673cae 7195 // journal it
f67539c2
TL
7196 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
7197 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
7c673cae
FG
7198 mdlog->start_entry(le);
7199 le->commit.add_dir_context(parent);
7200 le->commit.add_dir(parent, true);
7201 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
7202
11fdf7f2 7203 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
7c673cae
FG
7204 mdr, __func__);
7205 mdlog->flush();
7206}
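// do_link_rollback undoes a prepared-but-uncommitted peer update by
// replaying the link_rollback blob journaled in handle_peer_link_prep():
// restore ctime, reverse the nlink change, put the parent fragstat/rstat
// times back if ours was the last touch, and restore (or merge away) the
// saved snaprealm state. It runs either on an abort from the leader or
// during resolve after a leader failure (hence the mdr || is_resolve()
// assert above).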
7207
11fdf7f2 7208void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
9f95a23c 7209 map<client_t,ref_t<MClientSnap>>& splits)
7c673cae
FG
7210{
7211 dout(10) << "_link_rollback_finish" << dendl;
7212
11fdf7f2 7213 ceph_assert(g_conf()->mds_kill_link_at != 10);
7c673cae
FG
7214
7215 mut->apply();
11fdf7f2
TL
7216
7217 if (!mds->is_resolve())
7218 mdcache->send_snaps(splits);
7219
7c673cae
FG
7220 if (mdr)
7221 mdcache->request_finish(mdr);
7222
e306af50 7223 mdcache->finish_rollback(mut->reqid, mdr);
7c673cae
FG
7224
7225 mut->cleanup();
7226}
7227
7228
f67539c2 7229void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
7c673cae 7230{
f67539c2 7231 dout(10) << "handle_peer_link_prep_ack " << *mdr
7c673cae
FG
7232 << " " << *m << dendl;
7233 mds_rank_t from = mds_rank_t(m->get_source().num());
7234
11fdf7f2 7235 ceph_assert(g_conf()->mds_kill_link_at != 11);
7c673cae 7236
f67539c2
TL
7237 // note peer
7238 mdr->more()->peers.insert(from);
7c673cae
FG
7239
7240 // witnessed!
11fdf7f2 7241 ceph_assert(mdr->more()->witnessed.count(from) == 0);
7c673cae 7242 mdr->more()->witnessed.insert(from);
11fdf7f2 7243 ceph_assert(!m->is_not_journaled());
f67539c2 7244 mdr->more()->has_journaled_peers = true;
7c673cae
FG
7245
7246 // remove from waiting list
f67539c2
TL
7247 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7248 mdr->more()->waiting_on_peer.erase(from);
7c673cae 7249
f67539c2 7250 ceph_assert(mdr->more()->waiting_on_peer.empty());
7c673cae 7251
9f95a23c
TL
7252 dispatch_client_request(mdr); // go again!
7253}
7c673cae 7254
9f95a23c
TL
7255
7256
7257
7258
7259// UNLINK
7260
7261void Server::handle_client_unlink(MDRequestRef& mdr)
7262{
7263 const cref_t<MClientRequest> &req = mdr->client_request;
7264 client_t client = mdr->get_client();
7265
7266 // rmdir or unlink?
7267 bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
7268
7269 if (rmdir)
7270 mdr->disable_lock_cache();
7271 CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
7272 if (!dn)
7273 return;
7c673cae
FG
7274
7275 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
11fdf7f2 7276 ceph_assert(!dnl->is_null());
9f95a23c 7277 CInode *in = dnl->get_inode();
7c673cae
FG
7278
7279 if (rmdir) {
7280 dout(7) << "handle_client_rmdir on " << *dn << dendl;
7281 } else {
7282 dout(7) << "handle_client_unlink on " << *dn << dendl;
7283 }
7284 dout(7) << "dn links to " << *in << dendl;
7285
7286 // rmdir vs is_dir
7287 if (in->is_dir()) {
7288 if (rmdir) {
7289 // do empty directory checks
7290 if (_dir_is_nonempty_unlocked(mdr, in)) {
f67539c2 7291 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
7292 return;
7293 }
7294 } else {
7295 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
f67539c2 7296 respond_to_request(mdr, -CEPHFS_EISDIR);
7c673cae
FG
7297 return;
7298 }
7299 } else {
7300 if (rmdir) {
7301 // unlink
7302 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
f67539c2 7303 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
7304 return;
7305 }
7306 }
7307
9f95a23c
TL
7308 CInode *diri = dn->get_dir()->get_inode();
7309 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7310 if (!check_access(mdr, diri, MAY_WRITE))
7311 return;
7312 }
7313
7c673cae
FG
7314 // -- create stray dentry? --
7315 CDentry *straydn = NULL;
7316 if (dnl->is_primary()) {
7317 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
7318 if (!straydn)
7319 return;
7320 dout(10) << " straydn is " << *straydn << dendl;
7321 } else if (mdr->straydn) {
7322 mdr->unpin(mdr->straydn);
7323 mdr->straydn = NULL;
7324 }
7325
7326 // lock
9f95a23c
TL
7327 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7328 MutationImpl::LockOpVec lov;
11fdf7f2 7329
9f95a23c
TL
7330 lov.add_xlock(&in->linklock);
7331 lov.add_xlock(&in->snaplock);
7332 if (in->is_dir())
7333 lov.add_rdlock(&in->filelock); // to verify it's empty
7334
7335 if (straydn) {
7336 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7337 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7338 lov.add_xlock(&straydn->lock);
7339 }
11fdf7f2 7340
9f95a23c
TL
7341 if (!mds->locker->acquire_locks(mdr, lov))
7342 return;
7c673cae 7343
9f95a23c
TL
7344 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7345 }
7c673cae
FG
7346
7347 if (in->is_dir() &&
7348 _dir_is_nonempty(mdr, in)) {
f67539c2 7349 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
7350 return;
7351 }
7352
11fdf7f2
TL
7353 if (straydn)
7354 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7355
7356 if (!mdr->more()->desti_srnode) {
7357 if (in->is_projected_snaprealm_global()) {
7358 sr_t *new_srnode = in->prepare_new_srnode(0);
adb31ebb 7359 in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
11fdf7f2
TL
7360 // dropping the last linkage or dropping the last remote linkage,
7361 // detach the inode from the global snaprealm
7362 auto nlink = in->get_projected_inode()->nlink;
7363 if (nlink == 1 ||
7364 (nlink == 2 && !dnl->is_primary() &&
7365 !in->get_projected_parent_dir()->inode->is_stray()))
7366 in->clear_snaprealm_global(new_srnode);
7367 mdr->more()->desti_srnode = new_srnode;
7368 } else if (dnl->is_primary()) {
f67539c2 7369 // prepare snaprealm blob for peer request
11fdf7f2
TL
7370 SnapRealm *realm = in->find_snaprealm();
7371 snapid_t follows = realm->get_newest_seq();
7372 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
7373 sr_t *new_srnode = in->prepare_new_srnode(follows);
7374 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7375 mdr->more()->desti_srnode = new_srnode;
7376 }
7377 }
7378 }
7379
7c673cae
FG
7380 // yay!
7381 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
7382 // subtree root auths need to be witnesses
7383 set<mds_rank_t> witnesses;
7384 in->list_replicas(witnesses);
7385 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7386
7387 for (set<mds_rank_t>::iterator p = witnesses.begin();
7388 p != witnesses.end();
7389 ++p) {
7390 if (mdr->more()->witnessed.count(*p)) {
7391 dout(10) << " already witnessed by mds." << *p << dendl;
f67539c2 7392 } else if (mdr->more()->waiting_on_peer.count(*p)) {
7c673cae
FG
7393 dout(10) << " already waiting on witness mds." << *p << dendl;
7394 } else {
9f95a23c 7395 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
7c673cae
FG
7396 return;
7397 }
7398 }
f67539c2 7399 if (!mdr->more()->waiting_on_peer.empty())
7c673cae
FG
7400 return; // we're waiting for a witness.
7401 }
7402
9f95a23c
TL
7403 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
7404 mds->locker->create_lock_cache(mdr, diri);
7405
7c673cae
FG
7406 // ok!
7407 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
7408 _link_remote(mdr, false, dn, dnl->get_inode());
7409 else
7410 _unlink_local(mdr, dn, straydn);
7411}
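// unlink/rmdir in brief: a primary dentry needs a stray dentry prepared so
// the inode can park under the stray directory until its references drain;
// an rmdir whose inode has subtree roots replicated elsewhere must first
// collect witness acks (_rmdir_prepare_witness). A remote link whose inode
// is not auth here reuses _link_remote(..., inc=false) for the nlink--.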
7412
7413class C_MDS_unlink_local_finish : public ServerLogContext {
7414 CDentry *dn;
7415 CDentry *straydn;
7416 version_t dnpv; // deleted dentry
7417public:
7418 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
7419 ServerLogContext(s, r), dn(d), straydn(sd),
7420 dnpv(d->get_projected_version()) {}
7421 void finish(int r) override {
11fdf7f2 7422 ceph_assert(r == 0);
7c673cae
FG
7423 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
7424 }
7425};
7426
7427void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7428{
7429 dout(10) << "_unlink_local " << *dn << dendl;
7430
7431 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7432 CInode *in = dnl->get_inode();
7433
7c673cae
FG
7434
7435 // ok, let's do it.
7436 mdr->ls = mdlog->get_current_segment();
7437
7438 // prepare log entry
7439 EUpdate *le = new EUpdate(mdlog, "unlink_local");
7440 mdlog->start_entry(le);
7441 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7442 if (!mdr->more()->witnessed.empty()) {
f67539c2 7443 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7c673cae 7444 le->reqid = mdr->reqid;
f67539c2
TL
7445 le->had_peers = true;
7446 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7c673cae
FG
7447 }
7448
7449 if (straydn) {
11fdf7f2 7450 ceph_assert(dnl->is_primary());
7c673cae 7451 straydn->push_projected_linkage(in);
7c673cae
FG
7452 }
7453
7454 // the unlinked dentry
7455 dn->pre_dirty();
7456
f67539c2 7457 auto pi = in->project_inode(mdr);
94b18763
FG
7458 {
7459 std::string t;
7460 dn->make_path_string(t, true);
f67539c2
TL
7461 pi.inode->stray_prior_path = std::move(t);
7462 }
7463 pi.inode->version = in->pre_dirty();
7464 pi.inode->ctime = mdr->get_op_stamp();
7465 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7466 pi.inode->rstat.rctime = mdr->get_op_stamp();
7467 pi.inode->change_attr++;
7468 pi.inode->nlink--;
7469 if (pi.inode->nlink == 0)
7c673cae
FG
7470 in->state_set(CInode::STATE_ORPHAN);
7471
11fdf7f2
TL
7472 if (mdr->more()->desti_srnode) {
7473 auto& desti_srnode = mdr->more()->desti_srnode;
7474 in->project_snaprealm(desti_srnode);
7475 desti_srnode = NULL;
7476 }
7477
7478 if (straydn) {
7479 // will manually pop projected inode
7480
7c673cae 7481 // primary link. add stray dentry.
7c673cae
FG
7482 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
7483 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7484
f67539c2 7485 pi.inode->update_backtrace();
7c673cae
FG
7486 le->metablob.add_primary_dentry(straydn, in, true, true);
7487 } else {
7488 // remote link. update remote inode.
7489 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
7490 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
7491 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
7492 }
7493
7494 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7495 le->metablob.add_null_dentry(dn, true);
7496
7497 if (in->is_dir()) {
7498 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7499 le->metablob.renamed_dirino = in->ino();
7500 }
7501
7502 dn->push_projected_linkage();
7503
11fdf7f2
TL
7504 if (straydn) {
7505 ceph_assert(in->first <= straydn->first);
7506 in->first = straydn->first;
7507 }
7508
7c673cae 7509 if (in->is_dir()) {
11fdf7f2 7510 ceph_assert(straydn);
7c673cae
FG
7511 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7512 }
7513
7514 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
7515}
7516
7517void Server::_unlink_local_finish(MDRequestRef& mdr,
7518 CDentry *dn, CDentry *straydn,
7519 version_t dnpv)
7520{
7521 dout(10) << "_unlink_local_finish " << *dn << dendl;
7522
7523 if (!mdr->more()->witnessed.empty())
f67539c2 7524 mdcache->logged_leader_update(mdr->reqid);
7c673cae 7525
11fdf7f2
TL
7526 CInode *strayin = NULL;
7527 bool hadrealm = false;
7528 if (straydn) {
7529 // if there is a newly created snaprealm, we need to split the old
7530 // snaprealm's inodes_with_caps, so pop the snaprealm before the linkage changes.
7531 strayin = dn->get_linkage()->get_inode();
7532 hadrealm = strayin->snaprealm ? true : false;
7533 strayin->early_pop_projected_snaprealm();
7534 }
7535
7c673cae
FG
7536 // unlink main dentry
7537 dn->get_dir()->unlink_inode(dn);
7538 dn->pop_projected_linkage();
f67539c2 7539 dn->mark_dirty(dnpv, mdr->ls);
7c673cae
FG
7540
7541 // relink as stray? (i.e. was primary link?)
7c673cae
FG
7542 if (straydn) {
7543 dout(20) << " straydn is " << *straydn << dendl;
11fdf7f2 7544 straydn->pop_projected_linkage();
7c673cae
FG
7545 mdcache->touch_dentry_bottom(straydn);
7546 }
7547
7c673cae 7548 mdr->apply();
7c673cae
FG
7549
7550 mdcache->send_dentry_unlink(dn, straydn, mdr);
7551
11fdf7f2
TL
7552 if (straydn) {
7553 // update subtree map?
7554 if (strayin->is_dir())
7555 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
7556
7557 if (strayin->snaprealm && !hadrealm)
7558 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
7559 }
7c673cae
FG
7560
7561 // bump pop
11fdf7f2 7562 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7c673cae
FG
7563
7564 // reply
7565 respond_to_request(mdr, 0);
7566
7567 // removing a new dn?
7568 dn->get_dir()->try_remove_unlinked_dn(dn);
7569
7570 // clean up?
7571 // respond_to_request() drops locks. So stray reintegration can race with us.
7572 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7573 // Tip off the MDCache that this dentry is a stray that
7574 // might be eligible for purge.
7575 mdcache->notify_stray(straydn);
7576 }
7577}
7578
7579bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
7580{
7581 if (mds->is_cluster_degraded() &&
7582 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7583 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
f67539c2 7584 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
7585 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7586 return false;
7587 }
7588
7589 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
f67539c2 7590 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREP);
7c673cae
FG
7591 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
7592 for (auto dn : trace)
94b18763 7593 req->srcdnpath.push_dentry(dn->get_name());
9f95a23c 7594 mdcache->encode_replica_stray(straydn, who, req->straybl);
11fdf7f2
TL
7595 if (mdr->more()->desti_srnode)
7596 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
7c673cae
FG
7597
7598 req->op_stamp = mdr->get_op_stamp();
7599 mds->send_message_mds(req, who);
7600
f67539c2
TL
7601 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
7602 mdr->more()->waiting_on_peer.insert(who);
7c673cae
FG
7603 return true;
7604}
7605
f67539c2 7606struct C_MDS_PeerRmdirPrep : public ServerLogContext {
7c673cae 7607 CDentry *dn, *straydn;
f67539c2 7608 C_MDS_PeerRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
7c673cae
FG
7609 : ServerLogContext(s, r), dn(d), straydn(st) {}
7610 void finish(int r) override {
f67539c2 7611 server->_logged_peer_rmdir(mdr, dn, straydn);
7c673cae
FG
7612 }
7613};
7614
f67539c2 7615struct C_MDS_PeerRmdirCommit : public ServerContext {
7c673cae 7616 MDRequestRef mdr;
31f18b77 7617 CDentry *straydn;
f67539c2 7618 C_MDS_PeerRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
31f18b77 7619 : ServerContext(s), mdr(r), straydn(sd) { }
7c673cae 7620 void finish(int r) override {
f67539c2 7621 server->_commit_peer_rmdir(mdr, r, straydn);
7c673cae
FG
7622 }
7623};
7624
f67539c2 7625void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
7c673cae 7626{
f67539c2
TL
7627 dout(10) << "handle_peer_rmdir_prep " << *mdr
7628 << " " << mdr->peer_request->srcdnpath
7629 << " to " << mdr->peer_request->destdnpath
7c673cae
FG
7630 << dendl;
7631
7632 vector<CDentry*> trace;
f67539c2 7633 filepath srcpath(mdr->peer_request->srcdnpath);
7c673cae
FG
7634 dout(10) << " src " << srcpath << dendl;
7635 CInode *in;
f67539c2 7636 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
9f95a23c
TL
7637 int r = mdcache->path_traverse(mdr, cf, srcpath,
7638 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
7639 &trace, &in);
7c673cae 7640 if (r > 0) return;
f67539c2 7641 if (r == -CEPHFS_ESTALE) {
7c673cae 7642 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
f67539c2 7643 mdr->peer_to_mds, true);
7c673cae
FG
7644 return;
7645 }
11fdf7f2 7646 ceph_assert(r == 0);
91327a77 7647 CDentry *dn = trace.back();
7c673cae
FG
7648 dout(10) << " dn " << *dn << dendl;
7649 mdr->pin(dn);
7650
11fdf7f2 7651 ceph_assert(mdr->straydn);
7c673cae
FG
7652 CDentry *straydn = mdr->straydn;
7653 dout(10) << " straydn " << *straydn << dendl;
7654
f67539c2 7655 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7c673cae
FG
7656
7657 rmdir_rollback rollback;
7658 rollback.reqid = mdr->reqid;
7659 rollback.src_dir = dn->get_dir()->dirfrag();
11fdf7f2 7660 rollback.src_dname = dn->get_name();
7c673cae 7661 rollback.dest_dir = straydn->get_dir()->dirfrag();
11fdf7f2 7662 rollback.dest_dname = straydn->get_name();
f67539c2 7663 if (mdr->peer_request->desti_snapbl.length()) {
11fdf7f2
TL
7664 if (in->snaprealm) {
7665 encode(true, rollback.snapbl);
7666 in->encode_snap_blob(rollback.snapbl);
7667 } else {
7668 encode(false, rollback.snapbl);
7669 }
7670 }
7671 encode(rollback, mdr->more()->rollback_bl);
7672 // FIXME: rollback snaprealm
7c673cae
FG
7673 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
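/*
 * The rollback blob is the only state kept for undoing this prepare, so the
 * encode order here must match the decode order in do_rmdir_rollback() below.
 * A reduced sketch of the round-trip, using the same encode/decode helpers
 * seen in this file (fields trimmed for illustration):
 *
 *   rmdir_rollback rb;
 *   rb.reqid = mdr->reqid;            // plus src/dest dirfrags and dnames
 *   bufferlist bl;
 *   encode(rb, bl);                   // prepare side (this function)
 *
 *   rmdir_rollback out;
 *   auto it = bl.cbegin();
 *   decode(out, it);                  // rollback side, same field order
 *   ceph_assert(out.reqid == rb.reqid);
 */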
7674
7675 // set up commit waiter
f67539c2 7676 mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);
7c673cae 7677
11fdf7f2
TL
7678 straydn->push_projected_linkage(in);
7679 dn->push_projected_linkage();
7c673cae 7680
11fdf7f2
TL
7681 ceph_assert(straydn->first >= in->first);
7682 in->first = straydn->first;
7c673cae 7683
11fdf7f2
TL
7684 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
7685 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
f67539c2 7686 _logged_peer_rmdir(mdr, dn, straydn);
7c673cae
FG
7687 return;
7688 }
7689
e306af50 7690 mdr->ls = mdlog->get_current_segment();
f67539c2
TL
7691 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
7692 EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
7c673cae
FG
7693 mdlog->start_entry(le);
7694 le->rollback = mdr->more()->rollback_bl;
7695
7696 le->commit.add_dir_context(straydn->get_dir());
7697 le->commit.add_primary_dentry(straydn, in, true);
f67539c2 7698 // peer: no need to journal original dentry
7c673cae
FG
7699
7700 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7701 le->commit.renamed_dirino = in->ino();
7702
7703 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
f67539c2 7704 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7c673cae 7705
f67539c2
TL
7706 mdr->more()->peer_update_journaled = true;
7707 submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
7c673cae
FG
7708 mdr, __func__);
7709 mdlog->flush();
7710}
7711
f67539c2 7712void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7c673cae 7713{
f67539c2 7714 dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
11fdf7f2
TL
7715 CInode *in = dn->get_linkage()->get_inode();
7716
7717 bool new_realm;
f67539c2 7718 if (mdr->peer_request->desti_snapbl.length()) {
11fdf7f2 7719 new_realm = !in->snaprealm;
f67539c2 7720 in->decode_snap_blob(mdr->peer_request->desti_snapbl);
11fdf7f2 7721 ceph_assert(in->snaprealm);
11fdf7f2
TL
7722 } else {
7723 new_realm = false;
7724 }
7c673cae
FG
7725
7726 // update our cache now, so we are consistent with what is in the journal
7727 // when we journal a subtree map
7c673cae
FG
7728 dn->get_dir()->unlink_inode(dn);
7729 straydn->pop_projected_linkage();
7730 dn->pop_projected_linkage();
11fdf7f2 7731
f67539c2 7732 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->peer_update_journaled);
11fdf7f2
TL
7733
7734 if (new_realm)
7735 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
7c673cae
FG
7736
7737 // done.
f67539c2 7738 mdr->reset_peer_request();
7c673cae
FG
7739 mdr->straydn = 0;
7740
7741 if (!mdr->aborted) {
f67539c2
TL
7742 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREPACK);
7743 if (!mdr->more()->peer_update_journaled)
11fdf7f2 7744 reply->mark_not_journaled();
f67539c2 7745 mds->send_message_mds(reply, mdr->peer_to_mds);
7c673cae
FG
7746 } else {
7747 dout(10) << " abort flag set, finishing" << dendl;
7748 mdcache->request_finish(mdr);
7749 }
7750}
7751
f67539c2 7752void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7c673cae 7753{
f67539c2 7754 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
7c673cae
FG
7755 << " " << *ack << dendl;
7756
7757 mds_rank_t from = mds_rank_t(ack->get_source().num());
7758
f67539c2 7759 mdr->more()->peers.insert(from);
7c673cae
FG
7760 mdr->more()->witnessed.insert(from);
7761 if (!ack->is_not_journaled())
f67539c2 7762 mdr->more()->has_journaled_peers = true;
7c673cae
FG
7763
7764 // remove from waiting list
f67539c2
TL
7765 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7766 mdr->more()->waiting_on_peer.erase(from);
7c673cae 7767
f67539c2 7768 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
7769 dispatch_client_request(mdr); // go again!
7770 else
f67539c2 7771 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
7772}
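/*
 * waiting_on_peer behaves as a gather barrier: each prepared witness is
 * erased on ack, and only the ack that empties the set re-dispatches the
 * request. A generic sketch of that pattern (hypothetical Gather type, not
 * the MDS machinery):
 *
 *   #include <set>
 *   #include <functional>
 *
 *   struct Gather {
 *     std::set<int> waiting_on;          // ranks we still expect acks from
 *     std::function<void()> on_done;     // e.g. re-dispatch of the request
 *     void ack(int who) {
 *       waiting_on.erase(who);
 *       if (waiting_on.empty() && on_done)
 *         on_done();
 *     }
 *   };
 */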
7773
f67539c2 7774void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
7c673cae 7775{
f67539c2 7776 dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;
e306af50 7777
7c673cae 7778 if (r == 0) {
f67539c2 7779 if (mdr->more()->peer_update_journaled) {
31f18b77
FG
7780 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7781 if (strayin && !strayin->snaprealm)
7782 mdcache->clear_dirty_bits_for_stray(strayin);
7783 }
7784
7c673cae
FG
7785 mdr->cleanup();
7786
f67539c2 7787 if (mdr->more()->peer_update_journaled) {
7c673cae 7788 // write a commit to the journal
f67539c2
TL
7789 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
7790 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
7791 EPeerUpdate::RMDIR);
7c673cae 7792 mdlog->start_entry(le);
f67539c2 7793 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7c673cae
FG
7794 mdlog->flush();
7795 } else {
f67539c2 7796 _committed_peer(mdr);
7c673cae
FG
7797 }
7798 } else {
7799 // abort
f67539c2 7800 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7c673cae
FG
7801 }
7802}
7803
7804struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
7805 metareqid_t reqid;
7806 CDentry *dn;
7807 CDentry *straydn;
7808 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
7809 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
7810 void finish(int r) override {
7811 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
7812 }
7813};
7814
f67539c2 7815void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7c673cae
FG
7816{
7817 // unlike the other rollback methods, the rmdir rollback is only
7818 // needed to record the subtree changes in the journal for inode
7819 // replicas who are auth for empty dirfrags. no actual changes to
7820 // the file system are taking place here, so there is no Mutation.
7821
7822 rmdir_rollback rollback;
11fdf7f2
TL
7823 auto p = rbl.cbegin();
7824 decode(rollback, p);
7c673cae
FG
7825
7826 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
f67539c2 7827 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
11fdf7f2 7828 ceph_assert(mdr || mds->is_resolve());
7c673cae
FG
7829
7830 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
7831 if (!dir)
7832 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
11fdf7f2 7833 ceph_assert(dir);
7c673cae 7834 CDentry *dn = dir->lookup(rollback.src_dname);
11fdf7f2 7835 ceph_assert(dn);
7c673cae 7836 dout(10) << " dn " << *dn << dendl;
11fdf7f2
TL
7837 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
7838 ceph_assert(straydir);
7839 CDentry *straydn = straydir->lookup(rollback.dest_dname);
7840 ceph_assert(straydn);
7841 dout(10) << " straydn " << *straydn << dendl;
7c673cae
FG
7842 CInode *in = straydn->get_linkage()->get_inode();
7843
11fdf7f2
TL
7844 dn->push_projected_linkage(in);
7845 straydn->push_projected_linkage();
7c673cae 7846
11fdf7f2
TL
7847 if (rollback.snapbl.length() && in->snaprealm) {
7848 bool hadrealm;
7849 auto p = rollback.snapbl.cbegin();
7850 decode(hadrealm, p);
7851 if (hadrealm) {
7852 decode(in->snaprealm->srnode, p);
7853 } else {
7854 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
7855 }
7856 }
7c673cae 7857
f67539c2 7858 if (mdr && !mdr->more()->peer_update_journaled) {
11fdf7f2 7859 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
7c673cae 7860
11fdf7f2 7861 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
7c673cae
FG
7862 return;
7863 }
7864
7c673cae 7865
f67539c2
TL
7866 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
7867 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
7c673cae
FG
7868 mdlog->start_entry(le);
7869
7870 le->commit.add_dir_context(dn->get_dir());
7871 le->commit.add_primary_dentry(dn, in, true);
f67539c2 7872 // peer: no need to journal straydn
7c673cae
FG
7873
7874 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7875 le->commit.renamed_dirino = in->ino();
7876
7877 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
7878
7879 submit_mdlog_entry(le,
7880 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
7881 dn, straydn),
7882 mdr, __func__);
7883 mdlog->flush();
7884}
7885
7886void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
7887{
7888 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
7889
7890 straydn->get_dir()->unlink_inode(straydn);
7891 dn->pop_projected_linkage();
7892 straydn->pop_projected_linkage();
7893
7894 CInode *in = dn->get_linkage()->get_inode();
11fdf7f2 7895 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
f67539c2 7896 !mdr || mdr->more()->peer_update_journaled);
11fdf7f2 7897
7c673cae
FG
7898 if (mds->is_resolve()) {
7899 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
7900 mdcache->try_trim_non_auth_subtree(root);
7901 }
7902
7903 if (mdr)
7904 mdcache->request_finish(mdr);
7905
e306af50 7906 mdcache->finish_rollback(reqid, mdr);
7c673cae
FG
7907}
7908
7909
7910/** _dir_is_nonempty[_unlocked]
7911 *
7912 * check if a directory is non-empty (i.e. whether we can rmdir it).
7913 *
7914 * the unlocked variant is a fastpath check. we can't really be
7915 * sure until we rdlock the filelock.
7916 */
7917bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
7918{
7919 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
11fdf7f2 7920 ceph_assert(in->is_auth());
7c673cae 7921
9f95a23c
TL
7922 if (in->filelock.is_cached())
7923 return false; // there can be pending async create/unlink. don't know.
7c673cae
FG
7924 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
7925 return true; // in a snapshot!
7926
9f95a23c
TL
7927 auto&& ls = in->get_dirfrags();
7928 for (const auto& dir : ls) {
7c673cae
FG
7929 // is the frag obviously non-empty?
7930 if (dir->is_auth()) {
7931 if (dir->get_projected_fnode()->fragstat.size()) {
7932 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7933 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
7934 return true;
7935 }
7936 }
7937 }
7938
7939 return false;
7940}
7941
7942bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
7943{
7944 dout(10) << "dir_is_nonempty " << *in << dendl;
11fdf7f2
TL
7945 ceph_assert(in->is_auth());
7946 ceph_assert(in->filelock.can_read(mdr->get_client()));
7c673cae
FG
7947
7948 frag_info_t dirstat;
7949 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
7950
9f95a23c
TL
7951 auto&& ls = in->get_dirfrags();
7952 for (const auto& dir : ls) {
f67539c2 7953 const auto& pf = dir->get_projected_fnode();
7c673cae
FG
7954 if (pf->fragstat.size()) {
7955 dout(10) << "dir_is_nonempty dirstat has "
7956 << pf->fragstat.size() << " items " << *dir << dendl;
7957 return true;
7958 }
7959
7960 if (pf->accounted_fragstat.version == dirstat_version)
7961 dirstat.add(pf->accounted_fragstat);
7962 else
7963 dirstat.add(pf->fragstat);
7964 }
7965
7966 return dirstat.size() != in->get_projected_inode()->dirstat.size();
7967}
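/*
 * A minimal standalone sketch of the accounting argument above (FragStat is
 * a hypothetical stand-in for frag_info_t, not the MDS type; assumes
 * <vector>). A directory is believed empty only when no fragment has live
 * entries AND the per-fragment accounted sums still reconcile with the
 * inode-level dirstat; any mismatch is treated as "nonempty" to stay safe.
 *
 *   struct FragStat {
 *     long current = 0;    // like fragstat.size()
 *     long accounted = 0;  // like accounted_fragstat.size(), possibly stale
 *   };
 *
 *   bool dir_is_nonempty_sketch(const std::vector<FragStat>& frags,
 *                               long inode_dirstat_size) {
 *     long sum = 0;
 *     for (const auto& f : frags) {
 *       if (f.current)
 *         return true;        // some fragment clearly has entries
 *       sum += f.accounted;   // the real code picks accounted vs current
 *                             // by comparing stat versions
 *     }
 *     return sum != inode_dirstat_size;  // unreconciled => assume nonempty
 *   }
 */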
7968
7969
7970// ======================================================
7971
7972
7973class C_MDS_rename_finish : public ServerLogContext {
7974 CDentry *srcdn;
7975 CDentry *destdn;
7976 CDentry *straydn;
7977public:
7978 C_MDS_rename_finish(Server *s, MDRequestRef& r,
7979 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
7980 ServerLogContext(s, r),
7981 srcdn(sdn), destdn(ddn), straydn(stdn) { }
7982 void finish(int r) override {
11fdf7f2 7983 ceph_assert(r == 0);
7c673cae
FG
7984 server->_rename_finish(mdr, srcdn, destdn, straydn);
7985 }
7986};
7987
7988
7989/** handle_client_rename
7990 *
f67539c2 7991 * rename leader is the destdn auth. this is because cached inodes
7c673cae
FG
7992 * must remain connected. thus, any replica of srci, must also
7993 * replicate destdn, and possibly straydn, so that srci (and
7994 * destdn->inode) remain connected during the rename.
7995 *
f67539c2 7996 * to do this, we freeze srci, then leader (destdn auth) verifies that
7c673cae
FG
7997 * all other nodes have also replicated destdn and straydn. note that
7998 * destdn replicas need not also replicate srci. this only works when
f67539c2 7999 * destdn is leader.
7c673cae
FG
8000 *
8001 * This function takes responsibility for the passed mdr.
8002 */
8003void Server::handle_client_rename(MDRequestRef& mdr)
8004{
f67539c2 8005 const auto& req = mdr->client_request;
7c673cae
FG
8006 dout(7) << "handle_client_rename " << *req << dendl;
8007
8008 filepath destpath = req->get_filepath();
8009 filepath srcpath = req->get_filepath2();
91327a77 8010 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
f67539c2
TL
8011 respond_to_request(mdr, -CEPHFS_EBUSY);
8012 return;
8013 }
8014
8015 if (req->get_alternate_name().size() > alternate_name_max) {
8016 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
8017 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
91327a77
AA
8018 return;
8019 }
8020
9f95a23c
TL
8021 auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
8022 if (!destdn)
8023 return;
7c673cae 8024
7c673cae 8025 dout(10) << " destdn " << *destdn << dendl;
7c673cae 8026 CDir *destdir = destdn->get_dir();
11fdf7f2 8027 ceph_assert(destdir->is_auth());
9f95a23c 8028 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7c673cae 8029
7c673cae 8030 dout(10) << " srcdn " << *srcdn << dendl;
11fdf7f2 8031 CDir *srcdir = srcdn->get_dir();
7c673cae
FG
8032 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8033 CInode *srci = srcdnl->get_inode();
8034 dout(10) << " srci " << *srci << dendl;
8035
9f95a23c
TL
8036 // -- some sanity checks --
8037 if (destdn == srcdn) {
8038 dout(7) << "rename src=dest, noop" << dendl;
8039 respond_to_request(mdr, 0);
8040 return;
8041 }
8042
8043 // dest a child of src?
8044 // e.g. mv /usr /usr/foo
8045 if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
8046 dout(7) << "cannot rename item to be a child of itself" << dendl;
f67539c2 8047 respond_to_request(mdr, -CEPHFS_EINVAL);
9f95a23c
TL
8048 return;
8049 }
8050
8051 // is this a stray migration, reintegration or merge? (sanity checks!)
8052 if (mdr->reqid.name.is_mds() &&
8053 !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
8054 MDS_INO_IS_STRAY(destpath.get_ino())) &&
8055 !(destdnl->is_remote() &&
8056 destdnl->get_remote_ino() == srci->ino())) {
f67539c2 8057 respond_to_request(mdr, -CEPHFS_EINVAL); // actually, this won't reply, but whatev.
9f95a23c
TL
8058 return;
8059 }
8060
7c673cae
FG
8061 CInode *oldin = 0;
8062 if (!destdnl->is_null()) {
8063 //dout(10) << "dest dn exists " << *destdn << dendl;
8064 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
8065 if (!oldin) return;
8066 dout(10) << " oldin " << *oldin << dendl;
7c673cae
FG
8067
8068 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8069 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
f67539c2 8070 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
8071 return;
8072 }
181888fb 8073
9f95a23c
TL
8074 // mv /some/thing /to/some/existing_other_thing
8075 if (oldin->is_dir() && !srci->is_dir()) {
f67539c2 8076 respond_to_request(mdr, -CEPHFS_EISDIR);
9f95a23c
TL
8077 return;
8078 }
8079 if (!oldin->is_dir() && srci->is_dir()) {
f67539c2 8080 respond_to_request(mdr, -CEPHFS_ENOTDIR);
9f95a23c
TL
8081 return;
8082 }
8083 if (srci == oldin && !srcdir->inode->is_stray()) {
8084 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
8085 return;
7c673cae 8086 }
f67539c2
TL
8087 if (destdn->get_alternate_name() != req->get_alternate_name()) {
8088 /* the dentry exists but the alternate_names do not match, fail... */
8089 respond_to_request(mdr, -CEPHFS_EINVAL);
8090 return;
8091 }
7c673cae
FG
8092 }
8093
9f95a23c
TL
8094 vector<CDentry*>& srctrace = mdr->dn[1];
8095 vector<CDentry*>& desttrace = mdr->dn[0];
7c673cae
FG
8096
8097 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8098 if (destpath.get_ino() != srcpath.get_ino() &&
8099 !(req->get_source().is_mds() &&
9f95a23c 8100 MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
7c673cae
FG
8101 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
8102 CInode *destbase = desttrace[0]->get_dir()->get_inode();
8103 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8104 while (srcbase != destbase &&
8105 !srcbase->is_projected_ancestor_of(destbase)) {
8106 CDentry *pdn = srcbase->get_projected_parent_dn();
8107 srctrace.insert(srctrace.begin(), pdn);
8108 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
8109 srcbase = pdn->get_dir()->get_inode();
8110 }
8111
8112 // then, extend destpath until it shares the same parent inode as srcpath.
8113 while (destbase != srcbase) {
8114 CDentry *pdn = destbase->get_projected_parent_dn();
8115 desttrace.insert(desttrace.begin(), pdn);
7c673cae
FG
8116 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
8117 destbase = pdn->get_dir()->get_inode();
8118 }
8119 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
8120 }
8121
7c673cae 8122
11fdf7f2 8123 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
7c673cae
FG
8124 if (linkmerge)
8125 dout(10) << " this is a link merge" << dendl;
8126
8127 // -- create stray dentry? --
8128 CDentry *straydn = NULL;
8129 if (destdnl->is_primary() && !linkmerge) {
8130 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
8131 if (!straydn)
8132 return;
8133 dout(10) << " straydn is " << *straydn << dendl;
8134 } else if (mdr->straydn) {
8135 mdr->unpin(mdr->straydn);
8136 mdr->straydn = NULL;
8137 }
8138
7c673cae
FG
8139
8140 // -- locks --
9f95a23c
TL
8141 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
8142 MutationImpl::LockOpVec lov;
7c673cae 8143
9f95a23c
TL
8144 // we need to update srci's ctime. xlock its least contended lock to do that...
8145 lov.add_xlock(&srci->linklock);
8146 lov.add_xlock(&srci->snaplock);
7c673cae 8147
9f95a23c
TL
8148 if (oldin) {
8149 // xlock oldin (for nlink--)
8150 lov.add_xlock(&oldin->linklock);
8151 lov.add_xlock(&oldin->snaplock);
8152 if (oldin->is_dir()) {
8153 ceph_assert(srci->is_dir());
11fdf7f2 8154 lov.add_rdlock(&oldin->filelock); // to verify it's empty
7c673cae 8155
9f95a23c
TL
8156 // adjust locking order?
8157 int cmp = mdr->compare_paths();
8158 if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
8159 std::reverse(lov.begin(), lov.end());
8160 } else {
8161 ceph_assert(!srci->is_dir());
8162 // adjust locking order?
8163 if (srci->ino() > oldin->ino())
8164 std::reverse(lov.begin(), lov.end());
8165 }
8166 }
8167
8168 // straydn?
8169 if (straydn) {
8170 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
8171 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
8172 lov.add_xlock(&straydn->lock);
8173 }
8174
8175 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
8176 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
8177 return;
8178
8179 mdr->locking_state |= MutationImpl::ALL_LOCKED;
8180 }
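/*
 * The std::reverse() calls above are a deadlock-avoidance device: every
 * request that must lock the same pair of inodes takes the locks in one
 * canonical order (path comparison first, inode number as tie-breaker).
 * A minimal sketch of the same idea with plain mutexes (illustrative only;
 * the real work is done by Locker::acquire_locks and the LockOpVec):
 *
 *   #include <mutex>
 *   #include <cstdint>
 *
 *   void lock_pair_in_order(std::mutex& a, uint64_t ino_a,
 *                           std::mutex& b, uint64_t ino_b) {
 *     if (ino_a < ino_b) { a.lock(); b.lock(); }
 *     else               { b.lock(); a.lock(); }
 *     // with all callers agreeing on the order, lock cycles cannot form
 *   }
 */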
7c673cae 8181
11fdf7f2
TL
8182 if (linkmerge)
8183 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
8184
7c673cae 8185 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
11fdf7f2 8186 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
7c673cae
FG
8187 return;
8188
8189 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
8190 return;
8191
8192 if (!check_fragment_space(mdr, destdn->get_dir()))
8193 return;
8194
8195 if (!check_access(mdr, srci, MAY_WRITE))
8196 return;
8197 }
8198
8199 // with read lock, really verify oldin is empty
8200 if (oldin &&
8201 oldin->is_dir() &&
8202 _dir_is_nonempty(mdr, oldin)) {
f67539c2 8203 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
8204 return;
8205 }
8206
11fdf7f2 8207 /* project_snaprealm_past_parent() will do this job
7c673cae
FG
8208 *
8209 // moving between snaprealms?
8210 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8211 SnapRealm *srcrealm = srci->find_snaprealm();
8212 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8213 if (srcrealm != destrealm &&
8214 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8215 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8216 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8217 mdcache->snaprealm_create(mdr, srci);
8218 return;
8219 }
8220 }
8221 */
8222
adb31ebb
TL
8223 SnapRealm *dest_realm = nullptr;
8224 SnapRealm *src_realm = nullptr;
8225 if (!linkmerge) {
8226 dest_realm = destdir->inode->find_snaprealm();
8227 if (srcdir->inode == destdir->inode)
8228 src_realm = dest_realm;
8229 else
8230 src_realm = srcdir->inode->find_snaprealm();
8231 if (src_realm != dest_realm &&
8232 src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
f67539c2 8233 respond_to_request(mdr, -CEPHFS_EXDEV);
adb31ebb
TL
8234 return;
8235 }
8236 }
8237
11fdf7f2 8238 ceph_assert(g_conf()->mds_kill_rename_at != 1);
7c673cae
FG
8239
8240 // -- open all srcdn inode frags, if any --
8241 // we need these open so that auth can properly delegate from inode to dirfrags
8242 // after the inode is _ours_.
8243 if (srcdnl->is_primary() &&
8244 !srcdn->is_auth() &&
8245 srci->is_dir()) {
8246 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
8247 mdr->set_stickydirs(srci);
8248
11fdf7f2
TL
8249 frag_vec_t leaves;
8250 srci->dirfragtree.get_leaves(leaves);
8251 for (const auto& leaf : leaves) {
8252 CDir *dir = srci->get_dirfrag(leaf);
7c673cae 8253 if (!dir) {
11fdf7f2
TL
8254 dout(10) << " opening " << leaf << " under " << *srci << dendl;
8255 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
7c673cae
FG
8256 return;
8257 }
8258 }
8259 }
8260
11fdf7f2
TL
8261 // -- prepare snaprealm ---
8262
8263 if (linkmerge) {
8264 if (!mdr->more()->srci_srnode &&
8265 srci->get_projected_inode()->nlink == 1 &&
8266 srci->is_projected_snaprealm_global()) {
8267 sr_t *new_srnode = srci->prepare_new_srnode(0);
adb31ebb 8268 srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
11fdf7f2
TL
8269
8270 srci->clear_snaprealm_global(new_srnode);
8271 mdr->more()->srci_srnode = new_srnode;
8272 }
8273 } else {
8274 if (oldin && !mdr->more()->desti_srnode) {
8275 if (oldin->is_projected_snaprealm_global()) {
8276 sr_t *new_srnode = oldin->prepare_new_srnode(0);
adb31ebb 8277 oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
11fdf7f2
TL
8278 // dropping the last linkage or dropping the last remote linkage,
8279 // detach the inode from the global snaprealm
8280 auto nlink = oldin->get_projected_inode()->nlink;
8281 if (nlink == 1 ||
8282 (nlink == 2 && !destdnl->is_primary() &&
8283 !oldin->get_projected_parent_dir()->inode->is_stray()))
8284 oldin->clear_snaprealm_global(new_srnode);
8285 mdr->more()->desti_srnode = new_srnode;
8286 } else if (destdnl->is_primary()) {
11fdf7f2
TL
8287 snapid_t follows = dest_realm->get_newest_seq();
8288 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
8289 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
8290 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
8291 mdr->more()->desti_srnode = new_srnode;
8292 }
8293 }
8294 }
8295 if (!mdr->more()->srci_srnode) {
11fdf7f2
TL
8296 if (srci->is_projected_snaprealm_global()) {
8297 sr_t *new_srnode = srci->prepare_new_srnode(0);
adb31ebb 8298 srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
11fdf7f2
TL
8299 mdr->more()->srci_srnode = new_srnode;
8300 } else if (srcdnl->is_primary()) {
11fdf7f2
TL
8301 snapid_t follows = src_realm->get_newest_seq();
8302 if (src_realm != dest_realm &&
8303 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
8304 sr_t *new_srnode = srci->prepare_new_srnode(follows);
8305 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
8306 mdr->more()->srci_srnode = new_srnode;
8307 }
8308 }
8309 }
8310 }
8311
7c673cae
FG
8312 // -- prepare witnesses --
8313
9f95a23c
TL
8314 /*
8315 * NOTE: we use _all_ replicas as witnesses.
8316 * this probably isn't totally necessary (esp for file renames),
8317 * but if/when we change that, we have to make sure rejoin is
8318 * sufficiently robust to handle strong rejoins from survivors
8319 * with totally wrong dentry->inode linkage.
8320 * (currently, it can ignore rename effects, because the resolve
8321 * stage will sort them out.)
8322 */
8323 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
8324 if (srcdn->is_auth())
8325 srcdn->list_replicas(witnesses);
8326 else
8327 witnesses.insert(srcdn->authority().first);
8328 if (srcdnl->is_remote() && !srci->is_auth())
8329 witnesses.insert(srci->authority().first);
8330 destdn->list_replicas(witnesses);
8331 if (destdnl->is_remote() && !oldin->is_auth())
8332 witnesses.insert(oldin->authority().first);
8333 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
8334
8335 if (!witnesses.empty()) {
8336 // Replicas can't see projected dentry linkages and will get confused.
8337 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8338 // can't project these inodes' linkages.
8339 bool need_flush = false;
8340 for (auto& dn : srctrace) {
8341 if (dn->is_projected()) {
8342 need_flush = true;
8343 break;
8344 }
8345 }
8346 if (!need_flush) {
8347 CDentry *dn = destdn;
8348 do {
8349 if (dn->is_projected()) {
8350 need_flush = true;
8351 break;
8352 }
8353 CInode *diri = dn->get_dir()->get_inode();
8354 dn = diri->get_projected_parent_dn();
8355 } while (dn);
8356 }
8357 if (need_flush) {
8358 mdlog->wait_for_safe(
8359 new MDSInternalContextWrapper(mds,
8360 new C_MDS_RetryRequest(mdcache, mdr)));
8361 mdlog->flush();
8362 return;
8363 }
8364 }
8365
7c673cae
FG
8366 // do srcdn auth last
8367 mds_rank_t last = MDS_RANK_NONE;
8368 if (!srcdn->is_auth()) {
8369 last = srcdn->authority().first;
8370 mdr->more()->srcdn_auth_mds = last;
8371 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8372 // are involved in the rename operation.
8373 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
8374 dout(10) << " preparing ambiguous auth for srci" << dendl;
11fdf7f2
TL
8375 ceph_assert(mdr->more()->is_remote_frozen_authpin);
8376 ceph_assert(mdr->more()->rename_inode == srci);
7c673cae
FG
8377 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8378 return;
8379 }
8380 }
8381
8382 for (set<mds_rank_t>::iterator p = witnesses.begin();
8383 p != witnesses.end();
8384 ++p) {
8385 if (*p == last) continue; // do it last!
8386 if (mdr->more()->witnessed.count(*p)) {
8387 dout(10) << " already witnessed by mds." << *p << dendl;
f67539c2 8388 } else if (mdr->more()->waiting_on_peer.count(*p)) {
7c673cae
FG
8389 dout(10) << " already waiting on witness mds." << *p << dendl;
8390 } else {
8391 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
8392 return;
8393 }
8394 }
f67539c2 8395 if (!mdr->more()->waiting_on_peer.empty())
7c673cae
FG
8396 return; // we're waiting for a witness.
8397
8398 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
8399 dout(10) << " preparing last witness (srcdn auth)" << dendl;
f67539c2 8400 ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
7c673cae
FG
8401 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8402 return;
8403 }
8404
f67539c2
TL
8405 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
8406 if (!mdr->more()->peers.empty() && !srci->is_dir())
11fdf7f2 8407 ceph_assert(g_conf()->mds_kill_rename_at != 3);
f67539c2 8408 if (!mdr->more()->peers.empty() && srci->is_dir())
11fdf7f2 8409 ceph_assert(g_conf()->mds_kill_rename_at != 4);
7c673cae
FG
8410
8411 // -- declare now --
8412 mdr->set_mds_stamp(ceph_clock_now());
8413
8414 // -- prepare journal entry --
8415 mdr->ls = mdlog->get_current_segment();
8416 EUpdate *le = new EUpdate(mdlog, "rename");
8417 mdlog->start_entry(le);
f67539c2 8418 le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
7c673cae 8419 if (!mdr->more()->witnessed.empty()) {
f67539c2 8420 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7c673cae
FG
8421
8422 le->reqid = mdr->reqid;
f67539c2 8423 le->had_peers = true;
7c673cae 8424
f67539c2 8425 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7c673cae
FG
8426 // no need to send frozen auth pin to recovering auth MDS of srci
8427 mdr->more()->is_remote_frozen_authpin = false;
8428 }
8429
f67539c2 8430 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
7c673cae
FG
8431 if (le->client_map.length())
8432 le->cmapv = mds->sessionmap.get_projected();
8433
8434 // -- commit locally --
8435 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
8436
8437 journal_and_reply(mdr, srci, destdn, le, fin);
81eedcae 8438 mds->balancer->maybe_fragment(destdn->get_dir(), false);
7c673cae
FG
8439}
8440
8441
8442void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8443{
8444 dout(10) << "_rename_finish " << *mdr << dendl;
8445
8446 if (!mdr->more()->witnessed.empty())
f67539c2 8447 mdcache->logged_leader_update(mdr->reqid);
7c673cae
FG
8448
8449 // apply
8450 _rename_apply(mdr, srcdn, destdn, straydn);
8451
8452 mdcache->send_dentry_link(destdn, mdr);
8453
8454 CDentry::linkage_t *destdnl = destdn->get_linkage();
8455 CInode *in = destdnl->get_inode();
8456 bool need_eval = mdr->more()->cap_imports.count(in);
8457
f67539c2
TL
8458 // test hack: test peer commit
8459 if (!mdr->more()->peers.empty() && !in->is_dir())
11fdf7f2 8460 ceph_assert(g_conf()->mds_kill_rename_at != 5);
f67539c2 8461 if (!mdr->more()->peers.empty() && in->is_dir())
11fdf7f2 8462 ceph_assert(g_conf()->mds_kill_rename_at != 6);
7c673cae
FG
8463
8464 // bump popularity
11fdf7f2 8465 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
7c673cae 8466 if (destdnl->is_remote() && in->is_auth())
11fdf7f2 8467 mds->balancer->hit_inode(in, META_POP_IWR);
7c673cae
FG
8468
8469 // did we import srci? if so, explicitly ack that import before we unlock and reply.
8470
11fdf7f2 8471 ceph_assert(g_conf()->mds_kill_rename_at != 7);
7c673cae
FG
8472
8473 // reply
8474 respond_to_request(mdr, 0);
8475
8476 if (need_eval)
8477 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
8478
8479 // clean up?
8480 // respond_to_request() drops locks. So stray reintegration can race with us.
8481 if (straydn && !straydn->get_projected_linkage()->is_null()) {
8482 mdcache->notify_stray(straydn);
8483 }
8484}
8485
8486
8487
8488// helpers
8489
8490bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
8491 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
8492{
f67539c2
TL
8493 const auto& client_req = mdr->client_request;
8494 ceph_assert(client_req);
8495
7c673cae
FG
8496 if (mds->is_cluster_degraded() &&
8497 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
8498 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
f67539c2 8499 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
8500 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
8501 return false;
8502 }
8503
8504 dout(10) << "_rename_prepare_witness mds." << who << dendl;
f67539c2 8505 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);
7c673cae
FG
8506
8507 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
8508 for (auto dn : srctrace)
94b18763 8509 req->srcdnpath.push_dentry(dn->get_name());
7c673cae
FG
8510 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
8511 for (auto dn : dsttrace)
94b18763 8512 req->destdnpath.push_dentry(dn->get_name());
f67539c2 8513 req->alternate_name = client_req->alternate_name;
7c673cae 8514 if (straydn)
9f95a23c 8515 mdcache->encode_replica_stray(straydn, who, req->straybl);
11fdf7f2
TL
8516
8517 if (mdr->more()->srci_srnode)
8518 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
8519 if (mdr->more()->desti_srnode)
8520 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
31f18b77
FG
8521
8522 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
7c673cae
FG
8523
8524 // srcdn auth will verify our current witness list is sufficient
8525 req->witnesses = witnesses;
8526
8527 req->op_stamp = mdr->get_op_stamp();
8528 mds->send_message_mds(req, who);
8529
f67539c2
TL
8530 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
8531 mdr->more()->waiting_on_peer.insert(who);
7c673cae
FG
8532 return true;
8533}
8534
8535version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
8536{
8537 version_t oldpv = mdr->more()->inode_import_v;
8538
8539 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8540
8541 /* import node */
11fdf7f2 8542 auto blp = mdr->more()->inode_import.cbegin();
7c673cae
FG
8543
8544 // imported caps
28e407b8 8545 map<client_t,entity_inst_t> client_map;
11fdf7f2 8546 map<client_t, client_metadata_t> client_metadata_map;
28e407b8 8547 decode(client_map, blp);
11fdf7f2
TL
8548 decode(client_metadata_map, blp);
8549 prepare_force_open_sessions(client_map, client_metadata_map,
8550 mdr->more()->imported_session_map);
28e407b8 8551 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
11fdf7f2 8552 encode(client_metadata_map, *client_map_bl);
7c673cae
FG
8553
8554 list<ScatterLock*> updated_scatterlocks;
8555 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
8556 mdr->more()->cap_imports, updated_scatterlocks);
8557
8558 // hack: force back to !auth and clean, temporarily
8559 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
8560 srcdnl->get_inode()->mark_clean();
8561
8562 return oldpv;
8563}
8564
8565bool Server::_need_force_journal(CInode *diri, bool empty)
8566{
9f95a23c 8567 auto&& dirs = diri->get_dirfrags();
7c673cae
FG
8568
8569 bool force_journal = false;
8570 if (empty) {
11fdf7f2
TL
8571 for (const auto& dir : dirs) {
8572 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
8573 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
7c673cae
FG
8574 force_journal = true;
8575 break;
8576 } else
11fdf7f2 8577 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
7c673cae
FG
8578 }
8579 } else {
8580 // see if any children of our frags are auth subtrees.
11fdf7f2
TL
8581 std::vector<CDir*> subtrees;
8582 mdcache->get_subtrees(subtrees);
8583 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
8584 for (const auto& dir : dirs) {
8585 for (const auto& subtree : subtrees) {
8586 if (dir->contains(subtree)) {
8587 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
8588 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
8589 << *subtree << dendl;
7c673cae
FG
8590 force_journal = true;
8591 break;
8592 } else
11fdf7f2 8593 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
7c673cae 8594 } else
11fdf7f2 8595 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
7c673cae
FG
8596 }
8597 if (force_journal)
8598 break;
8599 }
8600 }
8601 return force_journal;
8602}
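/*
 * The non-empty branch above is a containment scan: journaling is forced
 * iff some subtree root that we are auth for sits beneath one of the
 * inode's dirfrags. A compressed sketch of that predicate, reusing the
 * CDir calls seen above (illustrative, not a drop-in replacement):
 *
 *   bool need_force_journal_sketch(const std::vector<CDir*>& dirs,
 *                                  const std::vector<CDir*>& subtrees,
 *                                  mds_rank_t whoami) {
 *     for (CDir* dir : dirs)
 *       for (CDir* st : subtrees)
 *         if (dir->contains(st) && st->get_dir_auth().first == whoami)
 *           return true;
 *     return false;
 *   }
 */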
8603
8604void Server::_rename_prepare(MDRequestRef& mdr,
8605 EMetaBlob *metablob, bufferlist *client_map_bl,
f67539c2
TL
8606 CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
8607 CDentry *straydn)
7c673cae
FG
8608{
8609 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8610 if (straydn)
8611 dout(10) << " straydn " << *straydn << dendl;
8612
8613 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8614 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8615 CInode *srci = srcdnl->get_inode();
8616 CInode *oldin = destdnl->get_inode();
8617
8618 // primary+remote link merge?
11fdf7f2
TL
8619 bool linkmerge = (srci == oldin);
8620 if (linkmerge)
8621 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
7c673cae
FG
8622 bool silent = srcdn->get_dir()->inode->is_stray();
8623
8624 bool force_journal_dest = false;
8625 if (srci->is_dir() && !destdn->is_auth()) {
8626 if (srci->is_auth()) {
8627 // if we are auth for srci and exporting it, force journal because journal replay needs
8628 // the source inode to create auth subtrees.
8629 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
8630 force_journal_dest = true;
8631 } else
8632 force_journal_dest = _need_force_journal(srci, false);
8633 }
8634
8635 bool force_journal_stray = false;
8636 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
8637 force_journal_stray = _need_force_journal(oldin, true);
8638
8639 if (linkmerge)
8640 dout(10) << " merging remote and primary links to the same inode" << dendl;
8641 if (silent)
8642 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
8643 if (force_journal_dest)
8644 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
8645 if (force_journal_stray)
8646 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
8647
8648 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
8649 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
8650 metablob->renamed_dirino = srci->ino();
8651 } else if (oldin && oldin->is_dir() && force_journal_stray) {
8652 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
8653 metablob->renamed_dirino = oldin->ino();
8654 }
8655
8656 // prepare
94b18763
FG
8657 CInode::mempool_inode *spi = 0; // renamed inode
8658 CInode::mempool_inode *tpi = 0; // target/overwritten inode
7c673cae
FG
8659
8660 // target inode
8661 if (!linkmerge) {
8662 if (destdnl->is_primary()) {
11fdf7f2 8663 ceph_assert(straydn); // moving to straydn.
7c673cae
FG
8664 // link--, and move.
8665 if (destdn->is_auth()) {
f67539c2
TL
8666 auto pi= oldin->project_inode(mdr); //project_snaprealm
8667 pi.inode->version = straydn->pre_dirty(pi.inode->version);
8668 pi.inode->update_backtrace();
8669 tpi = pi.inode.get();
7c673cae
FG
8670 }
8671 straydn->push_projected_linkage(oldin);
8672 } else if (destdnl->is_remote()) {
8673 // nlink-- targeti
8674 if (oldin->is_auth()) {
f67539c2
TL
8675 auto pi = oldin->project_inode(mdr);
8676 pi.inode->version = oldin->pre_dirty();
8677 tpi = pi.inode.get();
7c673cae
FG
8678 }
8679 }
8680 }
8681
8682 // dest
f67539c2
TL
8683 if (destdnl->is_null()) {
8684 /* handle_client_rename checks that alternate_name matches for existing destdn */
8685 destdn->set_alternate_name(alternate_name);
8686 }
7c673cae
FG
8687 if (srcdnl->is_remote()) {
8688 if (!linkmerge) {
8689 // destdn
8690 if (destdn->is_auth())
8691 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
8692 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8693 // srci
8694 if (srci->is_auth()) {
f67539c2
TL
8695 auto pi = srci->project_inode(mdr);
8696 pi.inode->version = srci->pre_dirty();
8697 spi = pi.inode.get();
7c673cae
FG
8698 }
8699 } else {
8700 dout(10) << " will merge remote onto primary link" << dendl;
8701 if (destdn->is_auth()) {
f67539c2
TL
8702 auto pi = oldin->project_inode(mdr);
8703 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
8704 spi = pi.inode.get();
7c673cae
FG
8705 }
8706 }
8707 } else { // primary
8708 if (destdn->is_auth()) {
8709 version_t oldpv;
8710 if (srcdn->is_auth())
8711 oldpv = srci->get_projected_version();
8712 else {
8713 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
8714
8715 // note which dirfrags have child subtrees in the journal
8716 // event, so that we can open those (as bounds) during replay.
8717 if (srci->is_dir()) {
9f95a23c
TL
8718 auto&& ls = srci->get_dirfrags();
8719 for (const auto& dir : ls) {
7c673cae
FG
8720 if (!dir->is_auth())
8721 metablob->renamed_dir_frags.push_back(dir->get_frag());
8722 }
8723 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
8724 }
8725 }
f67539c2 8726 auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
7c673cae 8727 // & srcdnl->snaprealm
f67539c2
TL
8728 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
8729 pi.inode->update_backtrace();
8730 spi = pi.inode.get();
7c673cae
FG
8731 }
8732 destdn->push_projected_linkage(srci);
8733 }
8734
8735 // src
8736 if (srcdn->is_auth())
8737 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
8738 srcdn->push_projected_linkage(); // push null linkage
8739
8740 if (!silent) {
94b18763 8741 if (spi) {
91327a77
AA
8742 spi->ctime = mdr->get_op_stamp();
8743 if (mdr->get_op_stamp() > spi->rstat.rctime)
8744 spi->rstat.rctime = mdr->get_op_stamp();
94b18763 8745 spi->change_attr++;
7c673cae 8746 if (linkmerge)
94b18763 8747 spi->nlink--;
7c673cae
FG
8748 }
8749 if (tpi) {
91327a77
AA
8750 tpi->ctime = mdr->get_op_stamp();
8751 if (mdr->get_op_stamp() > tpi->rstat.rctime)
8752 tpi->rstat.rctime = mdr->get_op_stamp();
7c673cae 8753 tpi->change_attr++;
94b18763
FG
8754 {
8755 std::string t;
8756 destdn->make_path_string(t, true);
11fdf7f2 8757 tpi->stray_prior_path = std::move(t);
94b18763 8758 }
7c673cae
FG
8759 tpi->nlink--;
8760 if (tpi->nlink == 0)
8761 oldin->state_set(CInode::STATE_ORPHAN);
8762 }
8763 }
8764
8765 // prepare nesting, mtime updates
8766 int predirty_dir = silent ? 0:PREDIRTY_DIR;
8767
8768 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8769 // then link the source inode to destdn
8770 if (destdnl->is_primary()) {
11fdf7f2 8771 ceph_assert(straydn);
7c673cae
FG
8772 if (straydn->is_auth()) {
8773 metablob->add_dir_context(straydn->get_dir());
8774 metablob->add_dir(straydn->get_dir(), true);
8775 }
8776 }
8777
f67539c2
TL
8778 if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
8779 CDir *oldin_dir = oldin->get_projected_parent_dir();
8780 if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
8781 mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
8782 }
8783
7c673cae
FG
8784 // sub off target
8785 if (destdn->is_auth() && !destdnl->is_null()) {
8786 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
8787 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
224ce89b 8788 if (destdnl->is_primary()) {
11fdf7f2 8789 ceph_assert(straydn);
7c673cae
FG
8790 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
8791 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
224ce89b 8792 }
7c673cae 8793 }
f67539c2
TL
8794
8795 if (srcdnl->is_remote() && srci->is_auth()) {
8796 CDir *srci_dir = srci->get_projected_parent_dir();
8797 if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
8798 mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
8799 }
7c673cae
FG
8800
8801 // move srcdn
8802 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
8803 int flags = predirty_dir | predirty_primary;
8804 if (srcdn->is_auth())
8805 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
8806 if (destdn->is_auth())
8807 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
8808
7c673cae
FG
8809 // add it all to the metablob
8810 // target inode
8811 if (!linkmerge) {
8812 if (destdnl->is_primary()) {
11fdf7f2 8813 ceph_assert(straydn);
7c673cae
FG
8814 if (destdn->is_auth()) {
8815 // project snaprealm, too
11fdf7f2
TL
8816 if (auto& desti_srnode = mdr->more()->desti_srnode) {
8817 oldin->project_snaprealm(desti_srnode);
8818 if (tpi->nlink == 0)
8819 ceph_assert(!desti_srnode->is_parent_global());
8820 desti_srnode = NULL;
8821 }
8822 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae
FG
8823 metablob->add_primary_dentry(straydn, oldin, true, true);
8824 } else if (force_journal_stray) {
8825 dout(10) << " forced journaling straydn " << *straydn << dendl;
8826 metablob->add_dir_context(straydn->get_dir());
8827 metablob->add_primary_dentry(straydn, oldin, true);
8828 }
8829 } else if (destdnl->is_remote()) {
8830 if (oldin->is_auth()) {
11fdf7f2 8831 sr_t *new_srnode = NULL;
f67539c2
TL
8832 if (mdr->peer_request) {
8833 if (mdr->peer_request->desti_snapbl.length() > 0) {
11fdf7f2 8834 new_srnode = new sr_t();
f67539c2 8835 auto p = mdr->peer_request->desti_snapbl.cbegin();
11fdf7f2
TL
8836 decode(*new_srnode, p);
8837 }
8838 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8839 new_srnode = desti_srnode;
8840 desti_srnode = NULL;
8841 }
8842 if (new_srnode) {
8843 oldin->project_snaprealm(new_srnode);
8844 if (tpi->nlink == 0)
8845 ceph_assert(!new_srnode->is_parent_global());
8846 }
7c673cae 8847 // auth for targeti
f67539c2
TL
8848 CDentry *oldin_pdn = oldin->get_projected_parent_dn();
8849 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
8850 metablob->add_primary_dentry(oldin_pdn, oldin, true);
7c673cae
FG
8851 }
8852 }
8853 }
8854
8855 // dest
8856 if (srcdnl->is_remote()) {
11fdf7f2
TL
8857 ceph_assert(!linkmerge);
8858 if (destdn->is_auth() && !destdnl->is_null())
8859 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8860 else
8861 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae 8862
11fdf7f2
TL
8863 if (destdn->is_auth())
8864 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8865
8866 if (srci->is_auth() ) { // it's remote
f67539c2
TL
8867 if (mdr->peer_request) {
8868 if (mdr->peer_request->srci_snapbl.length() > 0) {
11fdf7f2 8869 sr_t *new_srnode = new sr_t();
f67539c2 8870 auto p = mdr->peer_request->srci_snapbl.cbegin();
11fdf7f2
TL
8871 decode(*new_srnode, p);
8872 srci->project_snaprealm(new_srnode);
8873 }
8874 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8875 srci->project_snaprealm(srci_srnode);
8876 srci_srnode = NULL;
7c673cae 8877 }
7c673cae 8878
11fdf7f2 8879 CDentry *srci_pdn = srci->get_projected_parent_dn();
f67539c2 8880 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
11fdf7f2 8881 metablob->add_primary_dentry(srci_pdn, srci, true);
7c673cae
FG
8882 }
8883 } else if (srcdnl->is_primary()) {
8884 // project snap parent update?
11fdf7f2
TL
8885 if (destdn->is_auth()) {
8886 if (auto& srci_srnode = mdr->more()->srci_srnode) {
8887 srci->project_snaprealm(srci_srnode);
8888 srci_srnode = NULL;
8889 }
8890 }
7c673cae
FG
8891
8892 if (destdn->is_auth() && !destdnl->is_null())
8893 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
11fdf7f2
TL
8894
8895 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae
FG
8896
8897 if (destdn->is_auth())
8898 metablob->add_primary_dentry(destdn, srci, true, true);
8899 else if (force_journal_dest) {
8900 dout(10) << " forced journaling destdn " << *destdn << dendl;
8901 metablob->add_dir_context(destdn->get_dir());
8902 metablob->add_primary_dentry(destdn, srci, true);
8903 if (srcdn->is_auth() && srci->is_dir()) {
8904 // journal new subtrees root dirfrags
9f95a23c
TL
8905 auto&& ls = srci->get_dirfrags();
8906 for (const auto& dir : ls) {
7c673cae
FG
8907 if (dir->is_auth())
8908 metablob->add_dir(dir, true);
8909 }
8910 }
8911 }
8912 }
8913
8914 // src
8915 if (srcdn->is_auth()) {
8916 dout(10) << " journaling srcdn " << *srcdn << dendl;
8917 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
f67539c2 8918 // also journal the inode in case we need to do peer rename rollback. It is OK to add
7c673cae
FG
8919 // both primary and NULL dentries, because during journal replay the null dentry is
8920 // processed after the primary dentry.
8921 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
8922 metablob->add_primary_dentry(srcdn, srci, true);
8923 metablob->add_null_dentry(srcdn, true);
8924 } else
8925 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
8926
8927 // make renamed inode first track the dn
11fdf7f2
TL
8928 if (srcdnl->is_primary() && destdn->is_auth()) {
8929 ceph_assert(srci->first <= destdn->first);
8930 srci->first = destdn->first;
8931 }
8932 // make stray inode first track the straydn
8933 if (straydn && straydn->is_auth()) {
8934 ceph_assert(oldin->first <= straydn->first);
8935 oldin->first = straydn->first;
8936 }
7c673cae 8937
224ce89b 8938 if (oldin && oldin->is_dir()) {
11fdf7f2 8939 ceph_assert(straydn);
7c673cae 8940 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
224ce89b 8941 }
7c673cae
FG
8942 if (srci->is_dir())
8943 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
8944
8945}
8946
8947
8948void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8949{
8950 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8951 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
8952
8953 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8954 CDentry::linkage_t *destdnl = destdn->get_linkage();
8955
8956 CInode *oldin = destdnl->get_inode();
7c673cae
FG
8957
8958 // primary+remote link merge?
11fdf7f2
TL
8959 bool linkmerge = (srcdnl->get_inode() == oldin);
8960 if (linkmerge)
8961 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
8962
8963 bool new_in_snaprealm = false;
8964 bool new_oldin_snaprealm = false;
7c673cae
FG
8965
8966 // target inode
8967 if (!linkmerge) {
8968 if (destdnl->is_primary()) {
11fdf7f2 8969 ceph_assert(straydn);
7c673cae 8970 dout(10) << "straydn is " << *straydn << dendl;
11fdf7f2
TL
8971
8972 // if there is a newly created snaprealm, we need to split the old snaprealm's
8973 // inodes_with_caps. So pop the snaprealm before the linkage changes.
8974 if (destdn->is_auth()) {
8975 bool hadrealm = (oldin->snaprealm ? true : false);
8976 oldin->early_pop_projected_snaprealm();
8977 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
8978 } else {
f67539c2
TL
8979 ceph_assert(mdr->peer_request);
8980 if (mdr->peer_request->desti_snapbl.length()) {
11fdf7f2 8981 new_oldin_snaprealm = !oldin->snaprealm;
f67539c2 8982 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
11fdf7f2 8983 ceph_assert(oldin->snaprealm);
11fdf7f2
TL
8984 }
8985 }
8986
31f18b77 8987 destdn->get_dir()->unlink_inode(destdn, false);
7c673cae
FG
8988
8989 straydn->pop_projected_linkage();
f67539c2 8990 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 8991 ceph_assert(!straydn->is_projected()); // no other projected
7c673cae
FG
8992
8993 // nlink-- targeti
11fdf7f2 8994 if (destdn->is_auth())
f67539c2 8995 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
11fdf7f2
TL
8996
8997 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
7c673cae 8998 } else if (destdnl->is_remote()) {
31f18b77 8999 destdn->get_dir()->unlink_inode(destdn, false);
11fdf7f2 9000 if (oldin->is_auth()) {
f67539c2
TL
9001 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9002 } else if (mdr->peer_request) {
9003 if (mdr->peer_request->desti_snapbl.length() > 0) {
11fdf7f2 9004 ceph_assert(oldin->snaprealm);
f67539c2 9005 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
11fdf7f2
TL
9006 }
9007 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9008 delete desti_srnode;
9009 desti_srnode = NULL;
9010 }
7c673cae
FG
9011 }
9012 }
9013
9014 // unlink src before we relink it at dest
9015 CInode *in = srcdnl->get_inode();
11fdf7f2 9016 ceph_assert(in);
7c673cae
FG
9017
9018 bool srcdn_was_remote = srcdnl->is_remote();
11fdf7f2
TL
9019 if (!srcdn_was_remote) {
9020 // if there is newly created snaprealm, need to split old snaprealm's
9021 // inodes_with_caps. So pop snaprealm before linkage changes.
9022 if (destdn->is_auth()) {
9023 bool hadrealm = (in->snaprealm ? true : false);
9024 in->early_pop_projected_snaprealm();
9025 new_in_snaprealm = (in->snaprealm && !hadrealm);
9026 } else {
f67539c2
TL
9027 ceph_assert(mdr->peer_request);
9028 if (mdr->peer_request->srci_snapbl.length()) {
11fdf7f2 9029 new_in_snaprealm = !in->snaprealm;
f67539c2 9030 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
11fdf7f2 9031 ceph_assert(in->snaprealm);
11fdf7f2
TL
9032 }
9033 }
9034 }
9035
7c673cae
FG
9036 srcdn->get_dir()->unlink_inode(srcdn);
9037
9038 // dest
9039 if (srcdn_was_remote) {
9040 if (!linkmerge) {
9041 // destdn
9042 destdnl = destdn->pop_projected_linkage();
f67539c2 9043 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 9044 ceph_assert(!destdn->is_projected()); // no other projected
7c673cae
FG
9045
9046 destdn->link_remote(destdnl, in);
9047 if (destdn->is_auth())
9048 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
9049 // in
11fdf7f2 9050 if (in->is_auth()) {
f67539c2
TL
9051 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9052 } else if (mdr->peer_request) {
9053 if (mdr->peer_request->srci_snapbl.length() > 0) {
11fdf7f2 9054 ceph_assert(in->snaprealm);
f67539c2 9055 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
11fdf7f2
TL
9056 }
9057 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9058 delete srci_srnode;
9059 srci_srnode = NULL;
9060 }
7c673cae
FG
9061 } else {
9062 dout(10) << "merging remote onto primary link" << dendl;
f67539c2 9063 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
7c673cae
FG
9064 }
9065 } else { // primary
9066 if (linkmerge) {
9067 dout(10) << "merging primary onto remote link" << dendl;
31f18b77 9068 destdn->get_dir()->unlink_inode(destdn, false);
7c673cae
FG
9069 }
9070 destdnl = destdn->pop_projected_linkage();
f67539c2 9071 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 9072 ceph_assert(!destdn->is_projected()); // no other projected
7c673cae
FG
9073
9074 // srcdn inode import?
9075 if (!srcdn->is_auth() && destdn->is_auth()) {
11fdf7f2 9076 ceph_assert(mdr->more()->inode_import.length() > 0);
7c673cae
FG
9077
9078 map<client_t,Capability::Import> imported_caps;
9079
9080 // finish cap imports
28e407b8 9081 finish_force_open_sessions(mdr->more()->imported_session_map);
7c673cae
FG
9082 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
9083 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
28e407b8
AA
9084 mdr->more()->srcdn_auth_mds, true,
9085 mdr->more()->imported_session_map,
9086 mdr->more()->cap_imports[destdnl->get_inode()],
9087 imported_caps);
7c673cae
FG
9088 }
9089
9090 mdr->more()->inode_import.clear();
11fdf7f2 9091 encode(imported_caps, mdr->more()->inode_import);
7c673cae
FG
9092
9093 /* hack: add an auth pin for each xlock we hold. These were
9094 * remote xlocks previously but now they're local and
* we're going to try to unpin them when we xlock_finish. */
11fdf7f2
TL
9096
9097 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
9098 i != mdr->locks.end();
9099 ++i) {
9100 SimpleLock *lock = i->lock;
9101 if (lock->get_parent() != destdnl->get_inode())
9102 break;
9103 if (i->is_xlock() && !lock->is_locallock())
9104 mds->locker->xlock_import(lock);
9105 }
7c673cae
FG
9106
9107 // hack: fix auth bit
9108 in->state_set(CInode::STATE_AUTH);
7c673cae
FG
9109
9110 mdr->clear_ambiguous_auth();
9111 }
9112
11fdf7f2 9113 if (destdn->is_auth())
f67539c2 9114 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
7c673cae
FG
9115 }
9116
9117 // src
9118 if (srcdn->is_auth())
9119 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
9120 srcdn->pop_projected_linkage();
f67539c2 9121 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 9122 ceph_assert(!srcdn->is_projected()); // no other projected
7c673cae
FG
9123
9124 // apply remaining projected inodes (nested)
9125 mdr->apply();
9126
9127 // update subtree map?
11fdf7f2 9128 if (destdnl->is_primary() && in->is_dir())
224ce89b 9129 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
7c673cae
FG
9130
9131 if (straydn && oldin->is_dir())
9132 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
9133
11fdf7f2
TL
9134 if (new_oldin_snaprealm)
9135 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
9136 if (new_in_snaprealm)
9137 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
9138
7c673cae
FG
9139 // removing a new dn?
9140 if (srcdn->is_auth())
9141 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
9142}
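
Nearly every mutation in this file follows the two-phase shape that `_rename_apply` completes: the prepare step stages projected dentry/inode state, the journal entry commits, and only then are the projections popped into the live metadata (hence the `pop_projected_linkage` / `pop_and_dirty_projected_inode` calls above). A compilable toy model of that pattern; the `Projected` type here is invented for illustration, not the real CDentry/CInode machinery:

// --- illustrative sketch, not part of Server.cc ---
#include <cassert>
#include <deque>

// project() stages new state without touching the live value; apply (after
// the journal commit) pops the oldest projected state and makes it current.
template <typename T>
struct Projected {
  T current;
  std::deque<T> projected;
  void project(T v) { projected.push_back(v); }  // the _rename_prepare side
  void pop() {                                   // the _rename_apply side
    assert(!projected.empty());
    current = projected.front();
    projected.pop_front();
  }
  bool is_projected() const { return !projected.empty(); }
};

int main() {
  Projected<int> linkage{1, {}};
  linkage.project(2);            // staged during prepare
  assert(linkage.current == 1);  // live state unchanged until journaled
  linkage.pop();                 // applied once the journal entry commits
  assert(linkage.current == 2 && !linkage.is_projected());
  return 0;
}

The checks such as `ceph_assert(!srcdn->is_projected())` above are the real-code counterpart of the final assert here: once a non-journaled peer update is applied, no projection may remain.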
9143
9144
9145
9146// ------------
f67539c2 9147// PEER
7c673cae 9148
f67539c2 9149class C_MDS_PeerRenamePrep : public ServerLogContext {
7c673cae
FG
9150 CDentry *srcdn, *destdn, *straydn;
9151public:
f67539c2 9152 C_MDS_PeerRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7c673cae
FG
9153 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
9154 void finish(int r) override {
f67539c2 9155 server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
7c673cae
FG
9156 }
9157};
9158
f67539c2 9159class C_MDS_PeerRenameCommit : public ServerContext {
7c673cae
FG
9160 MDRequestRef mdr;
9161 CDentry *srcdn, *destdn, *straydn;
9162public:
f67539c2 9163 C_MDS_PeerRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7c673cae
FG
9164 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
9165 void finish(int r) override {
f67539c2 9166 server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
7c673cae
FG
9167 }
9168};
9169
f67539c2 9170class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
7c673cae
FG
9171 MDRequestRef mdr;
9172public:
f67539c2 9173 C_MDS_PeerRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7c673cae
FG
9174 ServerContext(s), mdr(r) {}
9175 void finish(int r) override {
f67539c2 9176 server->_peer_rename_sessions_flushed(mdr);
7c673cae
FG
9177 }
9178};
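
These three contexts are one-shot completions: each captures the dentries it needs, and its `finish(r)` runs exactly once when the journal entry or message exchange it was registered for completes. A minimal sketch of that callback shape using `std::function`; the real code uses MDSContext subclasses, not this:

// --- illustrative sketch, not part of Server.cc ---
#include <cassert>
#include <functional>
#include <vector>

int main() {
  std::vector<std::function<void(int)>> journal_waiters;
  int result = -1;
  // like submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(...), mdr, __func__)
  journal_waiters.push_back([&](int r) { result = r; });
  // the journal commit fires the queued finishers with the commit result
  for (auto& fin : journal_waiters) fin(0);
  assert(result == 0);
  return 0;
}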
9179
f67539c2 9180void Server::handle_peer_rename_prep(MDRequestRef& mdr)
7c673cae 9181{
f67539c2
TL
9182 dout(10) << "handle_peer_rename_prep " << *mdr
9183 << " " << mdr->peer_request->srcdnpath
9184 << " to " << mdr->peer_request->destdnpath
7c673cae 9185 << dendl;
31f18b77 9186
f67539c2
TL
9187 if (mdr->peer_request->is_interrupted()) {
9188 dout(10) << " peer request interrupted, sending noop reply" << dendl;
9189 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
31f18b77 9190 reply->mark_interrupted();
f67539c2
TL
9191 mds->send_message_mds(reply, mdr->peer_to_mds);
9192 mdr->reset_peer_request();
31f18b77
FG
9193 return;
9194 }
9195
7c673cae 9196 // discover destdn
f67539c2 9197 filepath destpath(mdr->peer_request->destdnpath);
7c673cae
FG
9198 dout(10) << " dest " << destpath << dendl;
9199 vector<CDentry*> trace;
f67539c2 9200 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
9f95a23c
TL
9201 int r = mdcache->path_traverse(mdr, cf, destpath,
9202 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
9203 &trace);
7c673cae 9204 if (r > 0) return;
f67539c2 9205 if (r == -CEPHFS_ESTALE) {
7c673cae 9206 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
f67539c2 9207 mdr->peer_to_mds, true);
7c673cae
FG
9208 return;
9209 }
11fdf7f2 9210 ceph_assert(r == 0); // we shouldn't get an error here!
7c673cae 9211
91327a77 9212 CDentry *destdn = trace.back();
7c673cae
FG
9213 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9214 dout(10) << " destdn " << *destdn << dendl;
9215 mdr->pin(destdn);
9216
9217 // discover srcdn
f67539c2 9218 filepath srcpath(mdr->peer_request->srcdnpath);
7c673cae
FG
9219 dout(10) << " src " << srcpath << dendl;
9220 CInode *srci = nullptr;
9f95a23c
TL
9221 r = mdcache->path_traverse(mdr, cf, srcpath,
9222 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
9223 &trace, &srci);
7c673cae 9224 if (r > 0) return;
11fdf7f2 9225 ceph_assert(r == 0);
7c673cae 9226
91327a77 9227 CDentry *srcdn = trace.back();
7c673cae
FG
9228 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9229 dout(10) << " srcdn " << *srcdn << dendl;
9230 mdr->pin(srcdn);
9231 mdr->pin(srci);
9232
9233 // stray?
11fdf7f2
TL
9234 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
9235 if (linkmerge)
9236 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
7c673cae
FG
9237 CDentry *straydn = mdr->straydn;
9238 if (destdnl->is_primary() && !linkmerge)
11fdf7f2 9239 ceph_assert(straydn);
7c673cae 9240
f67539c2 9241 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7c673cae
FG
9242 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
9243
9244 // set up commit waiter (early, to clean up any freezing etc we do)
f67539c2
TL
9245 if (!mdr->more()->peer_commit)
9246 mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);
7c673cae
FG
9247
9248 // am i srcdn auth?
9249 if (srcdn->is_auth()) {
9250 set<mds_rank_t> srcdnrep;
9251 srcdn->list_replicas(srcdnrep);
9252
9253 bool reply_witness = false;
9254 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
9255 // freeze?
9256 // we need this to
9257 // - avoid conflicting lock state changes
9258 // - avoid concurrent updates to the inode
9259 // (this could also be accomplished with the versionlock)
11fdf7f2 9260 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
7c673cae
FG
9261 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
9262 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
9263
9264 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9265 if (srcdnl->get_inode()->is_frozen_auth_pin())
9266 mdr->unfreeze_auth_pin();
9267
9268 if (!frozen_inode) {
9269 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
9270 return;
9271 }
9272
9273 /*
9274 * set ambiguous auth for srci
9275 * NOTE: we don't worry about ambiguous cache expire as we do
f67539c2 9276 * with subtree migrations because all peers will pin
7c673cae
FG
9277 * srcdn->get_inode() for duration of this rename.
9278 */
9279 mdr->set_ambiguous_auth(srcdnl->get_inode());
9280
9281 // just mark the source inode as ambiguous auth if more than two MDSs are involved.
f67539c2
TL
9282 // the leader will send another OP_RENAMEPREP peer request later.
9283 if (mdr->peer_request->witnesses.size() > 1) {
7c673cae
FG
9284 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
9285 reply_witness = true;
9286 }
9287
9288 // make sure bystanders have received all lock related messages
9289 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
f67539c2 9290 if (*p == mdr->peer_to_mds ||
7c673cae
FG
9291 (mds->is_cluster_degraded() &&
9292 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
9293 continue;
f67539c2 9294 auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
7c673cae 9295 mds->send_message_mds(notify, *p);
f67539c2 9296 mdr->more()->waiting_on_peer.insert(*p);
7c673cae
FG
9297 }
9298
9299 // make sure clients have received all cap related messages
9300 set<client_t> export_client_set;
9301 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
9302
9303 MDSGatherBuilder gather(g_ceph_context);
9304 flush_client_sessions(export_client_set, gather);
9305 if (gather.has_subs()) {
f67539c2
TL
9306 mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
9307 gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
7c673cae
FG
9308 gather.activate();
9309 }
9310 }
9311
9312 // is witness list sufficient?
9313 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
f67539c2
TL
9314 if (*p == mdr->peer_to_mds ||
9315 mdr->peer_request->witnesses.count(*p)) continue;
7c673cae
FG
9316 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
9317 reply_witness = true;
9318 break;
9319 }
9320
9321 if (reply_witness) {
11fdf7f2 9322 ceph_assert(!srcdnrep.empty());
f67539c2 9323 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
7c673cae 9324 reply->witnesses.swap(srcdnrep);
f67539c2
TL
9325 mds->send_message_mds(reply, mdr->peer_to_mds);
9326 mdr->reset_peer_request();
7c673cae
FG
9327 return;
9328 }
9329 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
f67539c2 9330 if (!mdr->more()->waiting_on_peer.empty()) {
7c673cae 9331 dout(10) << " still waiting for rename notify acks from "
f67539c2 9332 << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
9333 return;
9334 }
9335 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
9336 // set ambiguous auth for srci on witnesses
9337 mdr->set_ambiguous_auth(srcdnl->get_inode());
9338 }
9339
9340 // encode everything we'd need to roll this back... basically, just the original state.
9341 rename_rollback rollback;
9342
9343 rollback.reqid = mdr->reqid;
9344
9345 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
9346 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9347 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2 9348 rollback.orig_src.dname = srcdn->get_name();
7c673cae
FG
9349 if (srcdnl->is_primary())
9350 rollback.orig_src.ino = srcdnl->get_inode()->ino();
9351 else {
11fdf7f2 9352 ceph_assert(srcdnl->is_remote());
7c673cae
FG
9353 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
9354 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
9355 }
9356
9357 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
9358 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9359 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2 9360 rollback.orig_dest.dname = destdn->get_name();
7c673cae
FG
9361 if (destdnl->is_primary())
9362 rollback.orig_dest.ino = destdnl->get_inode()->ino();
9363 else if (destdnl->is_remote()) {
9364 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
9365 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
9366 }
9367
9368 if (straydn) {
9369 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
9370 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
9371 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2
TL
9372 rollback.stray.dname = straydn->get_name();
9373 }
f67539c2 9374 if (mdr->peer_request->desti_snapbl.length()) {
11fdf7f2
TL
9375 CInode *oldin = destdnl->get_inode();
9376 if (oldin->snaprealm) {
9377 encode(true, rollback.desti_snapbl);
9378 oldin->encode_snap_blob(rollback.desti_snapbl);
9379 } else {
9380 encode(false, rollback.desti_snapbl);
9381 }
9382 }
f67539c2 9383 if (mdr->peer_request->srci_snapbl.length()) {
11fdf7f2
TL
9384 if (srci->snaprealm) {
9385 encode(true, rollback.srci_snapbl);
9386 srci->encode_snap_blob(rollback.srci_snapbl);
9387 } else {
9388 encode(false, rollback.srci_snapbl);
9389 }
7c673cae 9390 }
11fdf7f2
TL
9391 encode(rollback, mdr->more()->rollback_bl);
9392 // FIXME: rollback snaprealm
7c673cae
FG
9393 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
9394
9395 // journal.
9396 mdr->ls = mdlog->get_current_segment();
f67539c2
TL
9397 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
9398 EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
7c673cae
FG
9399 mdlog->start_entry(le);
9400 le->rollback = mdr->more()->rollback_bl;
9401
f67539c2
TL
9402 bufferlist blah; // inode import data... obviously not used if we're the peer
9403 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);
7c673cae
FG
9404
9405 if (le->commit.empty()) {
9406 dout(10) << " empty metablob, skipping journal" << dendl;
9407 mdlog->cancel_entry(le);
9408 mdr->ls = NULL;
f67539c2 9409 _logged_peer_rename(mdr, srcdn, destdn, straydn);
7c673cae 9410 } else {
f67539c2
TL
9411 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
9412 mdr->more()->peer_update_journaled = true;
9413 submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
7c673cae
FG
9414 mdr, __func__);
9415 mdlog->flush();
9416 }
9417}
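
Before journaling OP_PREPARE, the peer packs the pre-rename state into `rollback_bl`, so that `do_rename_rollback()` can rebuild it later even if the leader never returns. A toy round trip of that idea, with plain iostreams standing in for ceph's `encode`/`decode` and an invented three-field record:

// --- illustrative sketch, not part of Server.cc ---
#include <cassert>
#include <sstream>
#include <string>

struct ToyRollback { unsigned reqid; std::string src_dname, dest_dname; };

// serialize the original state (the OP_PREPARE side)
std::string encode_rb(const ToyRollback& r) {
  std::ostringstream os;
  os << r.reqid << ' ' << r.src_dname << ' ' << r.dest_dname;
  return os.str();
}

// reconstruct it during rollback, possibly long after the prepare
ToyRollback decode_rb(const std::string& bl) {
  std::istringstream is(bl);
  ToyRollback r;
  is >> r.reqid >> r.src_dname >> r.dest_dname;
  return r;
}

int main() {
  ToyRollback rb{42, "a.txt", "b.txt"};
  ToyRollback out = decode_rb(encode_rb(rb));
  assert(out.reqid == 42 && out.src_dname == "a.txt" && out.dest_dname == "b.txt");
  return 0;
}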
9418
f67539c2 9419void Server::_logged_peer_rename(MDRequestRef& mdr,
7c673cae
FG
9420 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9421{
f67539c2 9422 dout(10) << "_logged_peer_rename " << *mdr << dendl;
7c673cae
FG
9423
9424 // prepare ack
f67539c2 9425 ref_t<MMDSPeerRequest> reply;
7c673cae 9426 if (!mdr->aborted) {
f67539c2
TL
9427 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9428 if (!mdr->more()->peer_update_journaled)
7c673cae
FG
9429 reply->mark_not_journaled();
9430 }
9431
9432 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7c673cae
FG
9433 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
9434
9435 // export srci?
9436 if (srcdn->is_auth() && srcdnl->is_primary()) {
9437 // set export bounds for CInode::encode_export()
11fdf7f2 9438 if (reply) {
9f95a23c 9439 std::vector<CDir*> bounds;
11fdf7f2
TL
9440 if (srcdnl->get_inode()->is_dir()) {
9441 srcdnl->get_inode()->get_dirfrags(bounds);
9f95a23c
TL
9442 for (const auto& bound : bounds) {
9443 bound->state_set(CDir::STATE_EXPORTBOUND);
9444 }
11fdf7f2 9445 }
7c673cae 9446
11fdf7f2
TL
9447 map<client_t,entity_inst_t> exported_client_map;
9448 map<client_t, client_metadata_t> exported_client_metadata_map;
9449 bufferlist inodebl;
9450 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
9451 exported_client_map,
9452 exported_client_metadata_map);
7c673cae 9453
9f95a23c
TL
9454 for (const auto& bound : bounds) {
9455 bound->state_clear(CDir::STATE_EXPORTBOUND);
9456 }
7c673cae 9457
11fdf7f2
TL
9458 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
9459 encode(exported_client_metadata_map, reply->inode_export);
7c673cae 9460 reply->inode_export.claim_append(inodebl);
f67539c2 9461 reply->inode_export_v = srcdnl->get_inode()->get_version();
7c673cae
FG
9462 }
9463
9464 // remove mdr auth pin
9465 mdr->auth_unpin(srcdnl->get_inode());
9466 mdr->more()->is_inode_exporter = true;
9467
9468 if (srcdnl->get_inode()->is_dirty())
9469 srcdnl->get_inode()->mark_clean();
9470
9471 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
9472 }
9473
9474 // apply
9475 _rename_apply(mdr, srcdn, destdn, straydn);
11fdf7f2
TL
9476
9477 CDentry::linkage_t *destdnl = destdn->get_linkage();
7c673cae
FG
9478
9479 // bump popularity
11fdf7f2 9480 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
7c673cae 9481 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
11fdf7f2 9482 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
7c673cae
FG
9483
9484 // done.
f67539c2 9485 mdr->reset_peer_request();
7c673cae
FG
9486 mdr->straydn = 0;
9487
9488 if (reply) {
f67539c2 9489 mds->send_message_mds(reply, mdr->peer_to_mds);
7c673cae 9490 } else {
11fdf7f2 9491 ceph_assert(mdr->aborted);
7c673cae
FG
9492 dout(10) << " abort flag set, finishing" << dendl;
9493 mdcache->request_finish(mdr);
9494 }
9495}
9496
f67539c2 9497void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
7c673cae
FG
9498 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9499{
f67539c2 9500 dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;
7c673cae 9501
f64942e4
AA
9502 CInode *in = destdn->get_linkage()->get_inode();
9503
9504 inodeno_t migrated_stray;
9505 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
9506 migrated_stray = in->ino();
7c673cae 9507
11fdf7f2 9508 MDSContext::vec finished;
7c673cae
FG
9509 if (r == 0) {
9510 // unfreeze+singleauth inode
9511 // hmm, do i really need to delay this?
9512 if (mdr->more()->is_inode_exporter) {
7c673cae
FG
9513 // drop our pins
9514 // we exported, clear out any xlocks that we moved to another MDS
7c673cae 9515
11fdf7f2
TL
9516 for (auto i = mdr->locks.lower_bound(&in->versionlock);
9517 i != mdr->locks.end(); ) {
9518 SimpleLock *lock = i->lock;
9519 if (lock->get_parent() != in)
9520 break;
7c673cae 9521 // we only care about xlocks on the exported inode
11fdf7f2
TL
9522 if (i->is_xlock() && !lock->is_locallock())
9523 mds->locker->xlock_export(i++, mdr.get());
9524 else
9525 ++i;
7c673cae
FG
9526 }
9527
9528 map<client_t,Capability::Import> peer_imported;
11fdf7f2
TL
9529 auto bp = mdr->more()->inode_import.cbegin();
9530 decode(peer_imported, bp);
7c673cae 9531
f64942e4 9532 dout(10) << " finishing inode export on " << *in << dendl;
f67539c2 9533 mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
7c673cae
FG
9534 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
9535
9536 // unfreeze
11fdf7f2 9537 ceph_assert(in->is_frozen_inode());
f64942e4 9538 in->unfreeze_inode(finished);
7c673cae
FG
9539 }
9540
9541 // singleauth
9542 if (mdr->more()->is_ambiguous_auth) {
9543 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9544 mdr->more()->is_ambiguous_auth = false;
9545 }
9546
f67539c2 9547 if (straydn && mdr->more()->peer_update_journaled) {
31f18b77
FG
9548 CInode *strayin = straydn->get_projected_linkage()->get_inode();
9549 if (strayin && !strayin->snaprealm)
9550 mdcache->clear_dirty_bits_for_stray(strayin);
9551 }
7c673cae
FG
9552
9553 mds->queue_waiters(finished);
9554 mdr->cleanup();
9555
f67539c2 9556 if (mdr->more()->peer_update_journaled) {
7c673cae 9557 // write a commit to the journal
f67539c2
TL
9558 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
9559 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
9560 EPeerUpdate::RENAME);
7c673cae 9561 mdlog->start_entry(le);
f67539c2 9562 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7c673cae
FG
9563 mdlog->flush();
9564 } else {
f67539c2 9565 _committed_peer(mdr);
7c673cae
FG
9566 }
9567 } else {
9568
9569 // abort
9570 // rollback_bl may be empty if we froze the inode but had to provide an expanded
f67539c2 9571 // witness list to the leader, and it failed before we tried prep again.
7c673cae
FG
9572 if (mdr->more()->rollback_bl.length()) {
9573 if (mdr->more()->is_inode_exporter) {
f64942e4
AA
9574 dout(10) << " reversing inode export of " << *in << dendl;
9575 in->abort_export();
7c673cae 9576 }
f67539c2
TL
9577 if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
9578 mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
9579 // rollback but preserve the peer request
9580 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
7c673cae
FG
9581 mdr->more()->rollback_bl.clear();
9582 } else
f67539c2 9583 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
7c673cae 9584 } else {
f67539c2 9585 dout(10) << " rollback_bl empty, not rolling back rename (leader failed after getting extra witnesses?)" << dendl;
7c673cae
FG
9586 // singleauth
9587 if (mdr->more()->is_ambiguous_auth) {
9588 if (srcdn->is_auth())
9589 mdr->more()->rename_inode->unfreeze_inode(finished);
9590
9591 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9592 mdr->more()->is_ambiguous_auth = false;
9593 }
9594 mds->queue_waiters(finished);
9595 mdcache->request_finish(mdr);
9596 }
9597 }
f64942e4
AA
9598
9599 if (migrated_stray && mds->is_stopping())
9600 mdcache->shutdown_export_stray_finish(migrated_stray);
7c673cae
FG
9601}
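
Stripped of the cap-export and lock bookkeeping, `_commit_peer_rename` is a four-way branch on the leader's verdict and local state. A condensed, compilable sketch of that decision table; the enum and function names are ours, not the real API:

// --- illustrative sketch, not part of Server.cc ---
#include <cassert>

enum class PeerAction { JournalCommit, CommitNow, Rollback, FinishOnly };

PeerAction choose(int r, bool update_journaled, bool have_rollback) {
  if (r == 0) // leader says commit
    return update_journaled ? PeerAction::JournalCommit : PeerAction::CommitNow;
  // leader says abort
  return have_rollback ? PeerAction::Rollback : PeerAction::FinishOnly;
}

int main() {
  assert(choose(0, true, true)   == PeerAction::JournalCommit); // write OP_COMMIT
  assert(choose(0, false, false) == PeerAction::CommitNow);     // _committed_peer()
  assert(choose(-1, true, true)  == PeerAction::Rollback);      // do_rename_rollback()
  assert(choose(-1, true, false) == PeerAction::FinishOnly);    // leader died before re-prep
  return 0;
}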
9602
f67539c2
TL
9603static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
9604 rename_rollback::drec &r, utime_t ctime,
9605 bool isdir, const nest_info_t &rstat)
7c673cae 9606{
f67539c2 9607 auto pf = dir->project_fnode(mut);
7c673cae
FG
9608 pf->version = dir->pre_dirty();
9609
9610 if (isdir) {
f67539c2 9611 pf->fragstat.nsubdirs += 1;
7c673cae 9612 } else {
f67539c2 9613 pf->fragstat.nfiles += 1;
7c673cae
FG
9614 }
9615 if (r.ino) {
f67539c2
TL
9616 pf->rstat.rbytes += rstat.rbytes;
9617 pf->rstat.rfiles += rstat.rfiles;
9618 pf->rstat.rsubdirs += rstat.rsubdirs;
9619 pf->rstat.rsnaps += rstat.rsnaps;
7c673cae
FG
9620 }
9621 if (pf->fragstat.mtime == ctime) {
9622 pf->fragstat.mtime = r.dirfrag_old_mtime;
9623 if (pf->rstat.rctime == ctime)
9624 pf->rstat.rctime = r.dirfrag_old_rctime;
9625 }
9626 mut->add_updated_lock(&dir->get_inode()->filelock);
9627 mut->add_updated_lock(&dir->get_inode()->nestlock);
9628}
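
`_rollback_repair_dir` undoes the accounting half of the rename: putting an entry back into its original fragment means one more file or subdir in `fragstat`, plus re-adding the entry's recursive stats to `rstat`, and restoring the old mtime/rctime if nothing else has touched them since. A toy worked example with made-up numbers:

// --- illustrative sketch, not part of Server.cc ---
#include <cassert>

struct ToyFragStats { long nfiles; long nsubdirs; long rbytes; };

// re-add one entry to the fragment, as the projection above does
void repair(ToyFragStats& frag, bool isdir, long entry_rbytes) {
  if (isdir)
    frag.nsubdirs += 1;
  else
    frag.nfiles += 1;
  frag.rbytes += entry_rbytes;
}

int main() {
  ToyFragStats frag{3, 1, 4096};      // fragment after the rename removed a file
  repair(frag, /*isdir=*/false, 512); // roll the 512-byte file back in
  assert(frag.nfiles == 4 && frag.rbytes == 4608);
  return 0;
}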
9629
9630struct C_MDS_LoggedRenameRollback : public ServerLogContext {
9631 MutationRef mut;
9632 CDentry *srcdn;
9633 version_t srcdnpv;
9634 CDentry *destdn;
9635 CDentry *straydn;
9f95a23c 9636 map<client_t,ref_t<MClientSnap>> splits[2];
7c673cae
FG
9637 bool finish_mdr;
9638 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
11fdf7f2 9639 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
9f95a23c 9640 map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
7c673cae 9641 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
11fdf7f2
TL
9642 straydn(st), finish_mdr(f) {
9643 splits[0].swap(_splits[0]);
9644 splits[1].swap(_splits[1]);
9645 }
7c673cae
FG
9646 void finish(int r) override {
9647 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
11fdf7f2 9648 destdn, straydn, splits, finish_mdr);
7c673cae
FG
9649 }
9650};
9651
f67539c2 9652void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
7c673cae
FG
9653 bool finish_mdr)
9654{
9655 rename_rollback rollback;
11fdf7f2
TL
9656 auto p = rbl.cbegin();
9657 decode(rollback, p);
7c673cae
FG
9658
9659 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
9660 // need to finish this update before sending resolve to claim the subtree
f67539c2 9661 mdcache->add_rollback(rollback.reqid, leader);
7c673cae
FG
9662
9663 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
9664 mut->ls = mds->mdlog->get_current_segment();
9665
9666 CDentry *srcdn = NULL;
9667 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
9668 if (!srcdir)
9669 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
9670 if (srcdir) {
9671 dout(10) << " srcdir " << *srcdir << dendl;
9672 srcdn = srcdir->lookup(rollback.orig_src.dname);
9673 if (srcdn) {
9674 dout(10) << " srcdn " << *srcdn << dendl;
11fdf7f2 9675 ceph_assert(srcdn->get_linkage()->is_null());
7c673cae
FG
9676 } else
9677 dout(10) << " srcdn not found" << dendl;
9678 } else
9679 dout(10) << " srcdir not found" << dendl;
9680
9681 CDentry *destdn = NULL;
9682 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
9683 if (!destdir)
9684 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
9685 if (destdir) {
9686 dout(10) << " destdir " << *destdir << dendl;
9687 destdn = destdir->lookup(rollback.orig_dest.dname);
9688 if (destdn)
9689 dout(10) << " destdn " << *destdn << dendl;
9690 else
9691 dout(10) << " destdn not found" << dendl;
9692 } else
9693 dout(10) << " destdir not found" << dendl;
9694
9695 CInode *in = NULL;
9696 if (rollback.orig_src.ino) {
9697 in = mdcache->get_inode(rollback.orig_src.ino);
9698 if (in && in->is_dir())
11fdf7f2 9699 ceph_assert(srcdn && destdn);
7c673cae
FG
9700 } else
9701 in = mdcache->get_inode(rollback.orig_src.remote_ino);
9702
9703 CDir *straydir = NULL;
9704 CDentry *straydn = NULL;
9705 if (rollback.stray.dirfrag.ino) {
9706 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
9707 if (straydir) {
9708 dout(10) << "straydir " << *straydir << dendl;
9709 straydn = straydir->lookup(rollback.stray.dname);
9710 if (straydn) {
9711 dout(10) << " straydn " << *straydn << dendl;
11fdf7f2 9712 ceph_assert(straydn->get_linkage()->is_primary());
7c673cae
FG
9713 } else
9714 dout(10) << " straydn not found" << dendl;
9715 } else
9716 dout(10) << " straydir not found" << dendl;
9717 }
9718
9719 CInode *target = NULL;
9720 if (rollback.orig_dest.ino) {
9721 target = mdcache->get_inode(rollback.orig_dest.ino);
9722 if (target)
11fdf7f2 9723 ceph_assert(destdn && straydn);
7c673cae
FG
9724 } else if (rollback.orig_dest.remote_ino)
9725 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
9726
9727 // can't use is_auth() in the resolve stage
9728 mds_rank_t whoami = mds->get_nodeid();
f67539c2 9729 // peer
11fdf7f2
TL
9730 ceph_assert(!destdn || destdn->authority().first != whoami);
9731 ceph_assert(!straydn || straydn->authority().first != whoami);
7c673cae
FG
9732
9733 bool force_journal_src = false;
9734 bool force_journal_dest = false;
9735 if (in && in->is_dir() && srcdn->authority().first != whoami)
9736 force_journal_src = _need_force_journal(in, false);
9737 if (in && target && target->is_dir())
9738 force_journal_dest = _need_force_journal(in, true);
9739
9740 version_t srcdnpv = 0;
9741 // repair src
9742 if (srcdn) {
9743 if (srcdn->authority().first == whoami)
9744 srcdnpv = srcdn->pre_dirty();
9745 if (rollback.orig_src.ino) {
11fdf7f2 9746 ceph_assert(in);
7c673cae
FG
9747 srcdn->push_projected_linkage(in);
9748 } else
9749 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
9750 rollback.orig_src.remote_d_type);
9751 }
9752
9f95a23c 9753 map<client_t,ref_t<MClientSnap>> splits[2];
11fdf7f2 9754
f67539c2 9755 const CInode::mempool_inode *pip = nullptr;
7c673cae 9756 if (in) {
11fdf7f2 9757 bool projected;
f67539c2
TL
9758 CDir *pdir = in->get_projected_parent_dir();
9759 if (pdir->authority().first == whoami) {
9760 auto pi = in->project_inode(mut);
9761 pi.inode->version = in->pre_dirty();
9762 if (pdir != srcdir) {
9763 auto pf = pdir->project_fnode(mut);
9764 pf->version = pdir->pre_dirty();
9765 }
9766 if (pi.inode->ctime == rollback.ctime)
9767 pi.inode->ctime = rollback.orig_src.old_ctime;
11fdf7f2
TL
9768 projected = true;
9769 } else {
f67539c2
TL
9770 if (in->get_inode()->ctime == rollback.ctime) {
9771 auto _inode = CInode::allocate_inode(*in->get_inode());
9772 _inode->ctime = rollback.orig_src.old_ctime;
9773 in->reset_inode(_inode);
9774 }
11fdf7f2
TL
9775 projected = false;
9776 }
f67539c2 9777 pip = in->get_projected_inode().get();
11fdf7f2
TL
9778
9779 if (rollback.srci_snapbl.length() && in->snaprealm) {
9780 bool hadrealm;
9781 auto p = rollback.srci_snapbl.cbegin();
9782 decode(hadrealm, p);
9783 if (hadrealm) {
9784 if (projected && !mds->is_resolve()) {
9785 sr_t *new_srnode = new sr_t();
9786 decode(*new_srnode, p);
9787 in->project_snaprealm(new_srnode);
9788 } else
9789 decode(in->snaprealm->srnode, p);
9790 } else {
9791 SnapRealm *realm;
9792 if (rollback.orig_src.ino) {
9793 ceph_assert(srcdir);
9794 realm = srcdir->get_inode()->find_snaprealm();
9795 } else {
9796 realm = in->snaprealm->parent;
9797 }
9798 if (!mds->is_resolve())
9799 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
9800 if (projected)
9801 in->project_snaprealm(NULL);
9802 else
9803 in->snaprealm->merge_to(realm);
9804 }
9805 }
7c673cae
FG
9806 }
9807
7c673cae
FG
9808 // repair dest
9809 if (destdn) {
9810 if (rollback.orig_dest.ino && target) {
9811 destdn->push_projected_linkage(target);
9812 } else if (rollback.orig_dest.remote_ino) {
9813 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
9814 rollback.orig_dest.remote_d_type);
9815 } else {
9816 // the dentry will be trimmed soon; it's OK to have the wrong linkage
9817 if (rollback.orig_dest.ino)
11fdf7f2 9818 ceph_assert(mds->is_resolve());
7c673cae
FG
9819 destdn->push_projected_linkage();
9820 }
9821 }
9822
9823 if (straydn)
9824 straydn->push_projected_linkage();
9825
9826 if (target) {
11fdf7f2 9827 bool projected;
f67539c2
TL
9828 CInode::inode_ptr ti;
9829 CDir *pdir = target->get_projected_parent_dir();
9830 if (pdir->authority().first == whoami) {
9831 auto pi = target->project_inode(mut);
9832 pi.inode->version = target->pre_dirty();
9833 if (pdir != srcdir) {
9834 auto pf = pdir->project_fnode(mut);
9835 pf->version = pdir->pre_dirty();
9836 }
9837 ti = pi.inode;
11fdf7f2
TL
9838 projected = true;
9839 } else {
f67539c2 9840 ti = CInode::allocate_inode(*target->get_inode());
11fdf7f2
TL
9841 projected = false;
9842 }
f67539c2 9843
7c673cae 9844 if (ti->ctime == rollback.ctime)
91327a77 9845 ti->ctime = rollback.orig_dest.old_ctime;
7c673cae
FG
9846 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
9847 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
11fdf7f2 9848 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
7c673cae 9849 else
11fdf7f2 9850 ceph_assert(rollback.orig_dest.remote_ino &&
7c673cae
FG
9851 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
9852 } else
9853 ti->nlink++;
11fdf7f2 9854
f67539c2
TL
9855 if (!projected)
9856 target->reset_inode(ti);
9857
11fdf7f2
TL
9858 if (rollback.desti_snapbl.length() && target->snaprealm) {
9859 bool hadrealm;
9860 auto p = rollback.desti_snapbl.cbegin();
9861 decode(hadrealm, p);
9862 if (hadrealm) {
9863 if (projected && !mds->is_resolve()) {
9864 sr_t *new_srnode = new sr_t();
9865 decode(*new_srnode, p);
9866 target->project_snaprealm(new_srnode);
9867 } else
9868 decode(target->snaprealm->srnode, p);
9869 } else {
9870 SnapRealm *realm;
9871 if (rollback.orig_dest.ino) {
9872 ceph_assert(destdir);
9873 realm = destdir->get_inode()->find_snaprealm();
9874 } else {
9875 realm = target->snaprealm->parent;
9876 }
9877 if (!mds->is_resolve())
9878 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
9879 if (projected)
9880 target->project_snaprealm(NULL);
9881 else
9882 target->snaprealm->merge_to(realm);
9883 }
9884 }
7c673cae
FG
9885 }
9886
f67539c2
TL
9887 if (srcdn && srcdn->authority().first == whoami) {
9888 nest_info_t blah;
9889 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
9890 in && in->is_dir(), pip ? pip->accounted_rstat : blah);
9891 }
9892
7c673cae
FG
9893 if (srcdn)
9894 dout(0) << " srcdn back to " << *srcdn << dendl;
9895 if (in)
9896 dout(0) << " srci back to " << *in << dendl;
9897 if (destdn)
9898 dout(0) << " destdn back to " << *destdn << dendl;
9899 if (target)
9900 dout(0) << " desti back to " << *target << dendl;
9901
9902 // journal it
f67539c2
TL
9903 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
9904 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
7c673cae
FG
9905 mdlog->start_entry(le);
9906
9907 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
9908 le->commit.add_dir_context(srcdir);
9909 if (rollback.orig_src.ino)
9910 le->commit.add_primary_dentry(srcdn, 0, true);
9911 else
9912 le->commit.add_remote_dentry(srcdn, true);
9913 }
9914
9915 if (!rollback.orig_src.ino && // remote linkage
9916 in && in->authority().first == whoami) {
9917 le->commit.add_dir_context(in->get_projected_parent_dir());
9918 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9919 }
9920
9921 if (force_journal_dest) {
11fdf7f2 9922 ceph_assert(rollback.orig_dest.ino);
7c673cae
FG
9923 le->commit.add_dir_context(destdir);
9924 le->commit.add_primary_dentry(destdn, 0, true);
9925 }
9926
f67539c2 9927 // peer: no need to journal straydn
7c673cae
FG
9928
9929 if (target && target != in && target->authority().first == whoami) {
11fdf7f2 9930 ceph_assert(rollback.orig_dest.remote_ino);
7c673cae
FG
9931 le->commit.add_dir_context(target->get_projected_parent_dir());
9932 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
9933 }
9934
9935 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
9936 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
9937 le->commit.renamed_dirino = in->ino();
9938 if (srcdn->authority().first == whoami) {
9f95a23c
TL
9939 auto&& ls = in->get_dirfrags();
9940 for (const auto& dir : ls) {
7c673cae
FG
9941 if (!dir->is_auth())
9942 le->commit.renamed_dir_frags.push_back(dir->get_frag());
9943 }
9944 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
9945 }
9946 } else if (force_journal_dest) {
9947 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
9948 le->commit.renamed_dirino = target->ino();
9949 }
9950
9951 if (target && target->is_dir()) {
11fdf7f2 9952 ceph_assert(destdn);
7c673cae
FG
9953 mdcache->project_subtree_rename(target, straydir, destdir);
9954 }
9955
9956 if (in && in->is_dir()) {
11fdf7f2 9957 ceph_assert(srcdn);
7c673cae
FG
9958 mdcache->project_subtree_rename(in, destdir, srcdir);
9959 }
9960
f67539c2 9961 if (mdr && !mdr->more()->peer_update_journaled) {
11fdf7f2 9962 ceph_assert(le->commit.empty());
7c673cae
FG
9963 mdlog->cancel_entry(le);
9964 mut->ls = NULL;
11fdf7f2 9965 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
7c673cae 9966 } else {
11fdf7f2 9967 ceph_assert(!le->commit.empty());
7c673cae 9968 if (mdr)
f67539c2 9969 mdr->more()->peer_update_journaled = false;
11fdf7f2
TL
9970 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
9971 srcdn, srcdnpv, destdn, straydn,
9972 splits, finish_mdr);
7c673cae
FG
9973 submit_mdlog_entry(le, fin, mdr, __func__);
9974 mdlog->flush();
9975 }
9976}
9977
9978void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
11fdf7f2 9979 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
9f95a23c 9980 map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
7c673cae
FG
9981{
9982 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
9983
9984 if (straydn) {
9985 straydn->get_dir()->unlink_inode(straydn);
9986 straydn->pop_projected_linkage();
9987 }
9988 if (destdn) {
9989 destdn->get_dir()->unlink_inode(destdn);
9990 destdn->pop_projected_linkage();
9991 }
9992 if (srcdn) {
9993 srcdn->pop_projected_linkage();
11fdf7f2 9994 if (srcdn->authority().first == mds->get_nodeid()) {
7c673cae 9995 srcdn->mark_dirty(srcdnpv, mut->ls);
11fdf7f2
TL
9996 if (srcdn->get_linkage()->is_primary())
9997 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
9998 }
7c673cae
FG
9999 }
10000
10001 mut->apply();
10002
10003 if (srcdn && srcdn->get_linkage()->is_primary()) {
10004 CInode *in = srcdn->get_linkage()->get_inode();
7c673cae 10005 if (in && in->is_dir()) {
11fdf7f2 10006 ceph_assert(destdn);
7c673cae
FG
10007 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
10008 }
10009 }
10010
10011 if (destdn) {
10012 CInode *oldin = destdn->get_linkage()->get_inode();
10013 // update subtree map?
10014 if (oldin && oldin->is_dir()) {
11fdf7f2 10015 ceph_assert(straydn);
7c673cae
FG
10016 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
10017 }
10018 }
10019
10020 if (mds->is_resolve()) {
10021 CDir *root = NULL;
10022 if (straydn)
10023 root = mdcache->get_subtree_root(straydn->get_dir());
10024 else if (destdn)
10025 root = mdcache->get_subtree_root(destdn->get_dir());
10026 if (root)
10027 mdcache->try_trim_non_auth_subtree(root);
11fdf7f2
TL
10028 } else {
10029 mdcache->send_snaps(splits[1]);
10030 mdcache->send_snaps(splits[0]);
7c673cae
FG
10031 }
10032
10033 if (mdr) {
11fdf7f2 10034 MDSContext::vec finished;
7c673cae
FG
10035 if (mdr->more()->is_ambiguous_auth) {
10036 if (srcdn->is_auth())
10037 mdr->more()->rename_inode->unfreeze_inode(finished);
10038
10039 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10040 mdr->more()->is_ambiguous_auth = false;
10041 }
10042 mds->queue_waiters(finished);
10043 if (finish_mdr || mdr->aborted)
10044 mdcache->request_finish(mdr);
10045 else
f67539c2 10046 mdr->more()->peer_rolling_back = false;
7c673cae
FG
10047 }
10048
e306af50 10049 mdcache->finish_rollback(mut->reqid, mdr);
7c673cae
FG
10050
10051 mut->cleanup();
10052}
10053
f67539c2 10054void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7c673cae 10055{
f67539c2 10056 dout(10) << "handle_peer_rename_prep_ack " << *mdr
7c673cae
FG
10057 << " witnessed by " << ack->get_source()
10058 << " " << *ack << dendl;
10059 mds_rank_t from = mds_rank_t(ack->get_source().num());
10060
f67539c2
TL
10061 // note peer
10062 mdr->more()->peers.insert(from);
7c673cae
FG
10063 if (mdr->more()->srcdn_auth_mds == from &&
10064 mdr->more()->is_remote_frozen_authpin &&
10065 !mdr->more()->is_ambiguous_auth) {
10066 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
10067 }
10068
10069 // witnessed? or add extra witnesses?
11fdf7f2 10070 ceph_assert(mdr->more()->witnessed.count(from) == 0);
31f18b77 10071 if (ack->is_interrupted()) {
f67539c2 10072 dout(10) << " peer request interrupted, noop" << dendl;
31f18b77 10073 } else if (ack->witnesses.empty()) {
7c673cae
FG
10074 mdr->more()->witnessed.insert(from);
10075 if (!ack->is_not_journaled())
f67539c2 10076 mdr->more()->has_journaled_peers = true;
7c673cae
FG
10077 } else {
10078 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
11fdf7f2 10079 mdr->more()->extra_witnesses = ack->witnesses;
7c673cae
FG
10080 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
10081 }
10082
10083 // srci import?
10084 if (ack->inode_export.length()) {
10085 dout(10) << " got srci import" << dendl;
11fdf7f2 10086 mdr->more()->inode_import.share(ack->inode_export);
7c673cae
FG
10087 mdr->more()->inode_import_v = ack->inode_export_v;
10088 }
10089
10090 // remove from waiting list
f67539c2
TL
10091 ceph_assert(mdr->more()->waiting_on_peer.count(from));
10092 mdr->more()->waiting_on_peer.erase(from);
7c673cae 10093
f67539c2 10094 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
10095 dispatch_client_request(mdr); // go again!
10096 else
f67539c2 10097 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
10098}
10099
f67539c2 10100void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7c673cae 10101{
f67539c2 10102 dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
7c673cae 10103 << ack->get_source() << dendl;
f67539c2 10104 ceph_assert(mdr->is_peer());
7c673cae
FG
10105 mds_rank_t from = mds_rank_t(ack->get_source().num());
10106
f67539c2
TL
10107 if (mdr->more()->waiting_on_peer.count(from)) {
10108 mdr->more()->waiting_on_peer.erase(from);
7c673cae 10109
f67539c2
TL
10110 if (mdr->more()->waiting_on_peer.empty()) {
10111 if (mdr->peer_request)
10112 dispatch_peer_request(mdr);
7c673cae
FG
10113 } else
10114 dout(10) << " still waiting for rename notify acks from "
f67539c2 10115 << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
10116 }
10117}
10118
f67539c2 10119void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
7c673cae 10120{
f67539c2 10121 dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
7c673cae 10122
f67539c2
TL
10123 if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
10124 mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);
7c673cae 10125
f67539c2
TL
10126 if (mdr->more()->waiting_on_peer.empty()) {
10127 if (mdr->peer_request)
10128 dispatch_peer_request(mdr);
7c673cae
FG
10129 } else
10130 dout(10) << " still waiting for rename notify acks from "
f67539c2 10131 << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
10132 }
10133}
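
Both ack handlers drain the same `waiting_on_peer` set, and the session-flush path parks an `MDS_RANK_NONE` sentinel in it so the request cannot be re-dispatched until the flush finishes, no matter how quickly the real peers ack. A compilable sketch of that bookkeeping:

// --- illustrative sketch, not part of Server.cc ---
#include <cassert>
#include <set>

int main() {
  const int MDS_RANK_NONE = -1; // sentinel, as in the real code
  std::set<int> waiting_on_peer{1, MDS_RANK_NONE};

  waiting_on_peer.erase(1);              // rename-notify ack from mds.1
  assert(!waiting_on_peer.empty());      // still parked: sessions not yet flushed

  waiting_on_peer.erase(MDS_RANK_NONE);  // _peer_rename_sessions_flushed()
  assert(waiting_on_peer.empty());       // now safe to dispatch_peer_request()
  return 0;
}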
10134
10135// snaps
10136/* This function takes responsibility for the passed mdr*/
10137void Server::handle_client_lssnap(MDRequestRef& mdr)
10138{
9f95a23c 10139 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
10140
10141 // traverse to path
9f95a23c
TL
10142 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10143 if (!diri)
7c673cae 10144 return;
9f95a23c 10145
7c673cae 10146 if (!diri->is_dir()) {
f67539c2 10147 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
10148 return;
10149 }
10150 dout(10) << "lssnap on " << *diri << dendl;
10151
10152 // lock snap
9f95a23c 10153 if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
7c673cae
FG
10154 return;
10155
10156 if (!check_access(mdr, diri, MAY_READ))
10157 return;
10158
10159 SnapRealm *realm = diri->find_snaprealm();
11fdf7f2 10160 map<snapid_t,const SnapInfo*> infomap;
7c673cae
FG
10161 realm->get_snap_info(infomap, diri->get_oldest_snap());
10162
10163 unsigned max_entries = req->head.args.readdir.max_entries;
10164 if (!max_entries)
10165 max_entries = infomap.size();
10166 int max_bytes = req->head.args.readdir.max_bytes;
10167 if (!max_bytes)
10168 // make sure at least one item can be encoded
11fdf7f2 10169 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
7c673cae
FG
10170
10171 __u64 last_snapid = 0;
10172 string offset_str = req->get_path2();
10173 if (!offset_str.empty())
10174 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
10175
11fdf7f2 10176 // empty DirStat
7c673cae 10177 bufferlist dirbl;
11fdf7f2
TL
10178 static DirStat empty;
10179 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
7c673cae
FG
10180
10181 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
10182
10183 __u32 num = 0;
10184 bufferlist dnbl;
11fdf7f2 10185 auto p = infomap.upper_bound(last_snapid);
7c673cae
FG
10186 for (; p != infomap.end() && num < max_entries; ++p) {
10187 dout(10) << p->first << " -> " << *p->second << dendl;
10188
10189 // actual
10190 string snap_name;
10191 if (p->second->ino == diri->ino())
11fdf7f2 10192 snap_name = p->second->name;
7c673cae 10193 else
11fdf7f2 10194 snap_name = p->second->get_long_name();
7c673cae
FG
10195
10196 unsigned start_len = dnbl.length();
10197 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
10198 break;
10199
11fdf7f2
TL
10200 encode(snap_name, dnbl);
10201 // infinite lease
9f95a23c 10202 LeaseStat e(CEPH_LEASE_VALID, -1, 0);
11fdf7f2
TL
10203 mds->locker->encode_lease(dnbl, mdr->session->info, e);
10204 dout(20) << "encode_infinite_lease" << dendl;
7c673cae
FG
10205
10206 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
10207 if (r < 0) {
10208 bufferlist keep;
10209 keep.substr_of(dnbl, 0, start_len);
10210 dnbl.swap(keep);
10211 break;
10212 }
10213 ++num;
10214 }
10215
11fdf7f2 10216 encode(num, dirbl);
7c673cae
FG
10217 __u16 flags = 0;
10218 if (p == infomap.end()) {
10219 flags = CEPH_READDIR_FRAG_END;
10220 if (last_snapid == 0)
10221 flags |= CEPH_READDIR_FRAG_COMPLETE;
10222 }
11fdf7f2 10223 encode(flags, dirbl);
7c673cae
FG
10224 dirbl.claim_append(dnbl);
10225
10226 mdr->reply_extra_bl = dirbl;
10227 mdr->tracei = diri;
10228 respond_to_request(mdr, 0);
10229}
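
The reply loop above is a standard resumable listing: start strictly after the client-supplied offset (`req->get_path2()`), and stop at whichever budget runs out first, entry count or encoded bytes, leaving the client to reissue with a new offset. A toy model of the same loop with invented sizes:

// --- illustrative sketch, not part of Server.cc ---
#include <cassert>
#include <map>
#include <string>
#include <vector>

int main() {
  std::map<unsigned, std::string> snaps{{1, "daily"}, {2, "weekly"}, {3, "monthly"}};
  unsigned last_snapid = 1;               // offset resolved from the request
  size_t max_entries = 10, max_bytes = 8; // entry and byte budgets

  std::vector<std::string> out;
  size_t used = 0;
  for (auto p = snaps.upper_bound(last_snapid);  // resume after the offset
       p != snaps.end() && out.size() < max_entries; ++p) {
    if (used + p->second.size() > max_bytes)
      break;                                     // would overflow the reply
    used += p->second.size();
    out.push_back(p->second);
  }
  assert(out.size() == 1 && out[0] == "weekly"); // "monthly" didn't fit this round
  return 0;
}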
10230
10231
10232// MKSNAP
10233
10234struct C_MDS_mksnap_finish : public ServerLogContext {
10235 CInode *diri;
10236 SnapInfo info;
10237 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
10238 ServerLogContext(s, r), diri(di), info(i) {}
10239 void finish(int r) override {
10240 server->_mksnap_finish(mdr, diri, info);
10241 }
10242};
10243
10244/* This function takes responsibility for the passed mdr*/
10245void Server::handle_client_mksnap(MDRequestRef& mdr)
10246{
9f95a23c 10247 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2
TL
10248 // make sure we have as new a map as the client
10249 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
10250 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
10251 return;
10252 }
7c673cae
FG
10253 if (!mds->mdsmap->allows_snaps()) {
10254 // snapshot creation is disabled until it is explicitly enabled on the fs
522d829b 10255 dout(5) << "new snapshots are disabled for this fs" << dendl;
f67539c2 10256 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
10257 return;
10258 }
10259
9f95a23c
TL
10260 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10261 if (!diri)
7c673cae 10262 return;
7c673cae
FG
10263
10264 // dir only
10265 if (!diri->is_dir()) {
f67539c2 10266 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
10267 return;
10268 }
10269 if (diri->is_system() && !diri->is_root()) {
10270 // no snaps in system dirs (root is ok)
522d829b 10271 dout(5) << "is an internal system dir" << dendl;
f67539c2 10272 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
10273 return;
10274 }
10275
11fdf7f2 10276 std::string_view snapname = req->get_filepath().last_dentry();
7c673cae 10277
11fdf7f2 10278 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
7c673cae 10279 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
f67539c2 10280 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
10281 return;
10282 }
10283
10284 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
10285
10286 // lock snap
9f95a23c
TL
10287 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10288 MutationImpl::LockOpVec lov;
10289 lov.add_xlock(&diri->snaplock);
10290 if (!mds->locker->acquire_locks(mdr, lov))
10291 return;
7c673cae 10292
9f95a23c
TL
10293 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10294 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10295 return;
10296 }
10297 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10298 }
7c673cae 10299
9f95a23c 10300 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
7c673cae
FG
10301 return;
10302
adb31ebb
TL
10303 if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
10304 (subvol_ino && subvol_ino != diri->ino())) {
522d829b 10305 dout(5) << "is a descendent of a subvolume dir" << dendl;
f67539c2 10306 respond_to_request(mdr, -CEPHFS_EPERM);
adb31ebb
TL
10307 return;
10308 }
10309
9f95a23c
TL
10310 // check if we can create any more snapshots
10311 // we don't allow any more if we are already at or beyond the limit
10312 if (diri->snaprealm &&
10313 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
f67539c2 10314 respond_to_request(mdr, -CEPHFS_EMLINK);
7c673cae 10315 return;
9f95a23c 10316 }
7c673cae
FG
10317
10318 // make sure name is unique
10319 if (diri->snaprealm &&
10320 diri->snaprealm->exists(snapname)) {
f67539c2 10321 respond_to_request(mdr, -CEPHFS_EEXIST);
7c673cae
FG
10322 return;
10323 }
10324 if (snapname.length() == 0 ||
10325 snapname[0] == '_') {
f67539c2 10326 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
10327 return;
10328 }
10329
10330 // allocate a snapid
10331 if (!mdr->more()->stid) {
10332 // prepare an stid
10333 mds->snapclient->prepare_create(diri->ino(), snapname,
10334 mdr->get_mds_stamp(),
10335 &mdr->more()->stid, &mdr->more()->snapidbl,
10336 new C_MDS_RetryRequest(mdcache, mdr));
10337 return;
10338 }
10339
10340 version_t stid = mdr->more()->stid;
10341 snapid_t snapid;
11fdf7f2
TL
10342 auto p = mdr->more()->snapidbl.cbegin();
10343 decode(snapid, p);
7c673cae
FG
10344 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
10345
11fdf7f2
TL
10346 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10347
f67539c2
TL
10348 SnapPayload payload;
10349 if (req->get_data().length()) {
10350 try {
10351 auto iter = req->get_data().cbegin();
10352 decode(payload, iter);
10353 } catch (const ceph::buffer::error &e) {
10354 // backward compat -- client sends xattr bufferlist. however,
10355 // that is not used anywhere -- so (log and) ignore.
10356 dout(20) << ": no metadata in payload (old client?)" << dendl;
10357 }
10358 }
10359
7c673cae
FG
10360 // journal
10361 SnapInfo info;
10362 info.ino = diri->ino();
10363 info.snapid = snapid;
11fdf7f2 10364 info.name = snapname;
7c673cae 10365 info.stamp = mdr->get_op_stamp();
f67539c2 10366 info.metadata = payload.metadata;
7c673cae 10367
f67539c2
TL
10368 auto pi = diri->project_inode(mdr, false, true);
10369 pi.inode->ctime = info.stamp;
10370 if (info.stamp > pi.inode->rstat.rctime)
10371 pi.inode->rstat.rctime = info.stamp;
10372 pi.inode->rstat.rsnaps++;
10373 pi.inode->version = diri->pre_dirty();
7c673cae
FG
10374
10375 // project the snaprealm
94b18763
FG
10376 auto &newsnap = *pi.snapnode;
10377 newsnap.created = snapid;
10378 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
10379 if (!em.second)
10380 em.first->second = info;
10381 newsnap.seq = snapid;
10382 newsnap.last_created = snapid;
7c673cae
FG
10383
10384 // journal the inode changes
10385 mdr->ls = mdlog->get_current_segment();
10386 EUpdate *le = new EUpdate(mdlog, "mksnap");
10387 mdlog->start_entry(le);
10388
10389 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10390 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10391 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10392 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10393
10394 // journal the snaprealm changes
10395 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
10396 mdr, __func__);
10397 mdlog->flush();
10398}
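
mksnap and rmsnap drive the snap table with the same two-phase protocol: reserve a transaction id (and snapid) with prepare, journal the metadata change carrying that stid, then commit the stid once the journal entry is durable, which is why `_mksnap_finish` calls `snapclient->commit()` only after `mdr->apply()`. A toy sketch of the handshake; `ToySnapClient` is invented, not the real SnapClient API:

// --- illustrative sketch, not part of Server.cc ---
#include <cassert>

struct ToySnapClient {
  unsigned next_stid = 1, next_snapid = 100, committed = 0;
  void prepare(unsigned* stid, unsigned* snapid) {
    *stid = next_stid++;     // transaction id for the TABLE_SNAP entry
    *snapid = next_snapid++; // the snapid the new snapshot will use
  }
  void commit(unsigned stid) { committed = stid; }
};

int main() {
  ToySnapClient snapclient;
  unsigned stid = 0, snapid = 0;
  snapclient.prepare(&stid, &snapid); // handle_client_mksnap, before journaling
  // ... journal EUpdate("mksnap") with a TABLE_SNAP transaction on stid ...
  snapclient.commit(stid);            // _mksnap_finish, after the entry commits
  assert(snapclient.committed == stid && snapid == 100);
  return 0;
}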
10399
10400void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
10401{
10402 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
10403
10404 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
10405
7c673cae
FG
10406 mdr->apply();
10407
10408 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
10409
10410 // create snap
10411 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10412
11fdf7f2
TL
10413 // notify other mds
10414 mdcache->send_snap_update(diri, mdr->more()->stid, op);
10415
7c673cae
FG
10416 mdcache->do_realm_invalidate_and_update_notify(diri, op);
10417
10418 // yay
10419 mdr->in[0] = diri;
10420 mdr->snapid = info.snapid;
10421 mdr->tracei = diri;
10422 respond_to_request(mdr, 0);
10423}
10424
10425
10426// RMSNAP
10427
10428struct C_MDS_rmsnap_finish : public ServerLogContext {
10429 CInode *diri;
10430 snapid_t snapid;
10431 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10432 ServerLogContext(s, r), diri(di), snapid(sn) {}
10433 void finish(int r) override {
10434 server->_rmsnap_finish(mdr, diri, snapid);
10435 }
10436};

/* This function takes responsibility for the passed mdr*/
void Server::handle_client_rmsnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) {
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }

  std::string_view snapname = req->get_filepath().last_dentry();

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    dout(20) << "rmsnap " << snapname << " on " << *diri
             << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;

  // does snap exist?
  if (snapname.length() == 0 || snapname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);   // can't prune a parent snap, currently.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return;
  }
  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
  dout(10) << " snapname " << snapname << " is " << snapid << dendl;

  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
        return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }
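  // Past this point diri->snaplock is held exclusively, serializing this
  // change against other snapshot operations on the realm, and the
  // parent's snap layout is held for read so an ancestor's snapshot
  // configuration cannot change underneath us.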

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_destroy(diri->ino(), snapid,
                                     &mdr->more()->stid, &mdr->more()->snapidbl,
                                     new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
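  // The block above is the usual two-phase snaptable idiom: on the first
  // pass prepare_destroy() is submitted and we return; once the snaptable
  // has assigned a transaction id (stid) and filled in snapidbl,
  // C_MDS_RetryRequest re-dispatches this request, which then falls
  // through here with stid set and journals the change.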
  version_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->version = diri->pre_dirty();
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->rstat.rsnaps--;

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rmsnap");
  mdlog->start_entry(le);

  // project the snaprealm
  auto &newnode = *pi.snapnode;
  newnode.snaps.erase(snapid);
  newnode.seq = seq;
  newnode.last_destroyed = seq;

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}
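
/*
 * Client-side view (a sketch, not part of this file): removing a
 * snapshot is rmdir(2) of its entry under ".snap", encoded as
 * CEPH_MDS_OP_RMSNAP.  Paths are hypothetical.
 *
 *   #include <unistd.h>
 *
 *   int remove_snapshot() {
 *     // arrives at handle_client_rmsnap() on the auth MDS for "mydir"
 *     return rmdir("/mnt/cephfs/mydir/.snap/snap1");
 *   }
 */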

void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
  snapid_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);

  mdr->apply();

  mds->snapclient->commit(stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);

  // yay
  mdr->in[0] = diri;
  respond_to_request(mdr, 0);

  // purge snapshot data
  diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
}
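
// purge_stale_snap_data() above only drops this inode's stale
// past-snapshot metadata that no longer belongs to any live snapshot;
// reclaiming the snapshotted object data itself happens later, on the
// OSDs, as part of snap trimming.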

struct C_MDS_renamesnap_finish : public ServerLogContext {
  CInode *diri;
  snapid_t snapid;
  C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_renamesnap_finish(mdr, diri, snapid);
  }
};

/* This function takes responsibility for the passed mdr*/
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) { // dir only
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  std::string_view dstname = req->get_filepath().last_dentry();
  std::string_view srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);   // can't rename a parent snap.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return;
  }
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -CEPHFS_EEXIST);
    return;
  }

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
        return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
                                    &mdr->more()->stid,
                                    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  dout(10) << " stid is " << stid << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->version = diri->pre_dirty();

  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  auto it = newsnap.snaps.find(snapid);
  ceph_assert(it != newsnap.snaps.end());
  it->second.name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}
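
/*
 * Client-side view (a sketch, not part of this file): a snapshot is
 * renamed with rename(2) between two names inside the same ".snap"
 * directory, encoded as CEPH_MDS_OP_RENAMESNAP; both paths resolving to
 * the same directory inode is exactly what the filepath/filepath2 ino
 * check above enforces.  Paths are hypothetical.
 *
 *   #include <cstdio>
 *
 *   int rename_snapshot() {
 *     return std::rename("/mnt/cephfs/mydir/.snap/snap1",
 *                        "/mnt/cephfs/mydir/.snap/snap2");
 *   }
 */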

void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);

  // yay
  mdr->in[0] = diri;
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
}

/**
 * Return true if server is in state RECONNECT and this
 * client has not yet reconnected.
 */
bool Server::waiting_for_reconnect(client_t c) const
{
  return client_reconnect_gather.count(c) > 0;
}
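
// A hypothetical call-site sketch (not from this file): handlers that
// cannot make progress for a client which has not yet re-established its
// session after an MDS restart can park the request, e.g.
//
//   if (waiting_for_reconnect(client)) {
//     dout(10) << "deferring op for client." << client
//              << " until it reconnects" << dendl;
//     return;  // the request is re-queued and retried later
//   }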

void Server::dump_reconnect_status(Formatter *f) const
{
  f->open_object_section("reconnect_status");
  f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
  f->close_section();
}
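
// Illustrative output through a JSON Formatter (the exact rendering of
// the client set depends on the Formatter and the operator<< in use):
//
//   {
//     "reconnect_status": {
//       "client_reconnect_gather": "4235,4587"
//     }
//   }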