// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/ceph_assert.h"  // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/range/adaptor/reversed.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"
#include "MetricsHandler.h"
#include "cephfs_features.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "events/EUpdate.h"
#include "events/EPeerUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"
#include "events/EPurged.h"

#include "include/stringify.h"
#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"

#include <errno.h>

#include <list>
#include <regex>
#include <string_view>
#include <functional>

#include "common/config.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "

class ServerContext : public MDSContext {
  protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  public:
  explicit ServerContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
};

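/*
 * A BatchOp that coalesces concurrent getattr/lookup requests on the same
 * dentry or inode behind a single "head" request: the queued requests are
 * all replied to (or forwarded) with the head request's result.
 */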
class Batch_Getattr_Lookup : public BatchOp {
protected:
  Server* server;
  ceph::ref_t<MDRequestImpl> mdr;
  std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
  int res = 0;
public:
  Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
    : server(s), mdr(r) {
    if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP)
      mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
    else
      mdr->batch_op_map = &mdr->in[0]->batch_ops;
  }
  void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
    batch_reqs.push_back(r);
  }
  ceph::ref_t<MDRequestImpl> find_new_head() override {
    while (!batch_reqs.empty()) {
      auto r = std::move(batch_reqs.back());
      batch_reqs.pop_back();
      if (r->killed)
        continue;

      r->batch_op_map = mdr->batch_op_map;
      mdr->batch_op_map = nullptr;
      mdr = r;
      return mdr;
    }
    return nullptr;
  }
  void _forward(mds_rank_t t) override {
    MDCache* mdcache = server->mdcache;
    mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
    mdr->set_mds_stamp(ceph_clock_now());
    for (auto& m : batch_reqs) {
      if (!m->killed)
        mdcache->request_forward(m, t);
    }
    batch_reqs.clear();
  }
  void _respond(int r) override {
    mdr->set_mds_stamp(ceph_clock_now());
    for (auto& m : batch_reqs) {
      if (!m->killed) {
        m->tracei = mdr->tracei;
        m->tracedn = mdr->tracedn;
        server->respond_to_request(m, r);
      }
    }
    batch_reqs.clear();
    server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
  }
  void print(std::ostream& o) {
    o << "[batch front=" << *mdr << "]";
  }
};

class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    ceph_assert(server != NULL);
  }
};

void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);

  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_peer_request, "handle_peer_request",
                      "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_client_session,
                      "handle_client_session", "Client session messages", "hcs",
                      PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
                      "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
                      "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
                      PerfCountersBuilder::PRIO_INTERESTING);

  // fop latencies are useful
  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
                   "Request type lookup hash of inode latency");
  plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
                   "Request type lookup inode latency");
  plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
                   "Request type lookup parent latency");
  plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
                   "Request type lookup name latency");
  plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
                   "Request type lookup latency");
  plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
                   "Request type lookup snapshot latency");
  plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
                   "Request type get attribute latency");
  plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
                   "Request type set attribute latency");
  plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
                   "Request type set file layout latency");
  plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
                   "Request type set directory layout latency");
  plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
                   "Request type set extended attribute latency");
  plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
                   "Request type remove extended attribute latency");
  plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
                   "Request type read directory latency");
  plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
                   "Request type set file lock latency");
  plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
                   "Request type get file lock latency");
  plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
                   "Request type create latency");
  plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
                   "Request type open latency");
  plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
                   "Request type make node latency");
  plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
                   "Request type link latency");
  plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
                   "Request type unlink latency");
  plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
                   "Request type remove directory latency");
  plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
                   "Request type rename latency");
  plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
                   "Request type make directory latency");
  plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
                   "Request type symbolic link latency");
  plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
                   "Request type list snapshot latency");
  plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
                   "Request type make snapshot latency");
  plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
                   "Request type remove snapshot latency");
  plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
                   "Request type rename snapshot latency");

  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
                      "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_peer_request, "dispatch_server_request",
                      "Server requests dispatched");

  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}

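/*
 * The constructor caches the tunables consulted on hot paths; they are kept
 * up to date with runtime config changes by handle_conf_change() below.
 */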
Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
  metrics_handler(metrics_handler)
{
  forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
  replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
  cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
  max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
  delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
  max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
  max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
  caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
}

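/*
 * Entry point for messages the MDSRank routes to the Server. Reconnects are
 * handled unconditionally; client requests may be queued for replay or until
 * this MDS becomes active, and everything else is dispatched to the matching
 * handler below.
 */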
void Server::dispatch(const cref_t<Message> &m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(ref_cast<MClientReconnect>(m));
    return;
  }

/*
 * In the reconnect phase, clients may have sent unsafe requests to the MDS
 * before their reconnect msg. Setting sessionclosed_isok handles scenarios
 * like this:
 *
 * 1. In the reconnect phase, a client sent unsafe requests to the MDS.
 * 2. The reconnect timeout was reached. All sessions that did not send a
 *    reconnect msg in time, some of which may have sent unsafe requests,
 *    are marked as closed. (Another situation is #31668, which denies all
 *    client reconnect msgs to speed up reboot.)
 * 3. These unsafe requests, from sessions that did not send a reconnect msg
 *    in time or that were denied, can then be handled in the clientreplay
 *    phase.
 */
  bool sessionclosed_isok = replay_unsafe_with_closed_session;
  // active?
  // handle_peer_request()/handle_client_session() will wait if necessary
  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
    const auto &req = ref_cast<MClientRequest>(m);
    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
      Session *session = mds->get_session(req);
      if (!session || (!session->is_open() && !sessionclosed_isok)) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        return;
      }
      bool queue_replay = false;
      if (req->is_replay() || req->is_async()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
        if (req->head.ino &&
            !session->have_completed_request(req->get_reqid().tid, nullptr)) {
          inodeno_t ino(req->head.ino);
          mdcache->add_replay_ino_alloc(ino);
          if (replay_unsafe_with_closed_session &&
              session->free_prealloc_inos.contains(ino)) {
            // don't purge inodes that will be created by later replay
            session->free_prealloc_inos.erase(ino);
            session->delegated_inos.insert(ino);
          }
        }
      } else if (req->get_retry_attempt()) {
        // Process completed requests in the clientreplay stage. A completed
        // request might have created a new file/directory; this guarantees
        // the MDS sends a reply to the client before another request
        // modifies the new file/directory.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (mds->is_stopping()) {
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      if (req->is_queued_for_replay()) {
        wait_for_active = false;
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(ref_cast<MClientRequest>(m));
    return;
  case CEPH_MSG_CLIENT_RECLAIM:
    handle_client_reclaim(ref_cast<MClientReclaim>(m));
    return;
  case MSG_MDS_PEER_REQUEST:
    handle_peer_request(ref_cast<MMDSPeerRequest>(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    ceph_abort_msg("server unknown message");
  }
}



// ----------------------------------------------------------
// SESSION management

class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos_to_free;
  version_t inotablev;
  interval_set<inodeno_t> inos_to_purge;
  LogSegment *ls = nullptr;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = nullptr) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
                       const interval_set<inodeno_t>& to_free, version_t iv,
                       const interval_set<inodeno_t>& to_purge, LogSegment *_ls, Context *fin_ = nullptr) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv),
    inos_to_free(to_free), inotablev(iv), inos_to_purge(to_purge), ls(_ls), fin(fin_) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos_to_free, inotablev, inos_to_purge, ls);
    if (fin) {
      fin->complete(r);
    }
  }
};

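/*
 * Find the session that registered the given client-supplied uuid. While a
 * reclaim is in progress two sessions may carry the same uuid; in that case
 * the reclaiming session is returned.
 */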
Session* Server::find_session_by_uuid(std::string_view uuid)
{
  Session* session = nullptr;
  for (auto& it : mds->sessionmap.get_sessions()) {
    auto& metadata = it.second->info.client_metadata;

    auto p = metadata.find("uuid");
    if (p == metadata.end() || p->second != uuid)
      continue;

    if (!session) {
      session = it.second;
    } else if (!session->reclaiming_from) {
      assert(it.second->reclaiming_from == session);
      session = it.second;
    } else {
      assert(session->reclaiming_from == it.second);
    }
  }
  return session;
}

void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
{
  if (!session->is_open() && !session->is_stale()) {
    dout(10) << "session not open, dropping this req" << dendl;
    return;
  }

  auto reply = make_message<MClientReclaimReply>(0);
  if (m->get_uuid().empty()) {
    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
    reply->set_result(-CEPHFS_EINVAL);
    mds->send_message_client(reply, session);
    return;
  }

  unsigned flags = m->get_flags();
  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
    dout(10) << __func__ << " unsupported flags" << dendl;
    reply->set_result(-CEPHFS_EOPNOTSUPP);
    mds->send_message_client(reply, session);
    return;
  }

  Session* target = find_session_by_uuid(m->get_uuid());
  if (target) {
    if (session->info.auth_name != target->info.auth_name) {
      dout(10) << __func__ << " session auth_name " << session->info.auth_name
               << " != target auth_name " << target->info.auth_name << dendl;
      reply->set_result(-CEPHFS_EPERM);
      mds->send_message_client(reply, session);
    }

    assert(!target->reclaiming_from);
    assert(!session->reclaiming_from);
    session->reclaiming_from = target;
    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
  }

  if (flags & CEPH_RECLAIM_RESET) {
    finish_reclaim_session(session, reply);
    return;
  }

  ceph_abort();
}

void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
{
  Session *target = session->reclaiming_from;
  if (target) {
    session->reclaiming_from = nullptr;

    Context *send_reply;
    if (reply) {
      int64_t session_id = session->get_client().v;
      send_reply = new LambdaContext([this, session_id, reply](int r) {
        assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
        Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
        if (!session) {
          return;
        }
        auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
        reply->set_epoch(epoch);
        mds->send_message_client(reply, session);
      });
    } else {
      send_reply = nullptr;
    }

    bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
      return map.is_blocklisted(target->info.inst.addr);
    });

    if (blocklisted || !g_conf()->mds_session_blocklist_on_evict) {
      kill_session(target, send_reply);
    } else {
      CachedStackStringStream css;
      mds->evict_client(target->get_client().v, false, true, *css, send_reply);
    }
  } else if (reply) {
    mds->send_message_client(reply, session);
  }
}

void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
{
  Session *session = mds->get_session(m);
  dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    return;
  }

  std::string_view fs_name = mds->get_fs_name();
  if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
    dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
    return;
  }

  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
    finish_reclaim_session(session);
  } else {
    reclaim_session(session, m);
  }
}

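/*
 * Handle client session control messages (open, renewcaps, close, flush
 * acks). An open request is checked against the OSDMap blocklist, required
 * client features, the claimed root path and uuid uniqueness before the
 * session open is journaled.
 */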
9f95a23c 529void Server::handle_client_session(const cref_t<MClientSession> &m)
7c673cae
FG
530{
531 version_t pv;
94b18763 532 Session *session = mds->get_session(m);
7c673cae
FG
533
534 dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
11fdf7f2 535 ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
7c673cae
FG
536
537 if (!session) {
538 dout(0) << " ignoring sessionless msg " << *m << dendl;
9f95a23c 539 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
92f5a8d4
TL
540 reply->metadata["error_string"] = "sessionless";
541 mds->send_message(reply, m->get_connection());
7c673cae
FG
542 return;
543 }
544
f67539c2
TL
545 std::string_view fs_name = mds->get_fs_name();
546 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
547 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
548 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
549 reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
550 std::string(fs_name) + "\"";
551 mds->send_message(std::move(reply), m->get_connection());
552 return;
553 }
554
94b18763
FG
555 if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
556 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
557 } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
558 // close requests need to be handled when mds is active
559 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
560 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
561 return;
562 }
563 } else {
564 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
565 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
566 return;
567 }
568 }
569
7c673cae
FG
570 if (logger)
571 logger->inc(l_mdss_handle_client_session);
572
573 uint64_t sseq = 0;
574 switch (m->get_op()) {
575 case CEPH_SESSION_REQUEST_OPEN:
576 if (session->is_opening() ||
577 session->is_open() ||
578 session->is_stale() ||
28e407b8
AA
579 session->is_killing() ||
580 terminating_sessions) {
7c673cae 581 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
7c673cae
FG
582 return;
583 }
11fdf7f2 584 ceph_assert(session->is_closed() || session->is_closing());
7c673cae 585
b32b8144
FG
586 if (mds->is_stopping()) {
587 dout(10) << "mds is stopping, dropping open req" << dendl;
b32b8144
FG
588 return;
589 }
590
a8e16298
TL
591 {
592 auto& addr = session->info.inst.addr;
9f95a23c 593 session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
a8e16298
TL
594 auto& client_metadata = session->info.client_metadata;
595
11fdf7f2 596 auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
a8e16298
TL
597 auto now = ceph_clock_now();
598 auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
599 auto elapsed = now - m->get_recv_stamp();
11fdf7f2
TL
600 CachedStackStringStream css;
601 *css << "New client session:"
a8e16298
TL
602 << " addr=\"" << session->info.inst.addr << "\""
603 << ",elapsed=" << elapsed
604 << ",throttled=" << throttle_elapsed
605 << ",status=\"" << status << "\"";
606 if (!err.empty()) {
11fdf7f2 607 *css << ",error=\"" << err << "\"";
a8e16298
TL
608 }
609 const auto& metadata = session->info.client_metadata;
11fdf7f2
TL
610 if (auto it = metadata.find("root"); it != metadata.end()) {
611 *css << ",root=\"" << it->second << "\"";
a8e16298 612 }
11fdf7f2
TL
613 dout(2) << css->strv() << dendl;
614 };
615
616 auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str) {
9f95a23c 617 auto m = make_message<MClientSession>(CEPH_SESSION_REJECT);
11fdf7f2
TL
618 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
619 m->metadata["error_string"] = err_str;
620 mds->send_message_client(m, session);
621 log_session_status("REJECTED", err_str);
a8e16298 622 };
7c673cae 623
f67539c2 624 bool blocklisted = mds->objecter->with_osdmap(
11fdf7f2 625 [&addr](const OSDMap &osd_map) -> bool {
f67539c2 626 return osd_map.is_blocklisted(addr);
11fdf7f2
TL
627 });
628
f67539c2
TL
629 if (blocklisted) {
630 dout(10) << "rejecting blocklisted client " << addr << dendl;
631 // This goes on the wire and the "blacklisted" substring is
632 // depended upon by the kernel client for detecting whether it
633 // has been blocklisted. If mounted with recover_session=clean
634 // (since 5.4), it tries to automatically recover itself from
635 // blocklisting.
636 send_reject_message("blocklisted (blacklisted)");
11fdf7f2
TL
637 session->clear();
638 break;
7c673cae 639 }
7c673cae 640
11fdf7f2
TL
641 if (client_metadata.features.empty())
642 infer_supported_features(session, client_metadata);
643
644 dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
9f95a23c
TL
645 dout(20) << " features: '" << client_metadata.features << "'" << dendl;
646 dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
11fdf7f2
TL
647 for (const auto& p : client_metadata) {
648 dout(20) << " " << p.first << ": " << p.second << dendl;
649 }
650
651 feature_bitset_t missing_features = required_client_features;
652 missing_features -= client_metadata.features;
653 if (!missing_features.empty()) {
f67539c2
TL
654 CachedStackStringStream css;
655 *css << "missing required features '" << missing_features << "'";
656 send_reject_message(css->strv());
92f5a8d4
TL
657 mds->clog->warn() << "client session (" << session->info.inst
658 << ") lacks required features " << missing_features
659 << "; client supports " << client_metadata.features;
11fdf7f2
TL
660 session->clear();
661 break;
a8e16298 662 }
7c673cae 663
a8e16298
TL
664 // Special case for the 'root' metadata path; validate that the claimed
665 // root is actually within the caps of the session
11fdf7f2
TL
666 if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
667 auto claimed_root = it->second;
f67539c2 668 CachedStackStringStream css;
11fdf7f2
TL
669 bool denied = false;
670 // claimed_root has a leading "/" which we strip before passing
671 // into caps check
672 if (claimed_root.empty() || claimed_root[0] != '/') {
673 denied = true;
          *css << "invalid root '" << claimed_root << "'";
        } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
          denied = true;
          *css << "non-allowable root '" << claimed_root << "'";
        }

        if (denied) {
          // Tell the client we're rejecting their open
          send_reject_message(css->strv());
          mds->clog->warn() << "client session with " << css->strv()
                            << " denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
        if (find_session_by_uuid(it->second)) {
          send_reject_message("duplicated session uuid");
          mds->clog->warn() << "client session with duplicated session uuid '"
                            << it->second << "' denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (session->is_closed()) {
        mds->sessionmap.add_session(session);
      }

      pv = mds->sessionmap.mark_projected(session);
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      mds->sessionmap.touch_session(session);
      auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
        ceph_assert(r == 0);
        log_session_status("ACCEPTED", "");
      });
      mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
                                new C_MDS_session_finish(this, session, sseq, true, pv, fin));
      mdlog->flush();
    }
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() || session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
      mds->send_message_client(reply, session);
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        return;
      }
      ceph_assert(session->is_open() ||
                  session->is_stale() ||
                  session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle the same as any other seqn error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    if (mds->is_active())
      mdlog->flush();
    break;

  default:
    ceph_abort();
  }
}

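/*
 * Ask one client to flush its session messages, gathering completion via the
 * caller's MDSGatherBuilder. Sessions without an open connection supporting
 * CEPH_FEATURE_EXPORT_PEER are skipped.
 */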
void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
  if (!session->is_open() ||
      !session->get_connection() ||
      !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
    return;
  }

  version_t seq = session->wait_for_flush(gather.new_sub());
  mds->send_message_client(
    make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
}

void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
  for (const auto& client : client_set) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
    ceph_assert(session);
    flush_session(session, gather);
  }
}

void Server::finish_flush_session(Session *session, version_t seq)
{
  MDSContext::vec finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}

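/*
 * Journal-commit callback for session state changes (see
 * C_MDS_session_finish): on open, finalize the session and notify the
 * client; on close/kill, release or purge the session's preallocated inos
 * and tear down its caps, leases and connection.
 */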
void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             const interval_set<inodeno_t>& inos_to_free, version_t piv,
                             const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
{
  dout(10) << "_session_logged " << session->info.inst
           << " state_seq " << state_seq
           << " " << (open ? "open":"close") << " " << pv
           << " inos_to_free " << inos_to_free << " inotablev " << piv
           << " inos_to_purge " << inos_to_purge << dendl;

  if (!open) {
    if (inos_to_purge.size()) {
      ceph_assert(ls);
      session->info.prealloc_inos.subtract(inos_to_purge);
      ls->purging_inodes.insert(inos_to_purge);
      if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())
        mdcache->purge_inodes(inos_to_purge, ls);
    }

    if (inos_to_free.size()) {
      ceph_assert(piv);
      ceph_assert(session->is_closing() || session->is_killing() ||
                  session->is_opening()); // re-open closing session
      session->info.prealloc_inos.subtract(inos_to_free);
      mds->inotable->apply_release_ids(inos_to_free);
      ceph_assert(mds->inotable->get_version() == piv);
    }
    session->free_prealloc_inos = session->info.prealloc_inos;
    session->delegated_inos.clear();
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    ceph_assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    metrics_handler->add_session(session);
    ceph_assert(session->get_connection());
    auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
      reply->supported_features = supported_features;
    mds->send_message_client(reply, session);
    if (mdcache->is_readonly()) {
      auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
      mds->send_message_client(m, session);
    }
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    bool killing = session->is_killing();
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, cap, killing);
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }
    if (client_reclaim_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reclaim set" << dendl;
      if (client_reclaim_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
        mds->maybe_clientreplay_done();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->get_connection()) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->get_connection()->mark_disposable();
      }

      // reset session
      mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      metrics_handler->remove_session(session);
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->get_connection()) {
        session->get_connection()->mark_down();
        mds->sessionmap.set_state(session, Session::STATE_CLOSED);
        session->set_connection(nullptr);
      }
      metrics_handler->remove_session(session);
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t,client_metadata_t>& cmm,
                                              map<client_t, pair<Session*,uint64_t> >& smap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;

  mds->objecter->with_osdmap(
      [this, &cm, &cmm](const OSDMap &osd_map) {
        for (auto p = cm.begin(); p != cm.end(); ) {
          if (osd_map.is_blocklisted(p->second.addr)) {
            dout(10) << " ignoring blocklisted client." << p->first
                     << " (" << p->second.addr << ")" << dendl;
            cmm.erase(p->first);
            cm.erase(p++);
          } else {
            ++p;
          }
        }
      });

  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    uint64_t sseq;
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing()) {
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      auto q = cmm.find(p->first);
      if (q != cmm.end())
        session->info.client_metadata.merge(q->second);
    } else {
      ceph_assert(session->is_open() ||
                  session->is_opening() ||
                  session->is_stale());
      sseq = 0;
    }
    smap[p->first] = make_pair(session, sseq);
    session->inc_importing();
  }
  return pv;
}

void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  for (auto &it : smap) {
    Session *session = it.second.first;
    uint64_t sseq = it.second.second;
    if (sseq > 0) {
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);
        metrics_handler->add_session(session);

        auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          reply->supported_features = supported_features;
        mds->send_message_client(reply, session);

        if (mdcache->is_readonly())
          mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      ceph_assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
  public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(5) << "terminating all sessions..." << dendl;

  terminating_sessions = true;

  // kill them off.  clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}

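/*
 * Periodic scan for sessions that have stopped renewing caps: laggy open
 * sessions are marked stale (or deferred while they hold nothing worth
 * revoking), and stale sessions past the autoclose cutoff are evicted,
 * optionally blocklisting them on the OSDs.
 */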
void Server::find_idle_sessions()
{
  auto now = clock::now();
  auto last_cleared_laggy = mds->last_cleared_laggy();

  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;

  // timeout/stale
  //  (caps go stale, lease die)
  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();

  // don't kick clients if we've been laggy
  if (last_cleared_laggy < cutoff) {
    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
             << "), not marking any client stale" << dendl;
    return;
  }

  std::vector<Session*> to_evict;

  bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
    std::vector<Session*> new_stale;

    for (auto session : *(sessions_p1->second)) {
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "laggiest active session is " << session->info.inst
                 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
        break;
      }

      if (session->last_seen > session->last_cap_renew) {
        last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
        if (last_cap_renew_span < cutoff) {
          dout(20) << "laggiest active session is " << session->info.inst
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }
      }

      if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
        dout(20) << "evicting session " << session->info.inst << " since autoclose "
                    "has arrived" << dendl;
        // evict session without marking it stale
        to_evict.push_back(session);
        continue;
      }

      if (defer_session_stale &&
          !session->is_any_flush_waiter() &&
          !mds->locker->is_revoking_any_caps_from(session->get_client())) {
        dout(20) << "deferring marking session " << session->info.inst << " stale "
                    "since it holds no caps" << dendl;
        continue;
      }

      auto it = session->info.client_metadata.find("timeout");
      if (it != session->info.client_metadata.end()) {
        unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
        if (timeout == 0) {
          dout(10) << "skipping session " << session->info.inst
                   << ", infinite timeout specified" << dendl;
          continue;
        }
        double cutoff = queue_max_age + timeout;
        if (last_cap_renew_span < cutoff) {
          dout(10) << "skipping session " << session->info.inst
                   << ", timeout (" << timeout << ") specified"
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }

        // do not go through stale, evict it directly.
        to_evict.push_back(session);
      } else {
        dout(10) << "new stale session " << session->info.inst
                 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
        new_stale.push_back(session);
      }
    }

    for (auto session : new_stale) {
      mds->sessionmap.set_state(session, Session::STATE_STALE);
      if (mds->locker->revoke_stale_caps(session)) {
        mds->locker->remove_stale_leases(session);
        finish_flush_session(session, session->get_push_seq());
        auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
        mds->send_message_client(m, session);
      } else {
        to_evict.push_back(session);
      }
    }
  }

  // autoclose
  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();

  // Collect a list of sessions exceeding the autoclose threshold
  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
    for (auto session : *(sessions_p2->second)) {
      assert(session->is_stale());
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "oldest stale session is " << session->info.inst
                 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
        break;
      }
      to_evict.push_back(session);
    }
  }

  for (auto session : to_evict) {
    if (session->is_importing()) {
      dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
      continue;
    }

    auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << last_cap_renew_span << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst
             << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;

    if (g_conf()->mds_session_blocklist_on_timeout) {
      CachedStackStringStream css;
      mds->evict_client(session->get_client().v, false, true, *css, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}

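/*
 * Evict clients that have not responded to a cap revoke within
 * mds_cap_revoke_eviction_timeout seconds; a timeout of 0 disables this
 * check.
 */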
void Server::evict_cap_revoke_non_responders() {
  if (!cap_revoke_eviction_timeout) {
    return;
  }

  auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);

  for (auto const &client : to_evict) {
    mds->clog->warn() << "client id " << client << " has not responded to"
                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
                      << " seconds, evicting";
    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
            << client << dendl;

    CachedStackStringStream css;
    bool evicted = mds->evict_client(client.v, false,
                                     g_conf()->mds_session_blocklist_on_evict,
                                     *css, nullptr);
    if (evicted && logger) {
      logger->inc(l_mdss_cap_revoke_eviction);
    }
  }
}

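/*
 * Refresh the cached tunables set up in the constructor whenever the
 * corresponding config options change at runtime.
 */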
void Server::handle_conf_change(const std::set<std::string>& changed) {
  if (changed.count("mds_forward_all_requests_to_auth")) {
    forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
  }
  if (changed.count("mds_cap_revoke_eviction_timeout")) {
    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
    dout(20) << __func__ << " cap revoke eviction timeout changed to "
             << cap_revoke_eviction_timeout << dendl;
  }
  if (changed.count("mds_recall_max_decay_rate")) {
    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
  }
  if (changed.count("mds_max_snaps_per_dir")) {
    max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
    dout(20) << __func__ << " max snapshots per directory changed to "
             << max_snaps_per_dir << dendl;
  }
  if (changed.count("mds_client_delegate_inos_pct")) {
    delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
  }
  if (changed.count("mds_max_caps_per_client")) {
    max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  }
  if (changed.count("mds_session_cap_acquisition_throttle")) {
    cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
  }
  if (changed.count("mds_session_max_caps_throttle_ratio")) {
    max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
  }
  if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
    caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
  }
  if (changed.count("mds_alternate_name_max")) {
    alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
  }
}

/*
 * XXX bump in the interface here, not using an MDSContext here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe)
{
  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    if (session->is_closing() ||
        session->is_killing()) {
      if (on_safe)
        mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
    } else {
      ceph_assert(session->is_closed() ||
                  session->is_importing());
      if (on_safe)
        on_safe->complete(0);
    }
  }
}

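/*
 * Kill the sessions of all clients found in the given OSDMap blocklist.
 * MDS entries are skipped since MDS deaths are learned from the MDSMap.
 * Returns the number of sessions killed.
 */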
size_t Server::apply_blocklist(const std::set<entity_addr_t> &blocklist)
{
  bool prenautilus = mds->objecter->with_osdmap(
      [&](const OSDMap& o) {
        return o.require_osd_release < ceph_release_t::nautilus;
      });

  std::vector<Session*> victims;
  const auto& sessions = mds->sessionmap.get_sessions();
  for (const auto& p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blocklist to MDS daemons, we find out
      // about their death via MDSMap.
      continue;
    }

    Session *s = p.second;
    auto inst_addr = s->info.inst.addr;
    // blocklist entries are always TYPE_ANY for nautilus+
    inst_addr.set_type(entity_addr_t::TYPE_ANY);
    if (blocklist.count(inst_addr)) {
      victims.push_back(s);
      continue;
    }
    if (prenautilus) {
      // ...except pre-nautilus, they were TYPE_LEGACY
      inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
      if (blocklist.count(inst_addr)) {
        victims.push_back(s);
      }
    }
  }

  for (const auto& s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blocklist: killed " << victims.size() << dendl;

  return victims.size();
}

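/*
 * Move a session into CLOSING/KILLING state and journal an ESession close,
 * releasing its preallocated inos; on_safe fires once the event is safely
 * logged. Any requests still attached to the session are killed right away.
 */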
void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  dout(10) << __func__ << " : "
           << session->info.inst
           << " pending_prealloc_inos " << session->pending_prealloc_inos
           << " free_prealloc_inos " << session->free_prealloc_inos
           << " delegated_inos " << session->delegated_inos << dendl;

  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> inos_to_free;
  inos_to_free.insert(session->pending_prealloc_inos);
  inos_to_free.insert(session->free_prealloc_inos);
  if (inos_to_free.size()) {
    mds->inotable->project_release_ids(inos_to_free);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
  auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
                                      session->delegated_inos, mdlog->get_current_segment(), on_safe);
  mdlog->start_submit_entry(le, fin);
  mdlog->flush();

  // clean up requests, too
  while (!session->requests.empty()) {
    auto mdr = MDRequestRef(*session->requests.begin());
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}

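/*
 * Begin the reconnect phase: every open client session joins
 * client_reconnect_gather, and reconnect_done_ runs once the gather
 * finishes (see reconnect_gather_finish()).
 */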
void Server::reconnect_clients(MDSContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;

  auto now = clock::now();
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto session : sessions) {
    if (session->is_open()) {
      client_reconnect_gather.insert(session->get_client());
      session->set_reconnecting(true);
      session->last_cap_renew = now;
    }
  }

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = now;
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

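/*
 * Handle a client's reconnect message. The attempt is denied if this MDS is
 * no longer in the reconnect phase (or mds_deny_all_reconnect is set), or if
 * the session is unusable or lacks required features; otherwise the session
 * is re-established and its snaprealms and caps are recovered.
 */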
9f95a23c 1395void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
7c673cae 1396{
11fdf7f2
TL
1397 dout(7) << "handle_client_reconnect " << m->get_source()
1398 << (m->has_more() ? " (more)" : "") << dendl;
7c673cae 1399 client_t from = m->get_source().num();
94b18763 1400 Session *session = mds->get_session(m);
92f5a8d4
TL
1401 if (!session) {
1402 dout(0) << " ignoring sessionless msg " << *m << dendl;
9f95a23c 1403 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
92f5a8d4
TL
1404 reply->metadata["error_string"] = "sessionless";
1405 mds->send_message(reply, m->get_connection());
81eedcae 1406 return;
92f5a8d4
TL
1407 }
1408
1409 if (!session->is_open()) {
1410 dout(0) << " ignoring msg from not-open session" << *m << dendl;
9f95a23c 1411 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
92f5a8d4
TL
1412 mds->send_message(reply, m->get_connection());
1413 return;
1414 }
7c673cae 1415
f67539c2
TL
1416 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1417
7c673cae
FG
1418 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1419 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1420 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1421 return;
1422 }
1423
f64942e4 1424 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
7c673cae
FG
1425 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1426
1427 bool deny = false;
f67539c2 1428 if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
7c673cae 1429 // XXX maybe in the future we can do better than this?
f67539c2
TL
1430 if (reconnect_all_deny) {
1431 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
1432 } else {
1433 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1434 }
7c673cae
FG
1435 mds->clog->info() << "denied reconnect attempt (mds is "
1436 << ceph_mds_state_name(mds->get_state())
1437 << ") from " << m->get_source_inst()
11fdf7f2 1438 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
7c673cae 1439 deny = true;
11fdf7f2
TL
1440 } else {
1441 std::string error_str;
1442 if (!session->is_open()) {
1443 error_str = "session is closed";
1444 } else if (mdcache->is_readonly()) {
1445 error_str = "mds is readonly";
1446 } else {
1447 if (session->info.client_metadata.features.empty())
1448 infer_supported_features(session, session->info.client_metadata);
1449
1450 feature_bitset_t missing_features = required_client_features;
1451 missing_features -= session->info.client_metadata.features;
1452 if (!missing_features.empty()) {
f67539c2
TL
1453 CachedStackStringStream css;
1454 *css << "missing required features '" << missing_features << "'";
1455 error_str = css->strv();
11fdf7f2
TL
1456 }
1457 }
1458
1459 if (!error_str.empty()) {
1460 deny = true;
1461 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1462 mds->clog->info() << "denied reconnect attempt from "
1463 << m->get_source_inst() << " (" << error_str << ")";
1464 }
7c673cae
FG
1465 }
1466
1467 if (deny) {
9f95a23c 1468 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
11fdf7f2 1469 mds->send_message_client(r, session);
f67539c2
TL
1470 if (session->is_open()) {
1471 client_reconnect_denied.insert(session->get_client());
1472 }
7c673cae
FG
1473 return;
1474 }
1475
11fdf7f2 1476 if (!m->has_more()) {
f67539c2 1477 metrics_handler->add_session(session);
11fdf7f2 1478 // notify client of success with an OPEN
9f95a23c 1479 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
11fdf7f2
TL
1480 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
1481 reply->supported_features = supported_features;
1482 mds->send_message_client(reply, session);
1483 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1484 }
1485
91327a77 1486 session->last_cap_renew = clock::now();
7c673cae
FG
1487
1488 // snaprealms
11fdf7f2
TL
1489 for (const auto &r : m->realms) {
1490 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
7c673cae
FG
1491 if (in && in->state_test(CInode::STATE_PURGING))
1492 continue;
1493 if (in) {
11fdf7f2
TL
1494 if (in->snaprealm) {
1495 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
7c673cae 1496 } else {
11fdf7f2
TL
1497 // this can happen if we are non-auth or we rollback snaprealm
1498 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
7c673cae 1499 }
11fdf7f2 1500 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
7c673cae 1501 } else {
11fdf7f2
TL
1502 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1503 << " seq " << r.realm.seq << dendl;
1504 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
7c673cae
FG
1505 }
1506 }
1507
1508 // caps
11fdf7f2 1509 for (const auto &p : m->caps) {
7c673cae 1510 // make sure our last_cap_id is MAX over all issued caps
11fdf7f2
TL
1511 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1512 mdcache->last_cap_id = p.second.capinfo.cap_id;
7c673cae 1513
11fdf7f2 1514 CInode *in = mdcache->get_inode(p.first);
7c673cae
FG
1515 if (in && in->state_test(CInode::STATE_PURGING))
1516 continue;
1517 if (in && in->is_auth()) {
1518 // we recovered it, and it's ours. take note.
11fdf7f2 1519 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
7c673cae 1520 << " on " << *in << dendl;
11fdf7f2
TL
1521 in->reconnect_cap(from, p.second, session);
1522 mdcache->add_reconnected_cap(from, p.first, p.second);
1523 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
7c673cae
FG
1524 continue;
1525 }
1526
1527 if (in && !in->is_auth()) {
1528 // not mine.
1529 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1530 // add to cap export list.
11fdf7f2
TL
1531 mdcache->rejoin_export_caps(p.first, from, p.second,
1532 in->authority().first, true);
7c673cae
FG
1533 } else {
1534 // don't know if the inode is mine
11fdf7f2
TL
1535 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1536 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
7c673cae
FG
1537 }
1538 }
1539
f64942e4
AA
1540 reconnect_last_seen = clock::now();
1541
11fdf7f2
TL
1542 if (!m->has_more()) {
1543 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1544
1545 // remove from gather set
1546 client_reconnect_gather.erase(from);
92f5a8d4 1547 session->set_reconnecting(false);
11fdf7f2
TL
1548 if (client_reconnect_gather.empty())
1549 reconnect_gather_finish();
1550 }
1551}
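
// An illustrative sketch (standalone, simplified types; not part of the MDS
// source) of the missing-feature check used during reconnect above: required
// bits that are absent from the client's advertised feature bitset form a
// non-empty "missing" set, which denies the reconnect.
#if 0
#include <bitset>

using feature_set = std::bitset<64>;

// Mirrors: missing = required_client_features - session features.
feature_set missing_features(const feature_set& required,
                             const feature_set& client)
{
  return required & ~client;  // set difference on bitsets
}
// The reconnect is denied when missing_features(...).any() is true.
#endif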

void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
{
  int supported = -1;
  auto it = client_metadata.find("ceph_version");
  if (it != client_metadata.end()) {
    // user-space client
    if (it->second.compare(0, 16, "ceph version 12.") == 0)
      supported = CEPHFS_FEATURE_LUMINOUS;
    else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
      supported = CEPHFS_FEATURE_KRAKEN;
  } else {
    it = client_metadata.find("kernel_version");
    if (it != client_metadata.end()) {
      // kernel client
      if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
	supported = CEPHFS_FEATURE_LUMINOUS;
    }
  }
  if (supported == -1 &&
      session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
    supported = CEPHFS_FEATURE_JEWEL;

  if (supported >= 0) {
    unsigned long value = (1UL << (supported + 1)) - 1;
    client_metadata.features = feature_bitset_t(value);
    dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
  }
}
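
// A small worked example (editorial sketch, standalone) of the mask computed
// above: an inferred feature ordinal N yields a value with bits 0..N all set,
// i.e. "this client has every feature up to and including N".
#if 0
#include <cassert>

int main()
{
  unsigned supported = 2;                        // hypothetical ordinal
  unsigned long value = (1UL << (supported + 1)) - 1;
  assert(value == 0b111UL);                      // bits 0, 1 and 2 set
  return 0;
}
#endif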

void Server::update_required_client_features()
{
  required_client_features = mds->mdsmap->get_required_client_features();
  dout(7) << "required_client_features: " << required_client_features << dendl;

  if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
    set<Session*> sessions;
    mds->sessionmap.get_client_session_set(sessions);
    for (auto session : sessions) {
      feature_bitset_t missing_features = required_client_features;
      missing_features -= session->info.client_metadata.features;
      if (!missing_features.empty()) {
	bool blocklisted = mds->objecter->with_osdmap(
	    [session](const OSDMap &osd_map) -> bool {
	      return osd_map.is_blocklisted(session->info.inst.addr);
	    });
	if (blocklisted)
	  continue;

	mds->clog->warn() << "evicting session " << *session << ", missing required features '"
			  << missing_features << "'";
	CachedStackStringStream css;
	mds->evict_client(session->get_client().v, false,
			  g_conf()->mds_session_blocklist_on_evict, *css);
      }
    }
  }
}

void Server::reconnect_gather_finish()
{
  dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
  ceph_assert(reconnect_done);

  if (!mds->snapclient->is_synced()) {
    // make sure snaptable cache is populated. snaprealms will be
    // extensively used in rejoin stage.
    dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
    mds->snapclient->wait_for_sync(reconnect_done);
  } else {
    reconnect_done->complete(0);
  }
  reconnect_done = NULL;
}
1626
1627void Server::reconnect_tick()
1628{
f67539c2 1629 bool reject_all_reconnect = false;
31f18b77 1630 if (reconnect_evicting) {
f64942e4 1631 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
31f18b77
FG
1632 return;
1633 }
1634
f67539c2
TL
1635 /*
1636 * Set mds_deny_all_reconnect to reject all the reconnect req ,
1637 * then load less meta information in rejoin phase. This will shorten reboot time.
1638 * Moreover, loading less meta increases the chance standby with less memory can failover.
1639
1640 * Why not shorten reconnect period?
1641 * Clients may send unsafe or retry requests, which haven't been
1642 * completed before old mds stop, to new mds. These requests may
1643 * need to be processed during new mds's clientreplay phase,
1644 * see: #https://github.com/ceph/ceph/pull/29059.
1645 */
1646 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
f64942e4
AA
1647 if (client_reconnect_gather.empty())
1648 return;
31f18b77 1649
f67539c2
TL
1650 if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
1651 reject_all_reconnect = true;
1652
f64942e4
AA
1653 auto now = clock::now();
1654 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
f67539c2 1655 if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
f64942e4 1656 return;
31f18b77 1657
f64942e4
AA
1658 vector<Session*> remaining_sessions;
1659 remaining_sessions.reserve(client_reconnect_gather.size());
1660 for (auto c : client_reconnect_gather) {
1661 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1662 ceph_assert(session);
1663 remaining_sessions.push_back(session);
1664 // client re-sends cap flush messages before the reconnect message
1665 if (session->last_seen > reconnect_last_seen)
1666 reconnect_last_seen = session->last_seen;
1667 }
31f18b77 1668
f64942e4 1669 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
f67539c2 1670 if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
f64942e4
AA
1671 dout(7) << "reconnect_tick: last seen " << elapse2
1672 << " seconds ago, extending reconnect interval" << dendl;
1673 return;
1674 }
1675
1676 dout(7) << "reconnect timed out, " << remaining_sessions.size()
f67539c2 1677 << " clients have not reconnected in time" << dendl;
f64942e4 1678
f67539c2 1679 // If we're doing blocklist evictions, use this to wait for them before
f64942e4
AA
1680 // proceeding to reconnect_gather_finish
1681 MDSGatherBuilder gather(g_ceph_context);
1682
1683 for (auto session : remaining_sessions) {
11fdf7f2
TL
1684 // Keep sessions that have specified timeout. These sessions will prevent
1685 // mds from going to active. MDS goes to active after they all have been
1686 // killed or reclaimed.
1687 if (session->info.client_metadata.find("timeout") !=
1688 session->info.client_metadata.end()) {
1689 dout(1) << "reconnect keeps " << session->info.inst
1690 << ", need to be reclaimed" << dendl;
1691 client_reclaim_gather.insert(session->get_client());
1692 continue;
1693 }
1694
f64942e4 1695 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
31f18b77 1696
f64942e4
AA
1697 mds->clog->warn() << "evicting unresponsive client " << *session
1698 << ", after waiting " << elapse1
1699 << " seconds during MDS startup";
1700
f67539c2
TL
1701 // make _session_logged() purge orphan objects of lost async/unsafe requests
1702 session->delegated_inos.swap(session->free_prealloc_inos);
1703
1704 if (g_conf()->mds_session_blocklist_on_timeout) {
1705 CachedStackStringStream css;
1706 mds->evict_client(session->get_client().v, false, true, *css,
f64942e4 1707 gather.new_sub());
31f18b77 1708 } else {
f67539c2 1709 kill_session(session, NULL);
31f18b77 1710 }
f64942e4
AA
1711
1712 failed_reconnects++;
1713 }
1714 client_reconnect_gather.clear();
f67539c2 1715 client_reconnect_denied.clear();
f64942e4
AA
1716
1717 if (gather.has_subs()) {
1718 dout(1) << "reconnect will complete once clients are evicted" << dendl;
9f95a23c 1719 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
f64942e4
AA
1720 [this](int r){reconnect_gather_finish();})));
1721 gather.activate();
1722 reconnect_evicting = true;
1723 } else {
1724 reconnect_gather_finish();
7c673cae
FG
1725 }
1726}
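
// An illustrative sketch (not part of the MDS source; names are hypothetical)
// of the two-timer policy reconnect_tick() implements above: the window only
// expires once the overall timeout has elapsed AND no straggler has been
// heard from within half the timeout; deny mode short-circuits both timers.
#if 0
struct ReconnectWindow {
  double timeout;          // e.g. mds_reconnect_timeout, in seconds
  double since_start;      // seconds since reconnect began (elapse1)
  double since_last_seen;  // seconds since a straggler last spoke (elapse2)

  // Returns true when reconnect_tick() would give up on the stragglers.
  bool expired(bool reject_all) const {
    if (reject_all)
      return true;                          // all reconnects already denied
    if (since_start < timeout)
      return false;                         // overall window still open
    return since_last_seen >= timeout / 2;  // recent activity extends the window
  }
};
#endif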

void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
{
  if (!locks.length()) return;
  int numlocks;
  ceph_filelock lock;
  auto p = locks.cbegin();
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;
    in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
  }
  decode(numlocks, p);
  for (int i = 0; i < numlocks; ++i) {
    decode(lock, p);
    lock.client = client;
    in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
    ++in->get_flock_lock_state()->client_held_lock_counts[client];
  }
}
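
// A minimal sketch (an assumption about the producer, not the client's actual
// encoder) of the wire layout recover_filelocks() expects above: a count
// followed by that many fcntl locks, then a count followed by that many
// flock locks.
#if 0
void encode_filelocks(const std::vector<ceph_filelock>& fcntl_locks,
                      const std::vector<ceph_filelock>& flock_locks,
                      bufferlist& bl)
{
  int n = fcntl_locks.size();
  encode(n, bl);                  // fcntl section: count, then records
  for (const auto& l : fcntl_locks)
    encode(l, bl);
  n = flock_locks.size();
  encode(n, bl);                  // flock section: count, then records
  for (const auto& l : flock_locks)
    encode(l, bl);
}
#endif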

/**
 * Call this when the MDCache is oversized, to send requests to the clients
 * to trim some caps, and consequently unpin some inodes in the MDCache so
 * that it can trim too.
 */
std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
{
  const auto now = clock::now();
  const bool steady = !!(flags&RecallFlags::STEADY);
  const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
  const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
  const bool trim = !!(flags&RecallFlags::TRIM);

  const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
  const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
  const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
  const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
  const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");

  dout(7) << __func__ << ":"
          << " min=" << min_caps_per_client
          << " max=" << max_caps_per_client
          << " total=" << Capability::count()
          << " flags=" << flags
          << dendl;

  /* trim caps of sessions with the most caps first */
  std::multimap<uint64_t, Session*> caps_session;
  auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
    auto num_caps = s->caps.size();
    auto cache_liveness = s->get_session_cache_liveness();
    if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
      caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
    }
  };
  mds->sessionmap.get_client_sessions(std::move(f));

  std::pair<bool, uint64_t> result = {false, 0};
  auto& [throttled, caps_recalled] = result;
  last_recall_state = now;
  for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
    if (!session->is_open() ||
        !session->get_connection() ||
        !session->info.inst.name.is_client())
      continue;

    dout(10) << __func__ << ":"
             << " session " << session->info.inst
             << " caps " << num_caps
             << ", leases " << session->leases.size()
             << dendl;

    uint64_t newlim;
    if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
      newlim = min_caps_per_client;
    } else {
      newlim = num_caps-recall_max_caps;
    }
    if (num_caps > newlim) {
      /* now limit the number of caps we recall at a time to prevent overloading ourselves */
      uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
      newlim = num_caps-recall;
      const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
      const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
      const uint64_t global_recall_throttle = recall_throttle.get();
      if (session_recall_throttle+recall > recall_max_decay_threshold) {
        dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
        throttled = true;
        continue;
      } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
        dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
        throttled = true;
        continue;
      } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
        dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
        throttled = true;
        break;
      }

      // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
      if (steady) {
        const auto session_recall = session->get_recall_caps();
        const auto session_release = session->get_release_caps();
        if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
          /* The session has been unable to keep up with the number of caps
           * recalled (by half); additionally, to prevent marking sessions
           * we've just begun to recall from, the session_recall counter
           * (decayed count of caps recently recalled) is **greater** than the
           * session threshold for the session's cap recall throttle.
           */
          dout(15) << " 2*session_release < session_recall"
                      " (2*" << session_release << " < " << session_recall << ") &&"
                      " 2*session_recall > recall_max_decay_threshold"
                      " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
                      " Skipping because we are unlikely to get more released." << dendl;
          continue;
        } else if (recall < recall_max_caps && 2*recall < session_recall) {
          /* The number of caps recalled is less than the number we *could*
           * recall (so there isn't much left to recall?) and is less than
           * half the current recall_caps counter (decayed count of caps
           * recently recalled).
           */
          dout(15) << " 2*recall < session_recall"
                      " (2*" << recall << " < " << session_recall << ") &&"
                      " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
                      " Skipping because we are unlikely to get more released." << dendl;
          continue;
        }
      }

      dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;

      auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
      m->head.max_caps = newlim;
      mds->send_message_client(m, session);
      if (gather) {
        flush_session(session, *gather);
      }
      caps_recalled += session->notify_recall_sent(newlim);
      recall_throttle.hit(recall);
    }
  }

  dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;

  return result;
}
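
// An illustrative sketch (editorial, not the DecayCounter API itself) of the
// three-level throttle gate above: a recall proceeds only if it would not
// push the per-session, second-order, or global decayed counters past their
// thresholds. All names here are hypothetical.
#if 0
#include <cstdint>

struct RecallGate {
  uint64_t session_counter;    // decayed caps recently asked of this session
  uint64_t session_counter2o;  // second-order (rate-of-change) counter
  uint64_t global_counter;     // decayed caps recently asked of all sessions
  uint64_t session_max, session2o_max, global_max;

  // Mirrors the skip/break decisions in recall_client_state(): the first two
  // overruns skip just this session; the global overrun stops the whole pass.
  enum class Verdict { Proceed, SkipSession, StopPass };
  Verdict check(uint64_t recall) const {
    if (session_counter + recall > session_max)     return Verdict::SkipSession;
    if (session_counter2o + recall > session2o_max) return Verdict::SkipSession;
    if (global_counter + recall > global_max)       return Verdict::StopPass;
    return Verdict::Proceed;
  }
};
#endif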

void Server::force_clients_readonly()
{
  dout(10) << "force_clients_readonly" << dendl;
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (!session->info.inst.name.is_client() ||
	!(session->is_open() || session->is_stale()))
      continue;
    mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
  }
}

/*******
 * some generic stuff for finishing off requests
 */
void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
{
  dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
  ceph_assert(!mdr->has_completed);

  // note trace items for eventual reply.
  mdr->tracei = in;
  if (in)
    mdr->pin(in);

  mdr->tracedn = dn;
  if (dn)
    mdr->pin(dn);

  early_reply(mdr, in, dn);

  mdr->committing = true;
  submit_mdlog_entry(le, fin, mdr, __func__);

  if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
    if (mds->queue_one_replay()) {
      dout(10) << " queued next replay op" << dendl;
    } else {
      dout(10) << " journaled last replay op" << dendl;
    }
  } else if (mdr->did_early_reply)
    mds->locker->drop_rdlocks_for_early_reply(mdr.get());
  else
    mdlog->flush();
}

void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
				std::string_view event)
{
  if (mdr) {
    string event_str("submit entry: ");
    event_str += event;
    mdr->mark_event(event_str);
  }
  mdlog->submit_entry(le, fin);
}

/*
 * send response built from mdr contents and error code; clean up mdr
 */
void Server::respond_to_request(MDRequestRef& mdr, int r)
{
  if (mdr->client_request) {
    if (mdr->is_batch_head()) {
      dout(20) << __func__ << " batch head " << *mdr << dendl;
      mdr->release_batch_op()->respond(r);
    } else {
      reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
    }
  } else if (mdr->internal_op > -1) {
    dout(10) << "respond_to_request on internal request " << mdr << dendl;
    if (!mdr->internal_op_finish)
      ceph_abort_msg("trying to respond to internal op without finisher");
    mdr->internal_op_finish->complete(r);
    mdcache->request_finish(mdr);
  }
}

// record per-op MDS request counts and latency statistics
void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
{
  int code = l_mdss_first;
  switch(req->get_op()) {
  case CEPH_MDS_OP_LOOKUPHASH:
    code = l_mdss_req_lookuphash_latency;
    break;
  case CEPH_MDS_OP_LOOKUPINO:
    code = l_mdss_req_lookupino_latency;
    break;
  case CEPH_MDS_OP_LOOKUPPARENT:
    code = l_mdss_req_lookupparent_latency;
    break;
  case CEPH_MDS_OP_LOOKUPNAME:
    code = l_mdss_req_lookupname_latency;
    break;
  case CEPH_MDS_OP_LOOKUP:
    code = l_mdss_req_lookup_latency;
    break;
  case CEPH_MDS_OP_LOOKUPSNAP:
    code = l_mdss_req_lookupsnap_latency;
    break;
  case CEPH_MDS_OP_GETATTR:
    code = l_mdss_req_getattr_latency;
    break;
  case CEPH_MDS_OP_SETATTR:
    code = l_mdss_req_setattr_latency;
    break;
  case CEPH_MDS_OP_SETLAYOUT:
    code = l_mdss_req_setlayout_latency;
    break;
  case CEPH_MDS_OP_SETDIRLAYOUT:
    code = l_mdss_req_setdirlayout_latency;
    break;
  case CEPH_MDS_OP_SETXATTR:
    code = l_mdss_req_setxattr_latency;
    break;
  case CEPH_MDS_OP_RMXATTR:
    code = l_mdss_req_rmxattr_latency;
    break;
  case CEPH_MDS_OP_READDIR:
    code = l_mdss_req_readdir_latency;
    break;
  case CEPH_MDS_OP_SETFILELOCK:
    code = l_mdss_req_setfilelock_latency;
    break;
  case CEPH_MDS_OP_GETFILELOCK:
    code = l_mdss_req_getfilelock_latency;
    break;
  case CEPH_MDS_OP_CREATE:
    code = l_mdss_req_create_latency;
    break;
  case CEPH_MDS_OP_OPEN:
    code = l_mdss_req_open_latency;
    break;
  case CEPH_MDS_OP_MKNOD:
    code = l_mdss_req_mknod_latency;
    break;
  case CEPH_MDS_OP_LINK:
    code = l_mdss_req_link_latency;
    break;
  case CEPH_MDS_OP_UNLINK:
    code = l_mdss_req_unlink_latency;
    break;
  case CEPH_MDS_OP_RMDIR:
    code = l_mdss_req_rmdir_latency;
    break;
  case CEPH_MDS_OP_RENAME:
    code = l_mdss_req_rename_latency;
    break;
  case CEPH_MDS_OP_MKDIR:
    code = l_mdss_req_mkdir_latency;
    break;
  case CEPH_MDS_OP_SYMLINK:
    code = l_mdss_req_symlink_latency;
    break;
  case CEPH_MDS_OP_LSSNAP:
    code = l_mdss_req_lssnap_latency;
    break;
  case CEPH_MDS_OP_MKSNAP:
    code = l_mdss_req_mksnap_latency;
    break;
  case CEPH_MDS_OP_RMSNAP:
    code = l_mdss_req_rmsnap_latency;
    break;
  case CEPH_MDS_OP_RENAMESNAP:
    code = l_mdss_req_renamesnap_latency;
    break;
  default: ceph_abort();
  }
  logger->tinc(code, lat);
}

void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
{
  if (!g_conf()->mds_early_reply)
    return;

  if (mdr->no_early_reply) {
    dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
    return;
  }

  if (mdr->has_more() && mdr->more()->has_journaled_peers) {
    dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
    return;
  }

  if (mdr->alloc_ino) {
    dout(10) << "early_reply - allocated ino, not allowed" << dendl;
    return;
  }

  const cref_t<MClientRequest> &req = mdr->client_request;
  entity_inst_t client_inst = req->get_source_inst();
  if (client_inst.name.is_mds())
    return;

  if (req->is_replay()) {
    dout(10) << " no early reply on replay op" << dendl;
    return;
  }


  auto reply = make_message<MClientReply>(*req, 0);
  reply->set_unsafe();

  // mark xlocks "done", indicating that we are exposing uncommitted changes.
  //
  // _rename_finish() does not send dentry link/unlink messages to replicas,
  // so do not mark xlocks on dentries "done"; the xlocks prevent dentries
  // with projected linkages from getting new replicas.
  mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);

  dout(10) << "early_reply " << reply->get_result()
	   << " (" << cpp_strerror(reply->get_result())
	   << ") " << *req << dendl;

  if (tracei || tracedn) {
    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());

    set_trace_dist(reply, tracei, tracedn, mdr);
  }

  reply->set_extra_bl(mdr->reply_extra_bl);
  mds->send_message_client(reply, mdr->session);

  mdr->did_early_reply = true;

  mds->logger->inc(l_mds_reply);
  utime_t lat = ceph_clock_now() - req->get_recv_stamp();
  mds->logger->tinc(l_mds_reply_latency, lat);
  if (client_inst.name.is_client()) {
    mds->sessionmap.hit_session(mdr->session);
  }
  perf_gather_op_latency(req, lat);
  dout(20) << "lat " << lat << dendl;

  mdr->mark_event("early_replied");
}
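
// A simplified sketch (hypothetical names, not the real client code) of what
// the unsafe flag set above means to a client: an unsafe reply lets the
// caller proceed immediately, but the request must stay resendable until the
// later safe reply confirms the update was journaled on the MDS.
#if 0
void handle_reply(Request* req, bool reply_is_unsafe)
{
  if (reply_is_unsafe) {
    req->unblock_caller();  // result usable right away
    keep_for_resend(req);   // must survive an MDS failover
  } else {
    req->unblock_caller();  // no-op if an unsafe reply already arrived
    forget(req);            // journaled on the MDS; safe to drop
  }
}
#endif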

/*
 * send given reply
 * include a trace to tracei
 * Clean up mdr
 */
void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
{
  ceph_assert(mdr.get());
  const cref_t<MClientRequest> &req = mdr->client_request;

  dout(7) << "reply_client_request " << reply->get_result()
	  << " (" << cpp_strerror(reply->get_result())
	  << ") " << *req << dendl;

  mdr->mark_event("replying");

  Session *session = mdr->session;

  // note successful request in session map?
  //
  // setfilelock requests are special: they only modify state in MDS memory,
  // and that state is lost when the MDS fails. If a client re-sends a
  // completed setfilelock request, it means the client did not receive the
  // corresponding setfilelock reply, so the MDS should re-execute the request.
  if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
      reply->get_result() == 0 && session) {
    inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
    session->add_completed_request(mdr->reqid.tid, created);
    if (mdr->ls) {
      mdr->ls->touched_sessions.insert(session->info.inst.name);
    }
  }

  // give any preallocated inos to the session
  apply_allocated_inos(mdr, session);

  // get tracei/tracedn from mdr?
  CInode *tracei = mdr->tracei;
  CDentry *tracedn = mdr->tracedn;

  bool is_replay = mdr->client_request->is_replay();
  bool did_early_reply = mdr->did_early_reply;
  entity_inst_t client_inst = req->get_source_inst();

  if (!did_early_reply && !is_replay) {

    mds->logger->inc(l_mds_reply);
    utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
    mds->logger->tinc(l_mds_reply_latency, lat);
    if (session && client_inst.name.is_client()) {
      mds->sessionmap.hit_session(session);
    }
    perf_gather_op_latency(req, lat);
    dout(20) << "lat " << lat << dendl;

    if (tracei)
      mdr->cap_releases.erase(tracei->vino());
    if (tracedn)
      mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
  }

  // drop non-rdlocks before replying, so that we can issue leases
  mdcache->request_drop_non_rdlocks(mdr);

  // reply at all?
  if (session && !client_inst.name.is_mds()) {
    // send reply.
    if (!did_early_reply &&   // don't issue leases if we sent an earlier reply already
	(tracei || tracedn)) {
      if (is_replay) {
	if (tracei)
	  mdcache->try_reconnect_cap(tracei, session);
      } else {
	// include metadata in reply
	set_trace_dist(reply, tracei, tracedn, mdr);
      }
    }

    // We can set the extra bl unconditionally: if it's already been sent in the
    // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
    reply->set_extra_bl(mdr->reply_extra_bl);

    reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
    mds->send_message_client(reply, session);
  }

  if (req->is_queued_for_replay() &&
      (mdr->has_completed || reply->get_result() < 0)) {
    if (reply->get_result() < 0) {
      int r = reply->get_result();
      derr << "reply_client_request: failed to replay " << *req
	   << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
      mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
    }
    mds->queue_one_replay();
  }

  // clean up request
  mdcache->request_finish(mdr);

  // take a closer look at tracei, if it happens to be a remote link
  if (tracei &&
      tracedn &&
      tracedn->get_projected_linkage()->is_remote()) {
    mdcache->eval_remote(tracedn);
  }
}
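
// An illustrative sketch (hypothetical structure, not Session's actual one)
// of the completed-request bookkeeping used above: recording a successful
// write op by tid lets a retried or replayed request be answered without
// re-executing it, and remembering the created ino lets the reply still
// report what was created.
#if 0
#include <cstdint>
#include <map>

struct CompletedRequests {
  std::map<uint64_t, inodeno_t> by_tid;  // tid -> ino created (0 if none)

  void add(uint64_t tid, inodeno_t created) { by_tid[tid] = created; }

  bool have(uint64_t tid, inodeno_t* created) const {
    auto it = by_tid.find(tid);
    if (it == by_tid.end())
      return false;
    *created = it->second;
    return true;  // caller replies from this record instead of re-executing
  }

  // Mirrors trim_completed_requests(): everything below the client's
  // advertised oldest in-flight tid can be forgotten.
  void trim(uint64_t oldest_client_tid) {
    by_tid.erase(by_tid.begin(), by_tid.lower_bound(oldest_client_tid));
  }
};
#endif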

/*
 * pass inode OR dentry (not both, or we may get confused)
 *
 * trace is in reverse order (i.e. root inode comes last)
 */
void Server::set_trace_dist(const ref_t<MClientReply> &reply,
			    CInode *in, CDentry *dn,
			    MDRequestRef& mdr)
{
  // skip doing this for debugging purposes?
  if (g_conf()->mds_inject_traceless_reply_probability &&
      mdr->ls && !mdr->o_trunc &&
      (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
    dout(5) << "deliberately skipping trace for " << *reply << dendl;
    return;
  }

  // inode, dentry, dir, ..., inode
  bufferlist bl;
  mds_rank_t whoami = mds->get_nodeid();
  Session *session = mdr->session;
  snapid_t snapid = mdr->snapid;
  utime_t now = ceph_clock_now();

  dout(20) << "set_trace_dist snapid " << snapid << dendl;

  // realm
  if (snapid == CEPH_NOSNAP) {
    SnapRealm *realm;
    if (in)
      realm = in->find_snaprealm();
    else
      realm = dn->get_dir()->get_inode()->find_snaprealm();
    reply->snapbl = realm->get_snap_trace();
    dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
  }

  // dir + dentry?
  if (dn) {
    reply->head.is_dentry = 1;
    CDir *dir = dn->get_dir();
    CInode *diri = dir->get_inode();

    diri->encode_inodestat(bl, session, NULL, snapid);
    dout(20) << "set_trace_dist added diri " << *diri << dendl;

#ifdef MDS_VERIFY_FRAGSTAT
    if (dir->is_complete())
      dir->verify_fragstat();
#endif
    DirStat ds;
    ds.frag = dir->get_frag();
    ds.auth = dir->get_dir_auth().first;
    if (dir->is_auth() && !forward_all_requests_to_auth)
      dir->get_dist_spec(ds.dist, whoami);

    dir->encode_dirstat(bl, session->info, ds);
    dout(20) << "set_trace_dist added dir " << *dir << dendl;

    encode(dn->get_name(), bl);

    int lease_mask = 0;
    CDentry::linkage_t *dnl = dn->get_linkage(mdr->get_client(), mdr);
    if (dnl->is_primary()) {
      ceph_assert(dnl->get_inode() == in);
      lease_mask = CEPH_LEASE_PRIMARY_LINK;
    } else {
      if (dnl->is_remote())
	ceph_assert(dnl->get_remote_ino() == in->ino());
      else
	ceph_assert(!in);
    }
    mds->locker->issue_client_lease(dn, mdr, lease_mask, now, bl);
    dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
  } else
    reply->head.is_dentry = 0;

  // inode
  if (in) {
    in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
    dout(20) << "set_trace_dist added in " << *in << dendl;
    reply->head.is_target = 1;
  } else
    reply->head.is_target = 0;

  reply->set_trace(bl);
}
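
// A schematic sketch (editorial; the decode_* helpers are hypothetical) of
// the trace bufferlist layout built above, as a decoder would walk it, gated
// by the is_dentry/is_target flags set in the reply head:
#if 0
void decode_trace(bufferlist::const_iterator& p, const MClientReply& reply)
{
  if (reply.head.is_dentry) {
    decode_inodestat(p);  // parent directory inode
    decode_dirstat(p);    // parent directory frag
    std::string dname;
    decode(dname, p);     // dentry name
    decode_lease(p);      // dentry lease issued above
  }
  if (reply.head.is_target)
    decode_inodestat(p);  // the inode the request resolved to
}
#endif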

void Server::handle_client_request(const cref_t<MClientRequest> &req)
{
  dout(4) << "handle_client_request " << *req << dendl;

  if (mds->logger)
    mds->logger->inc(l_mds_request);
  if (logger)
    logger->inc(l_mdss_handle_client_request);

  if (!mdcache->is_open()) {
    dout(5) << "waiting for root" << dendl;
    mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
    return;
  }

  bool sessionclosed_isok = replay_unsafe_with_closed_session;
  // active session?
  Session *session = 0;
  if (req->get_source().is_client()) {
    session = mds->get_session(req);
    if (!session) {
      dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
    } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
	       session->is_closing() ||
	       session->is_killing()) {
      dout(5) << "session closed|closing|killing, dropping" << dendl;
      session = NULL;
    }
    if (!session) {
      if (req->is_queued_for_replay())
	mds->queue_one_replay();
      return;
    }
  }

  // old mdsmap?
  if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
    // send it? hrm, this isn't ideal; they may get a lot of copies if
    // they have a high request rate.
  }

  // completed request?
  bool has_completed = false;
  if (req->is_replay() || req->get_retry_attempt()) {
    ceph_assert(session);
    inodeno_t created;
    if (session->have_completed_request(req->get_reqid().tid, &created)) {
      has_completed = true;
      if (!session->is_open())
	return;
      // Don't send a traceless reply if the completed request created a
      // new inode; treat the request as a lookup request instead.
      if (req->is_replay() ||
	  ((created == inodeno_t() || !mds->is_clientreplay()) &&
	   req->get_op() != CEPH_MDS_OP_OPEN &&
	   req->get_op() != CEPH_MDS_OP_CREATE)) {
	dout(5) << "already completed " << req->get_reqid() << dendl;
	auto reply = make_message<MClientReply>(*req, 0);
	if (created != inodeno_t()) {
	  bufferlist extra;
	  encode(created, extra);
	  reply->set_extra_bl(extra);
	}
	mds->send_message_client(reply, session);

	if (req->is_queued_for_replay())
	  mds->queue_one_replay();

	return;
      }
      if (req->get_op() != CEPH_MDS_OP_OPEN &&
	  req->get_op() != CEPH_MDS_OP_CREATE) {
	dout(10) << " completed request which created new inode " << created
		 << ", convert it to lookup request" << dendl;
	req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
	req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
      }
    }
  }

  // trim completed_request list
  if (req->get_oldest_client_tid() > 0) {
    dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
    ceph_assert(session);
    if (session->trim_completed_requests(req->get_oldest_client_tid())) {
      // Session's 'completed_requests' was dirtied; mark it to be
      // potentially flushed at segment expiry.
      mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);

      if (session->get_num_trim_requests_warnings() > 0 &&
	  session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
	session->reset_num_trim_requests_warnings();
    } else {
      if (session->get_num_completed_requests() >=
	  (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
	session->inc_num_trim_requests_warnings();
	CachedStackStringStream css;
	*css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
	     << req->get_oldest_client_tid() << "), "
	     << session->get_num_completed_requests()
	     << " completed requests recorded in session\n";
	mds->clog->warn() << css->strv();
	dout(20) << __func__ << " " << css->strv() << dendl;
      }
    }
  }

  // register + dispatch
  MDRequestRef mdr = mdcache->request_start(req);
  if (!mdr.get())
    return;

  if (session) {
    mdr->session = session;
    session->requests.push_back(&mdr->item_session_request);
  }

  if (has_completed)
    mdr->has_completed = true;

  // process embedded cap releases?
  //  (only if NOT replay!)
  if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
    client_t client = req->get_source().num();
    for (const auto &r : req->releases) {
      mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
    }
    req->releases.clear();
  }

  dispatch_client_request(mdr);
  return;
}
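
// A small sketch (standalone, editorial) of the escalating warning scheme
// above: each warning doubles the backlog needed to trigger the next one, so
// a client that never advances oldest_client_tid is reported at 1x, 2x, 4x,
// ... of mds_max_completed_requests rather than on every request.
#if 0
#include <cstdint>

bool should_warn(uint64_t completed, uint64_t max_completed, unsigned warnings)
{
  return completed >= (max_completed << warnings);
}
// e.g. with max_completed = 100000: warns at 100000, then 200000, then 400000...
#endif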

void Server::handle_osd_map()
{
  /* Note that we check the OSDMAP_FULL flag directly rather than
   * using osdmap_full_flag(), because we want to know "is the flag set"
   * rather than "does the flag apply to us?" */
  mds->objecter->with_osdmap([this](const OSDMap& o) {
      auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool());
      is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
      dout(7) << __func__ << ": full = " << is_full << " epoch = "
	      << o.get_epoch() << dendl;
    });
}

void Server::dispatch_client_request(MDRequestRef& mdr)
{
  // we shouldn't be waiting on anyone.
  ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());

  if (mdr->killed) {
    dout(10) << "request " << *mdr << " was killed" << dendl;
    // if the mdr is a "batch_op" and it has followers, pick a follower as
    // the new "head of the batch ops" and go on processing the new one.
    if (mdr->is_batch_head()) {
      int mask = mdr->client_request->head.args.getattr.mask;
      auto it = mdr->batch_op_map->find(mask);
      auto new_batch_head = it->second->find_new_head();
      if (!new_batch_head) {
	mdr->batch_op_map->erase(it);
	return;
      }
      mdr = std::move(new_batch_head);
    } else {
      return;
    }
  } else if (mdr->aborted) {
    mdr->aborted = false;
    mdcache->request_kill(mdr);
    return;
  }

  const cref_t<MClientRequest> &req = mdr->client_request;

  if (logger) logger->inc(l_mdss_dispatch_client_request);

  dout(7) << "dispatch_client_request " << *req << dendl;

  if (req->may_write() && mdcache->is_readonly()) {
    dout(10) << " read-only FS" << dendl;
    respond_to_request(mdr, -CEPHFS_EROFS);
    return;
  }
  if (mdr->has_more() && mdr->more()->peer_error) {
    dout(10) << " got error from peers" << dendl;
    respond_to_request(mdr, mdr->more()->peer_error);
    return;
  }

  if (is_full) {
    if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
	req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
	req->get_op() == CEPH_MDS_OP_RMXATTR ||
	req->get_op() == CEPH_MDS_OP_SETXATTR ||
	req->get_op() == CEPH_MDS_OP_CREATE ||
	req->get_op() == CEPH_MDS_OP_SYMLINK ||
	req->get_op() == CEPH_MDS_OP_MKSNAP ||
	((req->get_op() == CEPH_MDS_OP_LINK ||
	  req->get_op() == CEPH_MDS_OP_RENAME) &&
	 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started the peer request
	) {

      dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
      respond_to_request(mdr, -CEPHFS_ENOSPC);
      return;
    } else {
      dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
    }
  }

  switch (req->get_op()) {
  case CEPH_MDS_OP_LOOKUPHASH:
  case CEPH_MDS_OP_LOOKUPINO:
    handle_client_lookup_ino(mdr, false, false);
    break;
  case CEPH_MDS_OP_LOOKUPPARENT:
    handle_client_lookup_ino(mdr, true, false);
    break;
  case CEPH_MDS_OP_LOOKUPNAME:
    handle_client_lookup_ino(mdr, false, true);
    break;

    // inodes ops.
  case CEPH_MDS_OP_LOOKUP:
    handle_client_getattr(mdr, true);
    break;

  case CEPH_MDS_OP_LOOKUPSNAP:
    // lookupsnap does not reference a CDentry; treat it as a getattr
  case CEPH_MDS_OP_GETATTR:
    handle_client_getattr(mdr, false);
    break;

  case CEPH_MDS_OP_SETATTR:
    handle_client_setattr(mdr);
    break;
  case CEPH_MDS_OP_SETLAYOUT:
    handle_client_setlayout(mdr);
    break;
  case CEPH_MDS_OP_SETDIRLAYOUT:
    handle_client_setdirlayout(mdr);
    break;
  case CEPH_MDS_OP_SETXATTR:
    handle_client_setxattr(mdr);
    break;
  case CEPH_MDS_OP_RMXATTR:
    handle_client_removexattr(mdr);
    break;

  case CEPH_MDS_OP_READDIR:
    handle_client_readdir(mdr);
    break;

  case CEPH_MDS_OP_SETFILELOCK:
    handle_client_file_setlock(mdr);
    break;

  case CEPH_MDS_OP_GETFILELOCK:
    handle_client_file_readlock(mdr);
    break;

    // funky.
  case CEPH_MDS_OP_CREATE:
    if (mdr->has_completed)
      handle_client_open(mdr);  // already created.. just open
    else
      handle_client_openc(mdr);
    break;

  case CEPH_MDS_OP_OPEN:
    handle_client_open(mdr);
    break;

    // namespace.
    // no prior locks.
  case CEPH_MDS_OP_MKNOD:
    handle_client_mknod(mdr);
    break;
  case CEPH_MDS_OP_LINK:
    handle_client_link(mdr);
    break;
  case CEPH_MDS_OP_UNLINK:
  case CEPH_MDS_OP_RMDIR:
    handle_client_unlink(mdr);
    break;
  case CEPH_MDS_OP_RENAME:
    handle_client_rename(mdr);
    break;
  case CEPH_MDS_OP_MKDIR:
    handle_client_mkdir(mdr);
    break;
  case CEPH_MDS_OP_SYMLINK:
    handle_client_symlink(mdr);
    break;


    // snaps
  case CEPH_MDS_OP_LSSNAP:
    handle_client_lssnap(mdr);
    break;
  case CEPH_MDS_OP_MKSNAP:
    handle_client_mksnap(mdr);
    break;
  case CEPH_MDS_OP_RMSNAP:
    handle_client_rmsnap(mdr);
    break;
  case CEPH_MDS_OP_RENAMESNAP:
    handle_client_renamesnap(mdr);
    break;

  default:
    dout(1) << " unknown client op " << req->get_op() << dendl;
    respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
  }
}


// ---------------------------------------
// PEER REQUESTS

void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
{
  dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (logger) logger->inc(l_mdss_handle_peer_request);

  // reply?
  if (m->is_reply())
    return handle_peer_request_reply(m);

  // The purpose of rename notify is enforcing causal message ordering: making
  // sure bystanders have received all messages from the rename srcdn's auth MDS.
  if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
    auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
    mds->send_message(reply, m->get_connection());
    return;
  }

  CDentry *straydn = NULL;
  if (m->straybl.length() > 0) {
    mdcache->decode_replica_stray(straydn, m->straybl, from);
    ceph_assert(straydn);
    m->straybl.clear();
  }

  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  // am i a new peer?
  MDRequestRef mdr;
  if (mdcache->have_request(m->get_reqid())) {
    // existing?
    mdr = mdcache->request_get(m->get_reqid());

    // is my request newer?
    if (mdr->attempt > m->get_attempt()) {
      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
	       << ", dropping " << *m << dendl;
      return;
    }

    if (mdr->attempt < m->get_attempt()) {
      // mine is old, close it out
      dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
	       << ", closing out" << dendl;
      mdcache->request_finish(mdr);
      mdr.reset();
    } else if (mdr->peer_to_mds != from) {
      dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
      return;
    }

    // may get these while mdr->peer_request is non-null
    if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
      mds->locker->drop_locks(mdr.get());
      return;
    }
    if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
      if (m->is_abort()) {
	mdr->aborted = true;
	if (mdr->peer_request) {
	  // only abort on-going xlock, wrlock and auth pin
	  ceph_assert(!mdr->peer_did_prepare());
	} else {
	  mdcache->request_finish(mdr);
	}
      } else {
	if (m->inode_export.length() > 0)
	  mdr->more()->inode_import = m->inode_export;
	// finish off request.
	mdcache->request_finish(mdr);
      }
      return;
    }
  }
  if (!mdr.get()) {
    // new?
    if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
      dout(10) << "missing peer request for " << m->get_reqid()
	       << " OP_FINISH, must have lost race with a forward" << dendl;
      return;
    }
    mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
    mdr->set_op_stamp(m->op_stamp);
  }
  ceph_assert(mdr->peer_request == 0); // only one at a time, please!

  if (straydn) {
    mdr->pin(straydn);
    mdr->straydn = straydn;
  }

  if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
      mdr->locks.empty()) {
    dout(3) << "not active yet, waiting" << dendl;
    mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
    return;
  }

  mdr->reset_peer_request(m);

  dispatch_peer_request(mdr);
}

void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    metareqid_t r = m->get_reqid();
    if (!mdcache->have_uncommitted_leader(r, from)) {
      dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
	       << from << " reqid " << r << dendl;
      return;
    }
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
    metareqid_t r = m->get_reqid();
    mdcache->committed_leader_peer(r, from);
    return;
  }

  MDRequestRef mdr = mdcache->request_get(m->get_reqid());
  if (m->get_attempt() != mdr->attempt) {
    dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
	     << m->get_attempt() << dendl;
    return;
  }

  switch (m->get_op()) {
  case MMDSPeerRequest::OP_XLOCKACK:
    {
      // identify lock, leader request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->peers.insert(from);
      lock->decode_locked_state(m->get_lock_data());
      dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
      mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
      mdr->finish_locking(lock);
      lock->get_xlock(mdr, mdr->get_client());

      ceph_assert(mdr->more()->waiting_on_peer.count(from));
      mdr->more()->waiting_on_peer.erase(from);
      ceph_assert(mdr->more()->waiting_on_peer.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSPeerRequest::OP_WRLOCKACK:
    {
      // identify lock, leader request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
					       m->get_object_info());
      mdr->more()->peers.insert(from);
      dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
      auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
      ceph_assert(it->is_remote_wrlock());
      ceph_assert(it->wrlock_target == from);

      mdr->finish_locking(lock);

      ceph_assert(mdr->more()->waiting_on_peer.count(from));
      mdr->more()->waiting_on_peer.erase(from);
      ceph_assert(mdr->more()->waiting_on_peer.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSPeerRequest::OP_AUTHPINACK:
    handle_peer_auth_pin_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_LINKPREPACK:
    handle_peer_link_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RMDIRPREPACK:
    handle_peer_rmdir_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RENAMEPREPACK:
    handle_peer_rename_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RENAMENOTIFYACK:
    handle_peer_rename_notify_ack(mdr, m);
    break;

  default:
    ceph_abort();
  }
}

void Server::dispatch_peer_request(MDRequestRef& mdr)
{
  dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;

  if (mdr->aborted) {
    dout(7) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
    return;
  }

  if (logger) logger->inc(l_mdss_dispatch_peer_request);

  int op = mdr->peer_request->get_op();
  switch (op) {
  case MMDSPeerRequest::OP_XLOCK:
  case MMDSPeerRequest::OP_WRLOCK:
    {
      // identify object
      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
					       mdr->peer_request->get_object_info());

      if (!lock) {
	dout(10) << "don't have object, dropping" << dendl;
	ceph_abort(); // can this happen if we auth pinned properly?
      }
      if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
	dout(10) << "not auth for remote xlock attempt, dropping on "
		 << *lock << " on " << *lock->get_parent() << dendl;
      } else {
	// use acquire_locks so that we get auth_pinning.
	MutationImpl::LockOpVec lov;
	for (const auto& p : mdr->locks) {
	  if (p.is_xlock())
	    lov.add_xlock(p.lock);
	  else if (p.is_wrlock())
	    lov.add_wrlock(p.lock);
	}

	int replycode = 0;
	switch (op) {
	case MMDSPeerRequest::OP_XLOCK:
	  lov.add_xlock(lock);
	  replycode = MMDSPeerRequest::OP_XLOCKACK;
	  break;
	case MMDSPeerRequest::OP_WRLOCK:
	  lov.add_wrlock(lock);
	  replycode = MMDSPeerRequest::OP_WRLOCKACK;
	  break;
	}

	if (!mds->locker->acquire_locks(mdr, lov))
	  return;

	// ack
	auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
	r->set_lock_type(lock->get_type());
	lock->get_parent()->set_object_info(r->get_object_info());
	if (replycode == MMDSPeerRequest::OP_XLOCKACK)
	  lock->encode_locked_state(r->get_lock_data());
	mds->send_message(r, mdr->peer_request->get_connection());
      }

      // done.
      mdr->reset_peer_request();
    }
    break;

  case MMDSPeerRequest::OP_UNXLOCK:
  case MMDSPeerRequest::OP_UNWRLOCK:
    {
      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
					       mdr->peer_request->get_object_info());
      ceph_assert(lock);
      auto it = mdr->locks.find(lock);
      ceph_assert(it != mdr->locks.end());
      bool need_issue = false;
      switch (op) {
      case MMDSPeerRequest::OP_UNXLOCK:
	mds->locker->xlock_finish(it, mdr.get(), &need_issue);
	break;
      case MMDSPeerRequest::OP_UNWRLOCK:
	mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
	break;
      }
      if (need_issue)
	mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));

      // done. no ack necessary.
      mdr->reset_peer_request();
    }
    break;

  case MMDSPeerRequest::OP_AUTHPIN:
    handle_peer_auth_pin(mdr);
    break;

  case MMDSPeerRequest::OP_LINKPREP:
  case MMDSPeerRequest::OP_UNLINKPREP:
    handle_peer_link_prep(mdr);
    break;

  case MMDSPeerRequest::OP_RMDIRPREP:
    handle_peer_rmdir_prep(mdr);
    break;

  case MMDSPeerRequest::OP_RENAMEPREP:
    handle_peer_rename_prep(mdr);
    break;

  default:
    ceph_abort();
  }
}
2960
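
/* Handle an OP_AUTHPIN from the leader MDS: auth pin (and optionally freeze)
 * the requested objects locally, then reply with OP_AUTHPINACK listing what
 * we actually pinned, or with an error/blocked flag. */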
void Server::handle_peer_auth_pin(MDRequestRef& mdr)
{
  dout(10) << "handle_peer_auth_pin " << *mdr << dendl;

  // build list of objects
  list<MDSCacheObject*> objects;
  CInode *auth_pin_freeze = NULL;
  bool nonblocking = mdr->peer_request->is_nonblocking();
  bool fail = false, wouldblock = false, readonly = false;
  ref_t<MMDSPeerRequest> reply;

  if (mdcache->is_readonly()) {
    dout(10) << " read-only FS" << dendl;
    readonly = true;
    fail = true;
  }

  if (!fail) {
    for (const auto &oi : mdr->peer_request->get_authpins()) {
      MDSCacheObject *object = mdcache->get_object(oi);
      if (!object) {
        dout(10) << " don't have " << oi << dendl;
        fail = true;
        break;
      }

      objects.push_back(object);
      if (oi == mdr->peer_request->get_authpin_freeze())
        auth_pin_freeze = static_cast<CInode*>(object);
    }
  }

  // can we auth pin them?
  if (!fail) {
    for (const auto& obj : objects) {
      if (!obj->is_auth()) {
        dout(10) << " not auth for " << *obj << dendl;
        fail = true;
        break;
      }
      if (mdr->is_auth_pinned(obj))
        continue;
      if (!mdr->can_auth_pin(obj)) {
        if (nonblocking) {
          dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
          fail = true;
          wouldblock = true;
          break;
        }
        // wait
        dout(10) << " waiting for authpinnable on " << *obj << dendl;
        obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
        mdr->drop_local_auth_pins();

        mds->locker->notify_freeze_waiter(obj);
        goto blocked;
      }
    }
  }

  if (!fail) {
    /* freeze authpin wrong inode */
    if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
        mdr->more()->rename_inode != auth_pin_freeze)
      mdr->unfreeze_auth_pin(true);

    /* handle_peer_rename_prep() calls freeze_inode() to wait for all other
     * operations on the source inode to complete. This happens after all locks
     * for the rename operation are acquired. But to acquire locks, we need to
     * auth pin the locks' parent objects first. So there is an ABBA deadlock
     * if someone auth pins the source inode after locks are acquired and
     * before Server::handle_peer_rename_prep() is called. The solution is to
     * freeze the inode and prevent other MDRequests from getting new auth pins.
     */
    if (auth_pin_freeze) {
      dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
      if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
        auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
        mds->mdlog->flush();
        goto blocked;
      }
    }
  }

  reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);

  if (fail) {
    mdr->drop_local_auth_pins();  // just in case
    if (readonly)
      reply->mark_error_rofs();
    if (wouldblock)
      reply->mark_error_wouldblock();
  } else {
    // auth pin!
    for (const auto& obj : objects) {
      dout(10) << "auth_pinning " << *obj << dendl;
      mdr->auth_pin(obj);
    }
    // return list of my auth_pins (if any)
    for (const auto &p : mdr->object_states) {
      if (!p.second.auth_pinned)
        continue;
      MDSCacheObjectInfo info;
      p.first->set_object_info(info);
      reply->get_authpins().push_back(info);
      if (p.first == (MDSCacheObject*)auth_pin_freeze)
        auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
    }
  }

  mds->send_message_mds(reply, mdr->peer_to_mds);

  // clean up this request
  mdr->reset_peer_request();
  return;

blocked:
  if (mdr->peer_request->should_notify_blocking()) {
    reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
    reply->mark_req_blocked();
    mds->send_message_mds(reply, mdr->peer_to_mds);
    mdr->peer_request->clear_notify_blocking();
  }
  return;
}
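
/* Handle the OP_AUTHPINACK reply: record which objects the peer now holds
 * auth pins on, clear any pins it dropped, note errors, and re-dispatch the
 * leader request once no more peers are outstanding. */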
void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
{
  dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  if (ack->is_req_blocked()) {
    mdr->disable_lock_cache();
    // peer auth pin is blocked, drop locks to avoid deadlock
    mds->locker->drop_locks(mdr.get(), nullptr);
    return;
  }

  // added auth pins?
  set<MDSCacheObject*> pinned;
  for (const auto &oi : ack->get_authpins()) {
    MDSCacheObject *object = mdcache->get_object(oi);
    ceph_assert(object);  // we pinned it
    dout(10) << " remote has pinned " << *object << dendl;
    mdr->set_remote_auth_pinned(object, from);
    if (oi == ack->get_authpin_freeze())
      mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
    pinned.insert(object);
  }

  // removed frozen auth pin?
  if (mdr->more()->is_remote_frozen_authpin &&
      ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
    auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
    ceph_assert(stat_p);
    if (stat_p->remote_auth_pinned == from) {
      mdr->more()->is_remote_frozen_authpin = false;
    }
  }

  // removed auth pins?
  for (auto& p : mdr->object_states) {
    if (p.second.remote_auth_pinned == MDS_RANK_NONE)
      continue;
    MDSCacheObject* object = p.first;
    if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
      dout(10) << " remote has unpinned " << *object << dendl;
      mdr->_clear_remote_auth_pinned(p.second);
    }
  }

  // note peer
  mdr->more()->peers.insert(from);

  // clear from waiting list
  auto ret = mdr->more()->waiting_on_peer.erase(from);
  ceph_assert(ret);

  if (ack->is_error_rofs()) {
    mdr->more()->peer_error = -CEPHFS_EROFS;
  } else if (ack->is_error_wouldblock()) {
    mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
  }

  // go again?
  if (mdr->more()->waiting_on_peer.empty())
    mdcache->dispatch_request(mdr);
  else
    dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
}

// ---------------------------------------
// HELPERS


/**
 * check whether we are permitted to complete a request
 *
 * Check whether we have permission to perform the operation specified
 * by mask on the given inode, based on the capability in the mdr's
 * session.
 */
bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
{
  if (mdr->session) {
    int r = mdr->session->check_access(
      in, mask,
      mdr->client_request->get_caller_uid(),
      mdr->client_request->get_caller_gid(),
      &mdr->client_request->get_caller_gid_list(),
      mdr->client_request->head.args.setattr.uid,
      mdr->client_request->head.args.setattr.gid);
    if (r < 0) {
      respond_to_request(mdr, r);
      return false;
    }
  }
  return true;
}
/**
 * check whether a dirfrag has room for another entry
 *
 * Respond with CEPHFS_ENOSPC and return false if the fragment already
 * holds mds_bal_fragment_size_max entries or more.
 */
bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
{
  const auto size = in->get_frag_size();
  if (size >= g_conf()->mds_bal_fragment_size_max) {
    dout(10) << "fragment " << *in << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (CEPHFS_ENOSPC)" << dendl;
    respond_to_request(mdr, -CEPHFS_ENOSPC);
    return false;
  }

  return true;
}
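
/* Find (or create) the null stray dentry that an unlinked inode will be
 * renamed into under this MDS's stray directory; returns nullptr if we
 * must wait (frozen stray dir or full fragment). */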
CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
{
  string straydname;
  in->name_stray_dentry(straydname);

  CDentry *straydn = mdr->straydn;
  if (straydn) {
    ceph_assert(straydn->get_name() == straydname);
    return straydn;
  }
  CDir *straydir = mdcache->get_stray_dir(in);

  if (!mdr->client_request->is_replay() &&
      !check_fragment_space(mdr, straydir))
    return nullptr;

  straydn = straydir->lookup(straydname);
  if (!straydn) {
    if (straydir->is_frozen_dir()) {
      dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      return nullptr;
    }
    straydn = straydir->add_null_dentry(straydname);
    straydn->mark_new();
  } else {
    ceph_assert(straydn->get_projected_linkage()->is_null());
  }

  straydn->state_set(CDentry::STATE_STRAY);
  mdr->straydn = straydn;
  mdr->pin(straydn);

  return straydn;
}
/** prepare_new_inode
 *
 * create a new inode.  set c/m/atime.  hit dir pop.
 */
CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
                                  const file_layout_t *layout)
{
  CInode *in = new CInode(mdcache);
  auto _inode = in->_get_inode();

  // Server::prepare_force_open_sessions() can re-open session in closing
  // state. In that corner case, session's prealloc_inos are being freed.
  // To simplify the code, we disallow using/refilling session's prealloc_ino
  // while session is opening.
  bool allow_prealloc_inos = mdr->session->is_open();

  // assign ino
  if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(useino))) {
    mds->sessionmap.mark_projected(mdr->session);
    dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
             << " (" << mdr->session->info.prealloc_inos.size() << " left)"
             << dendl;
  } else {
    mdr->alloc_ino =
      _inode->ino = mds->inotable->project_alloc_id(useino);
    dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
  }

  if (useino && useino != _inode->ino) {
    dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
    mds->clog->error() << mdr->client_request->get_source()
                       << " specified ino " << useino
                       << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
    //ceph_abort(); // just for now.
  }

  if (allow_prealloc_inos &&
      mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
    int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
    mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
    ceph_assert(mdr->prealloc_inos.size());  // or else fix projected increment semantics
    mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
    mds->sessionmap.mark_projected(mdr->session);
    dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
  }

  _inode->version = 1;
  _inode->xattr_version = 1;
  _inode->nlink = 1;   // FIXME

  _inode->mode = mode;

  // FIPS zeroization audit 20191117: this memset is not security related.
  memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
  if (_inode->is_dir()) {
    _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
  } else if (layout) {
    _inode->layout = *layout;
  } else {
    _inode->layout = mdcache->default_file_layout;
  }

  _inode->truncate_size = -1ull;  // not truncated, yet!
  _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */

  CInode *diri = dir->get_inode();

  dout(10) << oct << " dir mode 0" << diri->get_inode()->mode << " new mode 0" << mode << dec << dendl;
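
  // Note: despite the "sticky" wording in the log messages below, this checks
  // the setgid bit: new entries inherit the directory's gid, and new
  // subdirectories inherit S_ISGID as well (POSIX setgid-directory semantics).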
  if (diri->get_inode()->mode & S_ISGID) {
    dout(10) << " dir is sticky" << dendl;
    _inode->gid = diri->get_inode()->gid;
    if (S_ISDIR(mode)) {
      dout(10) << " new dir also sticky" << dendl;
      _inode->mode |= S_ISGID;
    }
  } else
    _inode->gid = mdr->client_request->get_caller_gid();

  _inode->uid = mdr->client_request->get_caller_uid();

  _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
    mdr->get_op_stamp();

  _inode->change_attr = 0;

  const cref_t<MClientRequest> &req = mdr->client_request;
  if (req->get_data().length()) {
    auto p = req->get_data().cbegin();

    // xattrs on new inode?
    auto _xattrs = CInode::allocate_xattr_map();
    decode_noshare(*_xattrs, p);
    dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
    in->reset_xattrs(std::move(_xattrs));
  }

  if (!mds->mdsmap->get_inline_data_enabled() ||
      !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
    _inode->inline_data.version = CEPH_INLINE_NONE;

  mdcache->add_inode(in);  // add
  dout(10) << "prepare_new_inode " << *in << dendl;
  return in;
}
void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
{
  dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
           << " inotablev " << mds->inotable->get_projected_version()
           << dendl;
  blob->set_ino_alloc(mdr->alloc_ino,
                      mdr->used_prealloc_ino,
                      mdr->prealloc_inos,
                      mdr->client_request->get_source(),
                      mds->sessionmap.get_projected(),
                      mds->inotable->get_projected_version());
}
void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
{
  dout(10) << "apply_allocated_inos " << mdr->alloc_ino
           << " / " << mdr->prealloc_inos
           << " / " << mdr->used_prealloc_ino << dendl;

  if (mdr->alloc_ino) {
    mds->inotable->apply_alloc_id(mdr->alloc_ino);
  }
  if (mdr->prealloc_inos.size()) {
    ceph_assert(session);
    session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
    session->free_prealloc_inos.insert(mdr->prealloc_inos);
    session->info.prealloc_inos.insert(mdr->prealloc_inos);
    mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
    mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
  }
  if (mdr->used_prealloc_ino) {
    ceph_assert(session);
    session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
    mds->sessionmap.mark_dirty(session);
  }
}
class C_MDS_TryFindInode : public ServerContext {
  MDRequestRef mdr;
public:
  C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
  void finish(int r) override {
    if (r == -CEPHFS_ESTALE)  // :( find_ino_peers failed
      server->respond_to_request(mdr, r);
    else
      server->dispatch_client_request(mdr);
  }
};
/* If this returns null, the request has been handled
 * as appropriate: forwarded on, or the client's been replied to */
CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
                                    bool want_auth,
                                    bool no_want_auth)
{
  const filepath& refpath = mdr->get_filepath();
  dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;

  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
    return mdr->in[0];

  // traverse
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
  int flags = 0;
  if (refpath.is_last_snap()) {
    if (!no_want_auth)
      want_auth = true;
  } else {
    if (!no_want_auth && forward_all_requests_to_auth)
      want_auth = true;
    flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
  }
  if (want_auth)
    flags |= MDS_TRAVERSE_WANT_AUTH;
  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
  if (r > 0)
    return nullptr; // delayed
  if (r < 0) {  // error
    if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
      if (mdr->client_request &&
          mdr->client_request->get_dentry_wanted())
        mdr->tracedn = mdr->dn[0].back();
      respond_to_request(mdr, r);
    } else if (r == -CEPHFS_ESTALE) {
      dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
      MDSContext *c = new C_MDS_TryFindInode(this, mdr);
      mdcache->find_ino_peers(refpath.get_ino(), c);
    } else {
      dout(10) << "FAIL on error " << r << dendl;
      respond_to_request(mdr, r);
    }
    return nullptr;
  }
  CInode *ref = mdr->in[0];
  dout(10) << "ref is " << *ref << dendl;

  if (want_auth) {
    // auth_pin?
    //   do NOT proceed if freezing, as cap release may defer in that case, and
    //   we could deadlock when we try to lock @ref.
    // if we're already auth_pinned, continue; the release has already been processed.
    if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
        (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
      dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
      ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
      if (mdr->is_any_remote_auth_pin())
        mds->locker->notify_freeze_waiter(ref);
      return nullptr;
    }
    mdr->auth_pin(ref);
  }

  // set and pin ref
  mdr->pin(ref);
  return ref;
}
/** rdlock_path_xlock_dentry
 * traverse path to the directory that could/would contain dentry.
 * make sure i am auth for that dentry, forward as necessary.
 * create null dentry in place (or use existing if okexist).
 * get rdlocks on traversed dentries, xlock on new dentry.
 */
CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
                                          bool create, bool okexist, bool want_layout)
{
  const filepath& refpath = mdr->get_filepath();
  dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;

  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
    return mdr->dn[0].back();

  // figure parent dir vs dname
  if (refpath.depth() == 0) {
    dout(7) << "invalid path (zero length)" << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return nullptr;
  }

  if (refpath.is_last_snap()) {
    respond_to_request(mdr, -CEPHFS_EROFS);
    return nullptr;
  }

  if (refpath.is_last_dot_or_dotdot()) {
    dout(7) << "invalid path (last dot or dot_dot)" << dendl;
    if (create)
      respond_to_request(mdr, -CEPHFS_EEXIST);
    else
      respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
    return nullptr;
  }

  // traverse to parent dir
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
  int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
              MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
              MDS_TRAVERSE_WANT_AUTH;
  if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
    flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
  if (create)
    flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
  if (want_layout)
    flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
  if (r > 0)
    return nullptr; // delayed
  if (r < 0) {
    if (r == -CEPHFS_ESTALE) {
      dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
      mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
      return nullptr;
    }
    respond_to_request(mdr, r);
    return nullptr;
  }

  CDentry *dn = mdr->dn[0].back();
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();

  if (!mdr->reqid.name.is_mds()) {
    if (diri->is_system() && !diri->is_root()) {
      respond_to_request(mdr, -CEPHFS_EROFS);
      return nullptr;
    }
  }

  if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return nullptr;
  }

  CDentry::linkage_t *dnl = dn->get_projected_linkage();
  if (dnl->is_null()) {
    if (!create && okexist) {
      respond_to_request(mdr, -CEPHFS_ENOENT);
      return nullptr;
    }

    snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
    dn->first = std::max(dn->first, next_snap);
  } else {
    if (!okexist) {
      respond_to_request(mdr, -CEPHFS_EEXIST);
      return nullptr;
    }
    mdr->in[0] = dnl->get_inode();
  }

  return dn;
}
/** rdlock_two_paths_xlock_destdn
 * traverse two paths and lock the two paths in proper order.
 * The order of taking locks is:
 * 1. Lock directory inodes or dentries according to which trees they
 *    are under. Lock objects under fs root before objects under mdsdir.
 * 2. Lock directory inodes or dentries according to their depth, in
 *    ascending order.
 * 3. Lock directory inodes or dentries according to inode numbers or
 *    dentries' parent inode numbers, in ascending order.
 * 4. Lock dentries in the same directory in order of their keys.
 * 5. Lock non-directory inodes according to inode numbers, in ascending
 *    order.
 */
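/* Illustration (hypothetical paths, not taken from the code below): for a
 * rename with srcdn "/d1/a" and destdn "/d1/b", both parents are the same
 * inode, so only rule 4 applies and the dentry with the smaller key ("a")
 * is locked first. Across different directories, compare_paths() applies
 * rules 1-3 to decide whose filelock/nestlock and dentry are locked first. */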
std::pair<CDentry*, CDentry*>
Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
{
  const filepath& refpath = mdr->get_filepath();
  const filepath& refpath2 = mdr->get_filepath2();

  dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;

  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
    return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());

  if (refpath.depth() != 1 || refpath2.depth() != 1) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
  }

  if (refpath.is_last_snap() || refpath2.is_last_snap()) {
    respond_to_request(mdr, -CEPHFS_EROFS);
    return std::make_pair(nullptr, nullptr);
  }

  // traverse to parent dir
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
  int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
  if (r != 0) {
    if (r == -CEPHFS_ESTALE) {
      dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
      mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
    } else if (r < 0) {
      respond_to_request(mdr, r);
    }
    return std::make_pair(nullptr, nullptr);
  }

  flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
  r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
  if (r != 0) {
    if (r == -CEPHFS_ESTALE) {
      dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
      mdcache->find_ino_peers(refpath2.get_ino(), new C_MDS_TryFindInode(this, mdr));
    } else if (r < 0) {
      respond_to_request(mdr, r);
    }
    return std::make_pair(nullptr, nullptr);
  }

  CDentry *srcdn = mdr->dn[1].back();
  CDir *srcdir = srcdn->get_dir();
  CDentry *destdn = mdr->dn[0].back();
  CDir *destdir = destdn->get_dir();

  if (!mdr->reqid.name.is_mds()) {
    if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
        (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
      respond_to_request(mdr, -CEPHFS_EROFS);
      return std::make_pair(nullptr, nullptr);
    }
  }

  if (!destdir->get_inode()->is_base() &&
      destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return std::make_pair(nullptr, nullptr);
  }

  MutationImpl::LockOpVec lov;
  if (srcdir->get_inode() == destdir->get_inode()) {
    lov.add_wrlock(&destdir->inode->filelock);
    lov.add_wrlock(&destdir->inode->nestlock);
    if (xlock_srcdn && srcdir != destdir) {
      mds_rank_t srcdir_auth = srcdir->authority().first;
      if (srcdir_auth != mds->get_nodeid()) {
        lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
        lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
      }
    }

    if (srcdn->get_name() > destdn->get_name())
      lov.add_xlock(&destdn->lock);

    if (xlock_srcdn)
      lov.add_xlock(&srcdn->lock);
    else
      lov.add_rdlock(&srcdn->lock);

    if (srcdn->get_name() < destdn->get_name())
      lov.add_xlock(&destdn->lock);
  } else {
    int cmp = mdr->compare_paths();
    bool lock_destdir_first =
      (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));

    if (lock_destdir_first) {
      lov.add_wrlock(&destdir->inode->filelock);
      lov.add_wrlock(&destdir->inode->nestlock);
      lov.add_xlock(&destdn->lock);
    }

    if (xlock_srcdn) {
      mds_rank_t srcdir_auth = srcdir->authority().first;
      if (srcdir_auth == mds->get_nodeid()) {
        lov.add_wrlock(&srcdir->inode->filelock);
        lov.add_wrlock(&srcdir->inode->nestlock);
      } else {
        lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
        lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
      }
      lov.add_xlock(&srcdn->lock);
    } else {
      lov.add_rdlock(&srcdn->lock);
    }

    if (!lock_destdir_first) {
      lov.add_wrlock(&destdir->inode->filelock);
      lov.add_wrlock(&destdir->inode->nestlock);
      lov.add_xlock(&destdn->lock);
    }
  }

  CInode *auth_pin_freeze = nullptr;
  // XXX any better way to do this?
  if (xlock_srcdn && !srcdn->is_auth()) {
    CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
    auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
  }
  if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
    return std::make_pair(nullptr, nullptr);

  if (srcdn->get_projected_linkage()->is_null()) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return std::make_pair(nullptr, nullptr);
  }

  if (destdn->get_projected_linkage()->is_null()) {
    snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
    destdn->first = std::max(destdn->first, next_snap);
  }

  mdr->locking_state |= MutationImpl::PATH_LOCKED;

  return std::make_pair(destdn, srcdn);
}
/**
 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
 *
 * @param diri base inode
 * @param fg the exact frag we want
 * @param mdr request
 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
 */
CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
{
  CDir *dir = diri->get_dirfrag(fg);

  if (dir) {
    // am i auth for the dirfrag?
    if (!dir->is_auth()) {
      mds_rank_t auth = dir->authority().first;
      dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
              << ", fw to mds." << auth << dendl;
      mdcache->request_forward(mdr, auth);
      return nullptr;
    }
  } else {
    // not open and inode not mine?
    if (!diri->is_auth()) {
      mds_rank_t inauth = diri->authority().first;
      dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
      mdcache->request_forward(mdr, inauth);
      return nullptr;
    }

    // not open and inode frozen?
    if (diri->is_frozen()) {
      dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
      ceph_assert(diri->get_parent_dir());
      diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      return nullptr;
    }

    // invent?
    dir = diri->get_or_open_dirfrag(mdcache, fg);
  }

  return dir;
}

// ===============================================================================
// STAT

void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  if (req->get_filepath().depth() == 0 && is_lookup) {
    // refpath can't be empty for lookup but it can for
    // getattr (we do getattr with empty refpath for mount of '/')
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  bool want_auth = false;
  int mask = req->head.args.getattr.mask;
  if (mask & CEPH_STAT_RSTAT)
    want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
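
  // Batch concurrent getattr/lookup requests on the same inode or dentry:
  // the first request with a given mask becomes the batch head; later ones
  // are queued on it (via Batch_Getattr_Lookup) and are all answered when
  // the head's reply is ready.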
  if (!mdr->is_batch_head() && mdr->can_batch()) {
    CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
    int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
                                   (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
                                   &mdr->dn[0], &mdr->in[0]);
    if (r > 0)
      return; // delayed

    if (r < 0) {
      // fall-thru. let rdlock_path_pin_ref() check again.
    } else if (is_lookup) {
      CDentry* dn = mdr->dn[0].back();
      mdr->pin(dn);
      auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
      if (em.second) {
        em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
      } else {
        dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
        em.first->second->add_request(mdr);
        return;
      }
    } else {
      CInode *in = mdr->in[0];
      mdr->pin(in);
      auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
      if (em.second) {
        em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
      } else {
        dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
        em.first->second->add_request(mdr);
        return;
      }
    }
  }
  CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
  if (!ref)
    return;

  mdr->getattr_caps = mask;

  /*
   * if client currently holds the EXCL cap on a field, do not rdlock
   * it; client's stat() will result in valid info if _either_ EXCL
   * cap is held or MDS rdlocks and reads the value here.
   *
   * handling this case here is easier than weakening rdlock
   * semantics... that would cause problems elsewhere.
   */
  client_t client = mdr->get_client();
  int issued = 0;
  Capability *cap = ref->get_client_cap(client);
  if (cap && (mdr->snapid == CEPH_NOSNAP ||
              mdr->snapid <= cap->client_follows))
    issued = cap->issued();

  // FIXME
  MutationImpl::LockOpVec lov;
  if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
    lov.add_rdlock(&ref->linklock);
  if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
    lov.add_rdlock(&ref->authlock);
  if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
    lov.add_rdlock(&ref->xattrlock);
  if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
    // Don't wait on unstable filelock if client is allowed to read file size.
    // This can reduce the response time of getattr in the case that multiple
    // clients do stat(2) and there are writers.
    // The downside of this optimization is that mds may not issue Fs caps along
    // with getattr reply. Client may need to send more getattr requests.
    if (mdr->is_rdlocked(&ref->filelock)) {
      lov.add_rdlock(&ref->filelock);
    } else if (ref->filelock.is_stable() ||
               ref->filelock.get_num_wrlocks() > 0 ||
               !ref->filelock.can_read(mdr->get_client())) {
      lov.add_rdlock(&ref->filelock);
      mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
    }
  }

  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  if (!check_access(mdr, ref, MAY_READ))
    return;

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  // note which caps are requested, so we return at least a snapshot
  // value for them.  (currently this matters for xattrs and inline data)
  mdr->getattr_caps = mask;

  mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());

  // reply
  dout(10) << "reply to stat on " << *req << dendl;
  mdr->tracei = ref;
  if (is_lookup)
    mdr->tracedn = mdr->dn[0].back();
  respond_to_request(mdr, 0);
}

struct C_MDS_LookupIno2 : public ServerContext {
  MDRequestRef mdr;
  C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
  void finish(int r) override {
    server->_lookup_ino_2(mdr, r);
  }
};
/*
 * filepath: ino
 */
void Server::handle_client_lookup_ino(MDRequestRef& mdr,
                                      bool want_parent, bool want_dentry)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  if ((uint64_t)req->head.args.lookupino.snapid > 0)
    return _lookup_snap_ino(mdr);

  inodeno_t ino = req->get_filepath().get_ino();
  CInode *in = mdcache->get_inode(ino);
  if (in && in->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -CEPHFS_ESTALE);
    return;
  }
  if (!in) {
    mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
    return;
  }

  // check for nothing (not read or write); this still applies the
  // path check.
  if (!check_access(mdr, in, 0))
    return;

  CDentry *dn = in->get_projected_parent_dn();
  CInode *diri = dn ? dn->get_dir()->inode : NULL;

  MutationImpl::LockOpVec lov;
  if (dn && (want_parent || want_dentry)) {
    mdr->pin(dn);
    lov.add_rdlock(&dn->lock);
  }

  unsigned mask = req->head.args.lookupino.mask;
  if (mask) {
    Capability *cap = in->get_client_cap(mdr->get_client());
    int issued = 0;
    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
      issued = cap->issued();
    // FIXME
    // permission bits, ACL/security xattrs
    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
      lov.add_rdlock(&in->authlock);
    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
      lov.add_rdlock(&in->xattrlock);

    mdr->getattr_caps = mask;
  }

  if (!lov.empty()) {
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    if (diri != NULL) {
      // need read access to directory inode
      if (!check_access(mdr, diri, MAY_READ))
        return;
    }
  }

  if (want_parent) {
    if (in->is_base()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
      return;
    }
    if (!diri || diri->is_stray()) {
      respond_to_request(mdr, -CEPHFS_ESTALE);
      return;
    }
    dout(10) << "reply to lookup_parent " << *in << dendl;
    mdr->tracei = diri;
    respond_to_request(mdr, 0);
  } else {
    if (want_dentry) {
      inodeno_t dirino = req->get_filepath2().get_ino();
      if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
        respond_to_request(mdr, -CEPHFS_ENOENT);
        return;
      }
      dout(10) << "reply to lookup_name " << *in << dendl;
    } else
      dout(10) << "reply to lookup_ino " << *in << dendl;

    mdr->tracei = in;
    if (want_dentry)
      mdr->tracedn = dn;
    respond_to_request(mdr, 0);
  }
}
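/* Look up a snapshotted inode by (ino, snapid). If the snap inode isn't in
 * cache, fall back to the parent ino and dentry-name hash supplied by the
 * client to locate and fetch the dirfrag that should contain it. */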
void Server::_lookup_snap_ino(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  vinodeno_t vino;
  vino.ino = req->get_filepath().get_ino();
  vino.snapid = (__u64)req->head.args.lookupino.snapid;
  inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
  __u32 hash = req->head.args.lookupino.hash;

  dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;

  CInode *in = mdcache->lookup_snap_inode(vino);
  if (!in) {
    in = mdcache->get_inode(vino.ino);
    if (in) {
      if (in->state_test(CInode::STATE_PURGING) ||
          !in->has_snap_data(vino.snapid)) {
        if (in->is_dir() || !parent_ino) {
          respond_to_request(mdr, -CEPHFS_ESTALE);
          return;
        }
        in = NULL;
      }
    }
  }

  if (in) {
    dout(10) << "reply to lookup_snap_ino " << *in << dendl;
    mdr->snapid = vino.snapid;
    mdr->tracei = in;
    respond_to_request(mdr, 0);
    return;
  }

  CInode *diri = NULL;
  if (parent_ino) {
    diri = mdcache->get_inode(parent_ino);
    if (!diri) {
      mdcache->open_ino(parent_ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
      return;
    }

    if (!diri->is_dir()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
      return;
    }

    MutationImpl::LockOpVec lov;
    lov.add_rdlock(&diri->dirfragtreelock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    frag_t frag = diri->dirfragtree[hash];
    CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
    if (!dir)
      return;

    if (!dir->is_complete()) {
      if (dir->is_frozen()) {
        mds->locker->drop_locks(mdr.get());
        mdr->drop_local_auth_pins();
        dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
        return;
      }
      dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
      return;
    }

    respond_to_request(mdr, -CEPHFS_ESTALE);
  } else {
    mdcache->open_ino(vino.ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
  }
}
void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
{
  inodeno_t ino = mdr->client_request->get_filepath().get_ino();
  dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;

  // `r` is a rank if >=0, else an error code
  if (r >= 0) {
    mds_rank_t dest_rank(r);
    if (dest_rank == mds->get_nodeid())
      dispatch_client_request(mdr);
    else
      mdcache->request_forward(mdr, dest_rank);
    return;
  }

  // give up
  if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
    r = -CEPHFS_ESTALE;
  respond_to_request(mdr, r);
}

/* This function takes responsibility for the passed mdr. */
void Server::handle_client_open(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  dout(7) << "open on " << req->get_filepath() << dendl;

  int flags = req->head.args.open.flags;
  int cmode = ceph_flags_to_mode(flags);
  if (cmode < 0) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  bool need_auth = !file_mode_is_readonly(cmode) ||
                   (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));

  if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
    dout(7) << "read-only FS" << dendl;
    respond_to_request(mdr, -CEPHFS_EROFS);
    return;
  }

  CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
  if (!cur)
    return;

  if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
    ceph_assert(!need_auth);
    mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
    cur = rdlock_path_pin_ref(mdr, true);  // re-traverse, this time requiring auth
    if (!cur)
      return;
  }

  if (!cur->is_file()) {
    // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
    cmode = CEPH_FILE_MODE_PIN;
    // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
    if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
      flags &= ~CEPH_O_TRUNC;
  }

  dout(10) << "open flags = " << flags
           << ", filemode = " << cmode
           << ", need_auth = " << need_auth
           << dendl;

  // regular file?
  /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
    dout(7) << "not a file or dir " << *cur << dendl;
    respond_to_request(mdr, -CEPHFS_ENXIO);  // FIXME what error do we want?
    return;
    }*/
  if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
    dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
    dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
    // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
    respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
    return;
  }

  if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
      !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
    dout(7) << "old client cannot open inline data file " << *cur << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  // snapped data is read only
  if (mdr->snapid != CEPH_NOSNAP &&
      ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
    dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
    respond_to_request(mdr, -CEPHFS_EROFS);
    return;
  }

  MutationImpl::LockOpVec lov;

  unsigned mask = req->head.args.open.mask;
  if (mask) {
    Capability *cap = cur->get_client_cap(mdr->get_client());
    int issued = 0;
    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
      issued = cap->issued();
    // permission bits, ACL/security xattrs
    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
      lov.add_rdlock(&cur->authlock);
    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
      lov.add_rdlock(&cur->xattrlock);

    mdr->getattr_caps = mask;
  }
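
  // O_TRUNC is handled as a truncate folded into the open: it requires an
  // xlock on the filelock and waits out any in-flight truncate before
  // do_open_truncate() journals the new size.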
  // O_TRUNC
  if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
    ceph_assert(cur->is_auth());

    lov.add_xlock(&cur->filelock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    if (!check_access(mdr, cur, MAY_WRITE))
      return;

    // wait for pending truncate?
    const auto& pi = cur->get_projected_inode();
    if (pi->is_truncating()) {
      dout(10) << " waiting for pending truncate from " << pi->truncate_from
               << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }

    do_open_truncate(mdr, cmode);
    return;
  }

  // sync filelock if snapped.
  //  this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
  //  and that data itself is flushed so that we can read the snapped data off disk.
  if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
    lov.add_rdlock(&cur->filelock);
  }

  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  mask = MAY_READ;
  if (cmode & CEPH_FILE_MODE_WR)
    mask |= MAY_WRITE;
  if (!check_access(mdr, cur, mask))
    return;

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  if (cur->is_file() || cur->is_dir()) {
    if (mdr->snapid == CEPH_NOSNAP) {
      // register new cap
      Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
      if (cap)
        dout(12) << "open issued caps " << ccap_string(cap->pending())
                 << " for " << req->get_source()
                 << " on " << *cur << dendl;
    } else {
      int caps = ceph_caps_for_mode(cmode);
      dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
               << " for " << req->get_source()
               << " snapid " << mdr->snapid
               << " on " << *cur << dendl;
      mdr->snap_caps = caps;
    }
  }

  // increase max_size?
  if (cmode & CEPH_FILE_MODE_WR)
    mds->locker->check_inode_max_size(cur);

  // make sure this inode gets into the journal
  if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
      mdcache->open_file_table.should_log_open(cur)) {
    EOpen *le = new EOpen(mds->mdlog);
    mdlog->start_entry(le);
    le->add_clean_inode(cur);
    mdlog->submit_entry(le);
  }

  // hit pop
  if (cmode & CEPH_FILE_MODE_WR)
    mds->balancer->hit_inode(cur, META_POP_IWR);
  else
    mds->balancer->hit_inode(cur, META_POP_IRD,
                             mdr->client_request->get_source().num());

  CDentry *dn = 0;
  if (req->get_dentry_wanted()) {
    ceph_assert(mdr->dn[0].size());
    dn = mdr->dn[0].back();
  }

  mdr->tracei = cur;
  mdr->tracedn = dn;
  respond_to_request(mdr, 0);
}
class C_MDS_openc_finish : public ServerLogContext {
  CDentry *dn;
  CInode *newi;
public:
  C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
    ServerLogContext(s, r), dn(d), newi(ni) {}
  void finish(int r) override {
    ceph_assert(r == 0);

    dn->pop_projected_linkage();

    // dirty inode, dn, dir
    newi->mark_dirty(mdr->ls);
    newi->mark_dirty_parent(mdr->ls, true);

    mdr->apply();

    get_mds()->locker->share_inode_max_size(newi);

    MDRequestRef null_ref;
    get_mds()->mdcache->send_dentry_link(dn, null_ref);

    get_mds()->balancer->hit_inode(newi, META_POP_IWR);

    server->respond_to_request(mdr, 0);

    ceph_assert(g_conf()->mds_kill_openc_at != 1);
  }
};
/* This function takes responsibility for the passed mdr. */
void Server::handle_client_openc(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  client_t client = mdr->get_client();

  dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;

  int cmode = ceph_flags_to_mode(req->head.args.open.flags);
  if (cmode < 0) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  bool excl = req->head.args.open.flags & CEPH_O_EXCL;
  CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
  if (!dn)
    return;

  CDentry::linkage_t *dnl = dn->get_projected_linkage();
  if (!excl && !dnl->is_null()) {
    // it existed.
    mds->locker->xlock_downgrade(&dn->lock, mdr.get());

    MutationImpl::LockOpVec lov;
    lov.add_rdlock(&dnl->get_inode()->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    handle_client_open(mdr);
    return;
  }

  ceph_assert(dnl->is_null());

  if (req->get_alternate_name().size() > alternate_name_max) {
    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
    return;
  }
  dn->set_alternate_name(req->get_alternate_name());

  // set layout
  file_layout_t layout;
  if (mdr->dir_layout != file_layout_t())
    layout = mdr->dir_layout;
  else
    layout = mdcache->default_file_layout;

  // What kind of client caps are required to complete this operation
  uint64_t access = MAY_WRITE;

  const auto default_layout = layout;

  // fill in any special params from client
  if (req->head.args.open.stripe_unit)
    layout.stripe_unit = req->head.args.open.stripe_unit;
  if (req->head.args.open.stripe_count)
    layout.stripe_count = req->head.args.open.stripe_count;
  if (req->head.args.open.object_size)
    layout.object_size = req->head.args.open.object_size;
  if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
      (__s32)req->head.args.open.pool >= 0) {
    layout.pool_id = req->head.args.open.pool;

    // make sure we have as new a map as the client
    if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
      mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }
  }

  // If client doesn't have capability to modify layout pools, then
  // only permit this request if the requested pool matches what the
  // file would have inherited anyway from its parent.
  if (default_layout != layout) {
    access |= MAY_SET_VXATTR;
  }

  if (!layout.is_valid()) {
    dout(10) << " invalid initial file layout" << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }
  if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
    dout(10) << " invalid data pool " << layout.pool_id << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  // created null dn.
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();
  if (!check_access(mdr, diri, access))
    return;
  if (!check_fragment_space(mdr, dir))
    return;

  if (mdr->dn[0].size() == 1)
    mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);

  // create inode.
  CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
                                   req->head.args.open.mode | S_IFREG, &layout);
  ceph_assert(newi);

  // it's a file.
  dn->push_projected_linkage(newi);

  auto _inode = newi->_get_inode();
  _inode->version = dn->pre_dirty();
  if (layout.pool_id != mdcache->default_file_layout.pool_id)
    _inode->add_old_pool(mdcache->default_file_layout.pool_id);
  _inode->update_backtrace();
  _inode->rstat.rfiles = 1;
  _inode->accounted_rstat = _inode->rstat;

  SnapRealm *realm = diri->find_snaprealm();
  snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
  ceph_assert(follows >= realm->get_newest_seq());

  ceph_assert(dn->first == follows+1);
  newi->first = dn->first;

  // do the open
  Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
  newi->authlock.set_state(LOCK_EXCL);
  newi->xattrlock.set_state(LOCK_EXCL);

  if (cap && (cmode & CEPH_FILE_MODE_WR)) {
    _inode->client_ranges[client].range.first = 0;
    _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
    _inode->client_ranges[client].follows = follows;
    newi->mark_clientwriteable();
    cap->mark_clientwriteable();
  }

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "openc");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true, true);

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(newi->ino());

  C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
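  // If the client supports inode delegation, hand it a share of our
  // preallocated inos along with the created ino, so it can assign inode
  // numbers for async creates without a round trip to the MDS.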
9f95a23c
TL
4453 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4454 openc_response_t ocresp;
4455
4456 dout(10) << "adding created_ino and delegated_inos" << dendl;
f67539c2 4457 ocresp.created_ino = _inode->ino;
9f95a23c
TL
4458
4459 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4460 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4461 unsigned frac = 100 / delegate_inos_pct;
4462 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4463 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4464 }
4465
4466 encode(ocresp, mdr->reply_extra_bl);
4467 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
7c673cae
FG
4468 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4469 // add the created ino onto the reply if the client supports the REPLY_CREATE_INODE feature
f67539c2 4470 encode(newi->ino(), mdr->reply_extra_bl);
7c673cae
FG
4471 }
4472
f67539c2 4473 journal_and_reply(mdr, newi, dn, le, fin);
7c673cae
FG
4474
4475 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4476 // have overshot the split size (multiple opencs in flight), so here is
4477 // an early chance to split the dir if this openc makes it oversized.
4478 mds->balancer->maybe_fragment(dir, false);
4479}
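
// Editor's aside: an illustrative sketch, not part of Server.cc, of the
// delegated-ino sizing rule in the openc path above. The numbers are
// hypothetical: with delegate_inos_pct=20 and mds_client_prealloc_inos=1000,
// frac = 100/20 = 5, so the target is 1000/5 = 200 delegated inos, and the
// session is only topped back up once it holds fewer than 200/2 = 100.
static unsigned delegated_ino_target(unsigned prealloc_inos, unsigned pct)
{
  if (pct == 0 || pct > 100)
    return 0;                    // delegation disabled / nonsensical percentage
  unsigned frac = 100 / pct;     // e.g. 100/20 = 5
  return prealloc_inos / frac;   // refill threshold is half this target
}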
4480
4481
4482
4483void Server::handle_client_readdir(MDRequestRef& mdr)
4484{
9f95a23c 4485 const cref_t<MClientRequest> &req = mdr->client_request;
adb31ebb 4486 Session *session = mds->get_session(req);
7c673cae 4487 client_t client = req->get_source().num();
11fdf7f2 4488 MutationImpl::LockOpVec lov;
9f95a23c 4489 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
7c673cae
FG
4490 if (!diri) return;
4491
4492 // it's a directory, right?
4493 if (!diri->is_dir()) {
4494 // not a dir
f67539c2
TL
4495 dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
4496 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
4497 return;
4498 }
4499
adb31ebb
TL
4500 auto num_caps = session->get_num_caps();
4501 auto session_cap_acquisition = session->get_cap_acquisition();
4502
4503 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
4504 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
4505 << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
4506 if (logger)
4507 logger->inc(l_mdss_cap_acquisition_throttle);
4508
4509 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
4510 return;
4511 }
4512
11fdf7f2
TL
4513 lov.add_rdlock(&diri->filelock);
4514 lov.add_rdlock(&diri->dirfragtreelock);
7c673cae 4515
11fdf7f2 4516 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
4517 return;
4518
4519 if (!check_access(mdr, diri, MAY_READ))
4520 return;
4521
4522 // which frag?
4523 frag_t fg = (__u32)req->head.args.readdir.frag;
4524 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4525 string offset_str = req->get_path2();
4526
4527 __u32 offset_hash = 0;
4528 if (!offset_str.empty())
4529 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4530 else
4531 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4532
4533 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4534 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4535
4536 // does the frag exist?
4537 if (diri->dirfragtree[fg.value()] != fg) {
4538 frag_t newfg;
4539 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4540 if (fg.contains((unsigned)offset_hash)) {
4541 newfg = diri->dirfragtree[offset_hash];
4542 } else {
4543 // client actually wants next frag
4544 newfg = diri->dirfragtree[fg.value()];
4545 }
4546 } else {
4547 offset_str.clear();
4548 newfg = diri->dirfragtree[fg.value()];
4549 }
4550 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4551 fg = newfg;
4552 }
4553
4554 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4555 if (!dir) return;
4556
4557 // ok!
4558 dout(10) << "handle_client_readdir on " << *dir << dendl;
11fdf7f2 4559 ceph_assert(dir->is_auth());
7c673cae
FG
4560
4561 if (!dir->is_complete()) {
4562 if (dir->is_frozen()) {
4563 dout(7) << "dir is frozen " << *dir << dendl;
4564 mds->locker->drop_locks(mdr.get());
4565 mdr->drop_local_auth_pins();
4566 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4567 return;
4568 }
4569 // fetch
4570 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4571 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4572 return;
4573 }
4574
4575#ifdef MDS_VERIFY_FRAGSTAT
4576 dir->verify_fragstat();
4577#endif
4578
4579 utime_t now = ceph_clock_now();
4580 mdr->set_mds_stamp(now);
4581
4582 snapid_t snapid = mdr->snapid;
4583 dout(10) << "snapid " << snapid << dendl;
4584
4585 SnapRealm *realm = diri->find_snaprealm();
4586
4587 unsigned max = req->head.args.readdir.max_entries;
4588 if (!max)
4589 max = dir->get_num_any(); // whatever, something big.
4590 unsigned max_bytes = req->head.args.readdir.max_bytes;
4591 if (!max_bytes)
4592 // make sure at least one item can be encoded
11fdf7f2 4593 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
7c673cae
FG
4594
4595 // start final blob
4596 bufferlist dirbl;
11fdf7f2
TL
4597 DirStat ds;
4598 ds.frag = dir->get_frag();
4599 ds.auth = dir->get_dir_auth().first;
f91f0fd5 4600 if (dir->is_auth() && !forward_all_requests_to_auth)
11fdf7f2
TL
4601 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4602
4603 dir->encode_dirstat(dirbl, mdr->session->info, ds);
7c673cae
FG
4604
4605 // count bytes available.
4606 // this isn't perfect, but we should capture the main variable/unbounded size items!
4607 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4608 int bytes_left = max_bytes - front_bytes;
4609 bytes_left -= realm->get_snap_trace().length();
4610
4611 // build dir contents
4612 bufferlist dnbl;
4613 __u32 numfiles = 0;
4614 bool start = !offset_hash && offset_str.empty();
7c673cae
FG
4615 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4616 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
181888fb
FG
4617 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4618 bool end = (it == dir->end());
4619 for (; !end && numfiles < max; end = (it == dir->end())) {
7c673cae
FG
4620 CDentry *dn = it->second;
4621 ++it;
4622
4623 if (dn->state_test(CDentry::STATE_PURGING))
4624 continue;
4625
4626 bool dnp = dn->use_projected(client, mdr);
4627 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4628
4629 if (dnl->is_null())
4630 continue;
4631
4632 if (dn->last < snapid || dn->first > snapid) {
4633 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4634 continue;
4635 }
4636
4637 if (!start) {
4638 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4639 if (!(offset_key < dn->key()))
4640 continue;
4641 }
4642
4643 CInode *in = dnl->get_inode();
4644
4645 if (in && in->ino() == CEPH_INO_CEPH)
4646 continue;
4647
4648 // remote link?
4649 // better for the MDS to do the work if we think the client will stat any of these files.
4650 if (dnl->is_remote() && !in) {
4651 in = mdcache->get_inode(dnl->get_remote_ino());
4652 if (in) {
4653 dn->link_remote(dnl, in);
4654 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4655 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4656 continue;
4657 } else {
4658 // touch everything I _do_ have
94b18763
FG
4659 for (auto &p : *dir) {
4660 if (!p.second->get_linkage()->is_null())
4661 mdcache->lru.lru_touch(p.second);
4662 }
7c673cae
FG
4663
4664 // already issued caps and leases, reply immediately.
4665 if (dnbl.length() > 0) {
4666 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4667 dout(10) << " open remote dentry after caps were issued, stopping at "
4668 << dnbl.length() << " < " << bytes_left << dendl;
4669 break;
4670 }
4671
4672 mds->locker->drop_locks(mdr.get());
4673 mdr->drop_local_auth_pins();
4674 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4675 return;
4676 }
4677 }
11fdf7f2 4678 ceph_assert(in);
7c673cae 4679
94b18763 4680 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
7c673cae
FG
4681 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4682 break;
4683 }
4684
4685 unsigned start_len = dnbl.length();
4686
4687 // dentry
4688 dout(12) << "including dn " << *dn << dendl;
11fdf7f2 4689 encode(dn->get_name(), dnbl);
9f95a23c
TL
4690 int lease_mask = dnl->is_primary() ? CEPH_LEASE_PRIMARY_LINK : 0;
4691 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, dnbl);
7c673cae
FG
4692
4693 // inode
4694 dout(12) << "including inode " << *in << dendl;
4695 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4696 if (r < 0) {
4697 // chop off dn->name, lease
4698 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4699 bufferlist keep;
4700 keep.substr_of(dnbl, 0, start_len);
4701 dnbl.swap(keep);
4702 break;
4703 }
11fdf7f2 4704 ceph_assert(r >= 0);
7c673cae
FG
4705 numfiles++;
4706
4707 // touch dn
4708 mdcache->lru.lru_touch(dn);
4709 }
4710
adb31ebb
TL
4711 session->touch_readdir_cap(numfiles);
4712
7c673cae
FG
4713 __u16 flags = 0;
4714 if (end) {
4715 flags = CEPH_READDIR_FRAG_END;
4716 if (start)
4717 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4718 }
4719 // clients without BITFLAGS support only understand the END and COMPLETE flags
4720 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4721 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4722 }
4723
4724 // finish final blob
11fdf7f2
TL
4725 encode(numfiles, dirbl);
4726 encode(flags, dirbl);
7c673cae
FG
4727 dirbl.claim_append(dnbl);
4728
4729 // yay, reply
4730 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4731 << " bytes=" << dirbl.length()
4732 << " start=" << (int)start
4733 << " end=" << (int)end
4734 << dendl;
4735 mdr->reply_extra_bl = dirbl;
4736
4737 // bump popularity. NOTE: this doesn't quite capture it.
11fdf7f2 4738 mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles);
7c673cae
FG
4739
4740 // reply
4741 mdr->tracei = diri;
4742 respond_to_request(mdr, 0);
4743}
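
// Editor's aside: an illustrative sketch, not part of Server.cc, of the
// reply-budget tests in the readdir loop above; it folds the per-dentry
// headroom check (name + __u32 + LeaseStat) and the later encode_inodestat()
// budget into a single predicate with an assumed combined cost.
static bool readdir_entry_fits(int bytes_left, int used,
                               const std::string& name,
                               int lease_and_inode_cost)
{
  // once the next dentry would overflow what remains of max_bytes,
  // encoding stops and the reply is returned without CEPH_READDIR_FRAG_END
  return used + (int)name.size() + (int)sizeof(uint32_t)
           + lease_and_inode_cost <= bytes_left;
}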
4744
4745
4746
4747// ===============================================================================
4748// INODE UPDATES
4749
4750
4751/*
4752 * finisher for basic inode updates
4753 */
4754class C_MDS_inode_update_finish : public ServerLogContext {
4755 CInode *in;
adb31ebb 4756 bool truncating_smaller, changed_ranges, adjust_realm;
7c673cae
FG
4757public:
4758 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
adb31ebb 4759 bool sm=false, bool cr=false, bool ar=false) :
11fdf7f2 4760 ServerLogContext(s, r), in(i),
adb31ebb 4761 truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
7c673cae 4762 void finish(int r) override {
11fdf7f2 4763 ceph_assert(r == 0);
7c673cae 4764
adb31ebb
TL
4765 int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);
4766
7c673cae 4767 // apply
7c673cae
FG
4768 mdr->apply();
4769
11fdf7f2
TL
4770 MDSRank *mds = get_mds();
4771
7c673cae 4772 // notify any clients
f67539c2 4773 if (truncating_smaller && in->get_inode()->is_truncating()) {
11fdf7f2
TL
4774 mds->locker->issue_truncate(in);
4775 mds->mdcache->truncate_inode(in, mdr->ls);
4776 }
4777
adb31ebb
TL
4778 if (adjust_realm) {
4779 mds->mdcache->send_snap_update(in, 0, snap_op);
4780 mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
7c673cae
FG
4781 }
4782
11fdf7f2 4783 get_mds()->balancer->hit_inode(in, META_POP_IWR);
7c673cae
FG
4784
4785 server->respond_to_request(mdr, 0);
4786
4787 if (changed_ranges)
4788 get_mds()->locker->share_inode_max_size(in);
4789 }
4790};
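
// Editor's aside (not part of Server.cc): the finish() sequence above is the
// shared commit path for the setattr/setlayout/vxattr handlers that follow:
//   1. mdr->apply()            -- make the projected inode values live
//   2. truncate, if we shrank  -- issue_truncate() then truncate_inode()
//   3. snaprealm update/split  -- only when adjust_realm was requested
//   4. respond_to_request(mdr, 0)
//   5. share_inode_max_size()  -- only when client_ranges changed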
4791
4792void Server::handle_client_file_setlock(MDRequestRef& mdr)
4793{
9f95a23c 4794 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 4795 MutationImpl::LockOpVec lov;
7c673cae
FG
4796
4797 // get the inode to operate on, and set up any locks needed for that
9f95a23c 4798 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4799 if (!cur)
4800 return;
4801
11fdf7f2 4802 lov.add_xlock(&cur->flocklock);
7c673cae
FG
4803 /* acquire_locks will return true if it gets the locks. If it fails,
4804 it will redeliver this request later, so just drop the request for now.
4805 */
11fdf7f2 4806 if (!mds->locker->acquire_locks(mdr, lov)) {
7c673cae
FG
4807 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4808 return;
4809 }
4810
4811 // copy the lock change into a ceph_filelock so we can store/apply it
4812 ceph_filelock set_lock;
4813 set_lock.start = req->head.args.filelock_change.start;
4814 set_lock.length = req->head.args.filelock_change.length;
4815 set_lock.client = req->get_orig_source().num();
4816 set_lock.owner = req->head.args.filelock_change.owner;
4817 set_lock.pid = req->head.args.filelock_change.pid;
4818 set_lock.type = req->head.args.filelock_change.type;
4819 bool will_wait = req->head.args.filelock_change.wait;
4820
4821 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4822
4823 ceph_lock_state_t *lock_state = NULL;
4824 bool interrupt = false;
4825
4826 // get the appropriate lock state
4827 switch (req->head.args.filelock_change.rule) {
4828 case CEPH_LOCK_FLOCK_INTR:
4829 interrupt = true;
4830 // fall-thru
4831 case CEPH_LOCK_FLOCK:
4832 lock_state = cur->get_flock_lock_state();
4833 break;
4834
4835 case CEPH_LOCK_FCNTL_INTR:
4836 interrupt = true;
4837 // fall-thru
4838 case CEPH_LOCK_FCNTL:
4839 lock_state = cur->get_fcntl_lock_state();
4840 break;
4841
4842 default:
4843 dout(10) << "got unknown lock type " << set_lock.type
4844 << ", dropping request!" << dendl;
f67539c2 4845 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
7c673cae
FG
4846 return;
4847 }
4848
4849 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4850 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4851 list<ceph_filelock> activated_locks;
11fdf7f2 4852 MDSContext::vec waiters;
7c673cae
FG
4853 if (lock_state->is_waiting(set_lock)) {
4854 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4855 lock_state->remove_waiting(set_lock);
4856 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4857 } else if (!interrupt) {
4858 dout(10) << " unlock attempt on " << set_lock << dendl;
4859 lock_state->remove_lock(set_lock, activated_locks);
4860 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4861 }
4862 mds->queue_waiters(waiters);
4863
4864 respond_to_request(mdr, 0);
4865 } else {
4866 dout(10) << " lock attempt on " << set_lock << dendl;
4867 bool deadlock = false;
4868 if (mdr->more()->flock_was_waiting &&
4869 !lock_state->is_waiting(set_lock)) {
4870 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
f67539c2 4871 respond_to_request(mdr, -CEPHFS_EINTR);
7c673cae
FG
4872 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4873 dout(10) << " it failed on this attempt" << dendl;
4874 // couldn't set lock right now
4875 if (deadlock) {
f67539c2 4876 respond_to_request(mdr, -CEPHFS_EDEADLK);
7c673cae 4877 } else if (!will_wait) {
f67539c2 4878 respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
7c673cae
FG
4879 } else {
4880 dout(10) << " added to waiting list" << dendl;
11fdf7f2 4881 ceph_assert(lock_state->is_waiting(set_lock));
7c673cae
FG
4882 mdr->more()->flock_was_waiting = true;
4883 mds->locker->drop_locks(mdr.get());
4884 mdr->drop_local_auth_pins();
1adf2230
AA
4885 mdr->mark_event("failed to add lock, waiting");
4886 mdr->mark_nowarn();
7c673cae
FG
4887 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4888 }
4889 } else
4890 respond_to_request(mdr, 0);
4891 }
4892 dout(10) << " state after lock change: " << *lock_state << dendl;
4893}
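
// Editor's aside: an illustrative sketch, not part of Server.cc, of the rule
// dispatch at the top of handle_client_file_setlock(): the *_INTR variants
// address the same lock table as their base rule but may only interrupt
// waiters, and unknown rules are answered with -CEPHFS_EOPNOTSUPP.
static std::pair<bool, bool> classify_lock_rule(int rule)  // {interrupt, known}
{
  switch (rule) {
  case CEPH_LOCK_FLOCK_INTR:
  case CEPH_LOCK_FCNTL_INTR:
    return {true, true};
  case CEPH_LOCK_FLOCK:
  case CEPH_LOCK_FCNTL:
    return {false, true};
  default:
    return {false, false};
  }
}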
4894
4895void Server::handle_client_file_readlock(MDRequestRef& mdr)
4896{
9f95a23c 4897 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 4898 MutationImpl::LockOpVec lov;
7c673cae
FG
4899
4900 // get the inode to operate on, and set up any locks needed for that
9f95a23c 4901 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4902 if (!cur)
4903 return;
4904
4905 /* acquire_locks will return true if it gets the locks. If it fails,
4906 it will redeliver this request later, so just drop the request for now.
4907 */
11fdf7f2
TL
4908 lov.add_rdlock(&cur->flocklock);
4909 if (!mds->locker->acquire_locks(mdr, lov)) {
7c673cae
FG
4910 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4911 return;
4912 }
4913
4914 // copy the lock change into a ceph_filelock so we can store/apply it
4915 ceph_filelock checking_lock;
4916 checking_lock.start = req->head.args.filelock_change.start;
4917 checking_lock.length = req->head.args.filelock_change.length;
4918 checking_lock.client = req->get_orig_source().num();
4919 checking_lock.owner = req->head.args.filelock_change.owner;
4920 checking_lock.pid = req->head.args.filelock_change.pid;
4921 checking_lock.type = req->head.args.filelock_change.type;
4922
4923 // get the appropriate lock state
4924 ceph_lock_state_t *lock_state = NULL;
4925 switch (req->head.args.filelock_change.rule) {
4926 case CEPH_LOCK_FLOCK:
4927 lock_state = cur->get_flock_lock_state();
4928 break;
4929
4930 case CEPH_LOCK_FCNTL:
4931 lock_state = cur->get_fcntl_lock_state();
4932 break;
4933
4934 default:
4935 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
f67539c2 4936 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4937 return;
4938 }
4939 lock_state->look_for_lock(checking_lock);
4940
4941 bufferlist lock_bl;
11fdf7f2 4942 encode(checking_lock, lock_bl);
7c673cae
FG
4943
4944 mdr->reply_extra_bl = lock_bl;
4945 respond_to_request(mdr, 0);
4946}
4947
4948void Server::handle_client_setattr(MDRequestRef& mdr)
4949{
9f95a23c 4950 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 4951 MutationImpl::LockOpVec lov;
9f95a23c 4952 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4953 if (!cur) return;
4954
4955 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 4956 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
4957 return;
4958 }
4959 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
f67539c2 4960 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
4961 return;
4962 }
4963
4964 __u32 mask = req->head.args.setattr.mask;
4965 __u32 access_mask = MAY_WRITE;
4966
4967 // xlock inode
4968 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
11fdf7f2 4969 lov.add_xlock(&cur->authlock);
7c673cae 4970 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
11fdf7f2 4971 lov.add_xlock(&cur->filelock);
7c673cae 4972 if (mask & CEPH_SETATTR_CTIME)
11fdf7f2 4973 lov.add_wrlock(&cur->versionlock);
7c673cae 4974
11fdf7f2 4975 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
4976 return;
4977
f67539c2 4978 if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
7c673cae
FG
4979 access_mask |= MAY_CHOWN;
4980
f67539c2 4981 if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
7c673cae
FG
4982 access_mask |= MAY_CHGRP;
4983
4984 if (!check_access(mdr, cur, access_mask))
4985 return;
4986
4987 // trunc from bigger -> smaller?
f67539c2 4988 const auto& pip = cur->get_projected_inode();
7c673cae 4989
94b18763 4990 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
7c673cae 4991
f67539c2 4992 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
7c673cae 4993 if (is_full && req->head.args.setattr.size > old_size) {
f67539c2
TL
4994 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
4995 respond_to_request(mdr, -CEPHFS_ENOSPC);
7c673cae
FG
4996 return;
4997 }
4998
4999 bool truncating_smaller = false;
5000 if (mask & CEPH_SETATTR_SIZE) {
5001 truncating_smaller = req->head.args.setattr.size < old_size;
94b18763
FG
5002 if (truncating_smaller && pip->is_truncating()) {
5003 dout(10) << " waiting for pending truncate from " << pip->truncate_from
5004 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
7c673cae
FG
5005 mds->locker->drop_locks(mdr.get());
5006 mdr->drop_local_auth_pins();
5007 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
5008 return;
5009 }
5010 }
5011
5012 bool changed_ranges = false;
5013
5014 // project update
5015 mdr->ls = mdlog->get_current_segment();
5016 EUpdate *le = new EUpdate(mdlog, "setattr");
5017 mdlog->start_entry(le);
5018
f67539c2 5019 auto pi = cur->project_inode(mdr);
7c673cae
FG
5020
5021 if (mask & CEPH_SETATTR_UID)
f67539c2 5022 pi.inode->uid = req->head.args.setattr.uid;
7c673cae 5023 if (mask & CEPH_SETATTR_GID)
f67539c2 5024 pi.inode->gid = req->head.args.setattr.gid;
7c673cae
FG
5025
5026 if (mask & CEPH_SETATTR_MODE)
f67539c2 5027 pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
7c673cae 5028 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
f67539c2
TL
5029 S_ISREG(pi.inode->mode) &&
5030 (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
5031 pi.inode->mode &= ~(S_ISUID|S_ISGID);
7c673cae
FG
5032 }
5033
5034 if (mask & CEPH_SETATTR_MTIME)
f67539c2 5035 pi.inode->mtime = req->head.args.setattr.mtime;
7c673cae 5036 if (mask & CEPH_SETATTR_ATIME)
f67539c2 5037 pi.inode->atime = req->head.args.setattr.atime;
7c673cae 5038 if (mask & CEPH_SETATTR_BTIME)
f67539c2 5039 pi.inode->btime = req->head.args.setattr.btime;
7c673cae 5040 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
f67539c2 5041 pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
7c673cae
FG
5042 if (mask & CEPH_SETATTR_SIZE) {
5043 if (truncating_smaller) {
f67539c2 5044 pi.inode->truncate(old_size, req->head.args.setattr.size);
7c673cae
FG
5045 le->metablob.add_truncate_start(cur->ino());
5046 } else {
f67539c2
TL
5047 pi.inode->size = req->head.args.setattr.size;
5048 pi.inode->rstat.rbytes = pi.inode->size;
7c673cae 5049 }
f67539c2 5050 pi.inode->mtime = mdr->get_op_stamp();
7c673cae
FG
5051
5052 // adjust client's max_size?
f67539c2 5053 if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
f91f0fd5 5054 dout(10) << " client_ranges " << cur->get_previous_projected_inode()->client_ranges
f67539c2 5055 << " -> " << pi.inode->client_ranges << dendl;
7c673cae
FG
5056 changed_ranges = true;
5057 }
5058 }
5059
f67539c2
TL
5060 pi.inode->version = cur->pre_dirty();
5061 pi.inode->ctime = mdr->get_op_stamp();
5062 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5063 pi.inode->rstat.rctime = mdr->get_op_stamp();
5064 pi.inode->change_attr++;
7c673cae
FG
5065
5066 // log + wait
5067 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5068 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5069 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5070
5071 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5072 truncating_smaller, changed_ranges));
5073
5074 // flush immediately if there are readers/writers waiting
11fdf7f2 5075 if (mdr->is_xlocked(&cur->filelock) &&
7c673cae
FG
5076 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5077 mds->mdlog->flush();
5078}
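
// Editor's aside: an illustrative sketch, not part of Server.cc, naming the
// inode lock a single setattr mask bit takes in the handler above (the real
// code adds every applicable lock for a combined mask).
static const char* lock_for_setattr_bit(uint32_t bit)
{
  if (bit & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|
             CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
    return "xlock authlock";
  if (bit & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
    return "xlock filelock";
  if (bit & CEPH_SETATTR_CTIME)
    return "wrlock versionlock";
  return "no inode lock needed";
}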
5079
5080/* Takes responsibility for mdr */
5081void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5082{
5083 CInode *in = mdr->in[0];
5084 client_t client = mdr->get_client();
11fdf7f2 5085 ceph_assert(in);
7c673cae
FG
5086
5087 dout(10) << "do_open_truncate " << *in << dendl;
5088
5089 SnapRealm *realm = in->find_snaprealm();
9f95a23c 5090 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
7c673cae
FG
5091
5092 mdr->ls = mdlog->get_current_segment();
5093 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5094 mdlog->start_entry(le);
5095
5096 // prepare
f67539c2
TL
5097 auto pi = in->project_inode(mdr);
5098 pi.inode->version = in->pre_dirty();
5099 pi.inode->mtime = pi.inode->ctime = mdr->get_op_stamp();
5100 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5101 pi.inode->rstat.rctime = mdr->get_op_stamp();
5102 pi.inode->change_attr++;
5103
5104 uint64_t old_size = std::max<uint64_t>(pi.inode->size, mdr->client_request->head.args.open.old_size);
7c673cae 5105 if (old_size > 0) {
f67539c2 5106 pi.inode->truncate(old_size, 0);
7c673cae
FG
5107 le->metablob.add_truncate_start(in->ino());
5108 }
5109
5110 bool changed_ranges = false;
a8e16298 5111 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
f67539c2
TL
5112 pi.inode->client_ranges[client].range.first = 0;
5113 pi.inode->client_ranges[client].range.last = pi.inode->get_layout_size_increment();
5114 pi.inode->client_ranges[client].follows = realm->get_newest_seq();
7c673cae 5115 changed_ranges = true;
f91f0fd5 5116 in->mark_clientwriteable();
a8e16298 5117 cap->mark_clientwriteable();
7c673cae
FG
5118 }
5119
5120 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5121
5122 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5123 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5124
5125 // make sure ino gets into the journal
5126 le->metablob.add_opened_ino(in->ino());
7c673cae
FG
5127
5128 mdr->o_trunc = true;
5129
5130 CDentry *dn = 0;
5131 if (mdr->client_request->get_dentry_wanted()) {
11fdf7f2 5132 ceph_assert(mdr->dn[0].size());
7c673cae
FG
5133 dn = mdr->dn[0].back();
5134 }
5135
5136 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5137 changed_ranges));
5138 // Although the `open` part can give an early reply, the truncation won't
5139 // happen until our EUpdate is persistent; to give the client a prompt
5140 // response we must also flush that event.
5141 mdlog->flush();
5142}
5143
5144
5145/* This function cleans up the passed mdr */
5146void Server::handle_client_setlayout(MDRequestRef& mdr)
5147{
9f95a23c
TL
5148 const cref_t<MClientRequest> &req = mdr->client_request;
5149 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
5150 if (!cur) return;
5151
5152 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 5153 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
5154 return;
5155 }
5156 if (!cur->is_file()) {
f67539c2 5157 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5158 return;
5159 }
5160 if (cur->get_projected_inode()->size ||
5161 cur->get_projected_inode()->truncate_seq > 1) {
f67539c2 5162 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
5163 return;
5164 }
5165
5166 // validate layout
5167 file_layout_t layout = cur->get_projected_inode()->layout;
5168 // save existing layout for later
5169 const auto old_layout = layout;
5170
5171 int access = MAY_WRITE;
5172
5173 if (req->head.args.setlayout.layout.fl_object_size > 0)
5174 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5175 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5176 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5177 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5178 layout.stripe_count = req->head.args.setlayout.layout.fl_stripe_count;
5179 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5180 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5181
5182 // make sure we have as new a map as the client
5183 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5184 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5185 return;
5186 }
5187 }
5188
5189 // Don't permit layout modifications without 'p' caps
5190 if (layout != old_layout) {
5191 access |= MAY_SET_VXATTR;
5192 }
5193
5194 if (!layout.is_valid()) {
5195 dout(10) << "bad layout" << dendl;
f67539c2 5196 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5197 return;
5198 }
5199 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5200 dout(10) << " invalid data pool " << layout.pool_id << dendl;
f67539c2 5201 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5202 return;
5203 }
5204
9f95a23c 5205 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5206 lov.add_xlock(&cur->filelock);
5207 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5208 return;
5209
5210 if (!check_access(mdr, cur, access))
5211 return;
5212
5213 // project update
f67539c2
TL
5214 auto pi = cur->project_inode(mdr);
5215 pi.inode->layout = layout;
7c673cae 5216 // add the old pool to the inode
f67539c2
TL
5217 pi.inode->add_old_pool(old_layout.pool_id);
5218 pi.inode->version = cur->pre_dirty();
5219 pi.inode->ctime = mdr->get_op_stamp();
5220 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5221 pi.inode->rstat.rctime = mdr->get_op_stamp();
5222 pi.inode->change_attr++;
7c673cae
FG
5223
5224 // log + wait
5225 mdr->ls = mdlog->get_current_segment();
5226 EUpdate *le = new EUpdate(mdlog, "setlayout");
5227 mdlog->start_entry(le);
5228 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5229 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5230 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5231
5232 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5233}
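
// Editor's aside: an illustrative sketch, not part of Server.cc, of the merge
// rule shared by handle_client_setlayout() above and
// handle_client_setdirlayout() below: a request field only overrides the
// inherited layout when the client sent a value greater than zero.
static void merge_requested_layout(file_layout_t* layout,
                                   const ceph_mds_request_args& args)
{
  if (args.setlayout.layout.fl_object_size > 0)
    layout->object_size = args.setlayout.layout.fl_object_size;
  if (args.setlayout.layout.fl_stripe_unit > 0)
    layout->stripe_unit = args.setlayout.layout.fl_stripe_unit;
  if (args.setlayout.layout.fl_stripe_count > 0)
    layout->stripe_count = args.setlayout.layout.fl_stripe_count;
  if (args.setlayout.layout.fl_pg_pool > 0)
    layout->pool_id = args.setlayout.layout.fl_pg_pool;
}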
5234
9f95a23c 5235bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
7c673cae 5236{
9f95a23c
TL
5237 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5238 return true;
5239
11fdf7f2 5240 MutationImpl::LockOpVec lov;
9f95a23c
TL
5241 lov.add_xlock(&in->policylock);
5242 if (xlock_snaplock)
5243 lov.add_xlock(&in->snaplock);
5244 else
5245 lov.add_rdlock(&in->snaplock);
5246 if (!mds->locker->acquire_locks(mdr, lov))
5247 return false;
7c673cae 5248
9f95a23c
TL
5249 if (want_layout && in->get_projected_inode()->has_layout()) {
5250 mdr->dir_layout = in->get_projected_inode()->layout;
5251 want_layout = false;
5252 }
5253 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5254 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5255 return false;
7c673cae
FG
5256 }
5257
9f95a23c
TL
5258 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5259 return true;
5260}
5261
5262CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5263{
5264 CInode *in = mdcache->get_inode(ino);
5265 if (!in || in->state_test(CInode::STATE_PURGING)) {
f67539c2 5266 respond_to_request(mdr, -CEPHFS_ESTALE);
9f95a23c
TL
5267 return nullptr;
5268 }
5269 if (!in->is_auth()) {
5270 mdcache->request_forward(mdr, in->authority().first);
5271 return nullptr;
5272 }
5273
5274 return in;
5275}
5276
5277void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5278{
5279 const cref_t<MClientRequest> &req = mdr->client_request;
5280
5281 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5282 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5283 if (!cur)
5284 return;
5285
7c673cae 5286 if (!cur->is_dir()) {
f67539c2 5287 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
5288 return;
5289 }
5290
9f95a23c 5291 if (!xlock_policylock(mdr, cur, true))
7c673cae
FG
5292 return;
5293
5294 // validate layout
f67539c2 5295 const auto& old_pi = cur->get_projected_inode();
7c673cae
FG
5296 file_layout_t layout;
5297 if (old_pi->has_layout())
5298 layout = old_pi->layout;
9f95a23c
TL
5299 else if (mdr->dir_layout != file_layout_t())
5300 layout = mdr->dir_layout;
7c673cae
FG
5301 else
5302 layout = mdcache->default_file_layout;
5303
5304 // Level of access required to complete
5305 int access = MAY_WRITE;
5306
5307 const auto old_layout = layout;
5308
5309 if (req->head.args.setlayout.layout.fl_object_size > 0)
5310 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5311 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5312 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5313 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5314 layout.stripe_count = req->head.args.setlayout.layout.fl_stripe_count;
5315 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5316 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5317 // make sure we have as new a map as the client
5318 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5319 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5320 return;
5321 }
5322 }
5323
5324 if (layout != old_layout) {
5325 access |= MAY_SET_VXATTR;
5326 }
5327
5328 if (!layout.is_valid()) {
5329 dout(10) << "bad layout" << dendl;
f67539c2 5330 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5331 return;
5332 }
5333 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5334 dout(10) << " invalid data pool " << layout.pool_id << dendl;
f67539c2 5335 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5336 return;
5337 }
5338
5339 if (!check_access(mdr, cur, access))
5340 return;
5341
f67539c2
TL
5342 auto pi = cur->project_inode(mdr);
5343 pi.inode->layout = layout;
5344 pi.inode->version = cur->pre_dirty();
7c673cae
FG
5345
5346 // log + wait
5347 mdr->ls = mdlog->get_current_segment();
5348 EUpdate *le = new EUpdate(mdlog, "setlayout");
5349 mdlog->start_entry(le);
5350 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5351 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5352 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5353
b32b8144 5354 mdr->no_early_reply = true;
7c673cae
FG
5355 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5356}
5357
5358// XATTRS
5359
5360int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5361 file_layout_t *layout, bool validate)
5362{
5363 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
5364 try {
5365 if (name == "layout") {
5366 string::iterator begin = value.begin();
5367 string::iterator end = value.end();
5368 keys_and_values<string::iterator> p; // create instance of parser
5369 std::map<string, string> m; // map to receive results
5370 if (!qi::parse(begin, end, p, m)) { // returns true if successful
f67539c2 5371 return -CEPHFS_EINVAL;
7c673cae
FG
5372 }
5373 string left(begin, end);
5374 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5375 if (begin != end)
f67539c2 5376 return -CEPHFS_EINVAL;
7c673cae
FG
5377 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5378 // Skip validation on each attr, we do it once at the end (avoid
5379 // rejecting intermediate states if the overall result is ok)
5380 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
5381 osdmap, layout, false);
5382 if (r < 0)
5383 return r;
5384 }
5385 } else if (name == "layout.object_size") {
5386 layout->object_size = boost::lexical_cast<unsigned>(value);
5387 } else if (name == "layout.stripe_unit") {
5388 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5389 } else if (name == "layout.stripe_count") {
5390 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5391 } else if (name == "layout.pool") {
5392 try {
5393 layout->pool_id = boost::lexical_cast<unsigned>(value);
5394 } catch (boost::bad_lexical_cast const&) {
5395 int64_t pool = osdmap.lookup_pg_pool_name(value);
5396 if (pool < 0) {
5397 dout(10) << " unknown pool " << value << dendl;
f67539c2 5398 return -CEPHFS_ENOENT;
7c673cae
FG
5399 }
5400 layout->pool_id = pool;
5401 }
5402 } else if (name == "layout.pool_namespace") {
5403 layout->pool_ns = value;
5404 } else {
5405 dout(10) << " unknown layout vxattr " << name << dendl;
f67539c2 5406 return -CEPHFS_EINVAL;
7c673cae
FG
5407 }
5408 } catch (boost::bad_lexical_cast const&) {
5409 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
f67539c2 5410 return -CEPHFS_EINVAL;
7c673cae
FG
5411 }
5412
5413 if (validate && !layout->is_valid()) {
5414 dout(10) << "bad layout" << dendl;
f67539c2 5415 return -CEPHFS_EINVAL;
7c673cae
FG
5416 }
5417 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5418 dout(10) << " invalid data pool " << layout->pool_id << dendl;
f67539c2 5419 return -CEPHFS_EINVAL;
7c673cae
FG
5420 }
5421 return 0;
5422}
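
// Editor's aside: an illustrative sketch, not part of Server.cc. The
// composite "layout" branch above parses whitespace-separated key=value
// pairs with a boost::spirit grammar and then recurses per key; a simplified
// equivalent of the tokenizing step (assuming no quoting) is:
//
//   std::map<std::string, std::string> m;
//   std::istringstream in(value);   // e.g. "stripe_unit=1048576 pool=cephfs_data"
//   for (std::string tok; in >> tok; ) {
//     auto eq = tok.find('=');
//     if (eq == std::string::npos)
//       return -CEPHFS_EINVAL;      // malformed pair
//     m.emplace(tok.substr(0, eq), tok.substr(eq + 1));
//   }
//   // each entry is then re-parsed as "layout.<key>", validating only once
//   // at the end so intermediate states are not rejected.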
5423
5424int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5425{
5426 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5427 try {
5428 if (name == "quota") {
5429 string::iterator begin = value.begin();
5430 string::iterator end = value.end();
11fdf7f2
TL
5431 if (begin == end) {
5432 // keep quota unchanged. (for create_quota_realm())
5433 return 0;
5434 }
7c673cae
FG
5435 keys_and_values<string::iterator> p; // create instance of parser
5436 std::map<string, string> m; // map to receive results
5437 if (!qi::parse(begin, end, p, m)) { // returns true if successful
f67539c2 5438 return -CEPHFS_EINVAL;
7c673cae
FG
5439 }
5440 string left(begin, end);
5441 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5442 if (begin != end)
f67539c2 5443 return -CEPHFS_EINVAL;
7c673cae
FG
5444 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5445 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5446 if (r < 0)
5447 return r;
5448 }
5449 } else if (name == "quota.max_bytes") {
5450 int64_t q = boost::lexical_cast<int64_t>(value);
5451 if (q < 0)
f67539c2 5452 return -CEPHFS_EINVAL;
7c673cae
FG
5453 quota->max_bytes = q;
5454 } else if (name == "quota.max_files") {
5455 int64_t q = boost::lexical_cast<int64_t>(value);
5456 if (q < 0)
f67539c2 5457 return -CEPHFS_EINVAL;
7c673cae
FG
5458 quota->max_files = q;
5459 } else {
5460 dout(10) << " unknown quota vxattr " << name << dendl;
f67539c2 5461 return -CEPHFS_EINVAL;
7c673cae
FG
5462 }
5463 } catch (boost::bad_lexical_cast const&) {
5464 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
f67539c2 5465 return -CEPHFS_EINVAL;
7c673cae
FG
5466 }
5467
5468 if (!quota->is_valid()) {
5469 dout(10) << "bad quota" << dendl;
f67539c2 5470 return -CEPHFS_EINVAL;
7c673cae
FG
5471 }
5472 return 0;
5473}
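
// Editor's aside (worked example, not part of Server.cc): setting the
// composite vxattr "ceph.quota" to "max_bytes=10000000000 max_files=10000"
// recurses into quota.max_bytes and quota.max_files above; negative values
// are rejected with -CEPHFS_EINVAL, an empty value leaves the quota
// unchanged (the create_quota_realm() case), and a value of 0 clears the
// corresponding limit.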
5474
11fdf7f2
TL
5475void Server::create_quota_realm(CInode *in)
5476{
5477 dout(10) << __func__ << " " << *in << dendl;
5478
9f95a23c 5479 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
11fdf7f2
TL
5480 req->set_filepath(filepath(in->ino()));
5481 req->set_string2("ceph.quota");
5482 // empty vxattr value
5483 req->set_tid(mds->issue_tid());
5484
5485 mds->send_message_mds(req, in->authority().first);
5486}
5487
7c673cae
FG
5488/*
5489 * Verify that the file layout attribute carried by the client
5490 * is well-formatted.
5491 * Return 0 on success; otherwise this function takes
5492 * responsibility for the passed mdr.
5493 */
5494int Server::check_layout_vxattr(MDRequestRef& mdr,
5495 string name,
5496 string value,
5497 file_layout_t *layout)
5498{
9f95a23c 5499 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
5500 epoch_t epoch;
5501 int r;
5502
5503 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5504 r = parse_layout_vxattr(name, value, osdmap, layout);
5505 epoch = osdmap.get_epoch();
5506 });
5507
f67539c2 5508 if (r == -CEPHFS_ENOENT) {
7c673cae
FG
5509
5510 // we don't have the specified pool, make sure our map
5511 // is newer than or as new as the client.
5512 epoch_t req_epoch = req->get_osdmap_epoch();
5513
5514 if (req_epoch > epoch) {
5515
5516 // well, our map is older. consult mds.
f67539c2 5517 auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
7c673cae 5518
f67539c2
TL
5519 mds->objecter->wait_for_map(req_epoch, lambdafy(fin));
5520 return r;
7c673cae
FG
5521 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5522
5523 // For compatibility with clients running old code, we still need to get
5524 // the latest map. One day, once COMPACT_VERSION of MClientRequest >= 3,
5525 // we can remove this code.
5526 mdr->waited_for_osdmap = true;
f67539c2
TL
5527 mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5528 mds, new C_MDS_RetryRequest(mdcache, mdr))));
7c673cae
FG
5529 return r;
5530 }
5531 }
5532
5533 if (r < 0) {
5534
f67539c2
TL
5535 if (r == -CEPHFS_ENOENT)
5536 r = -CEPHFS_EINVAL;
7c673cae
FG
5537
5538 respond_to_request(mdr, r);
5539 return r;
5540 }
5541
5542 // all is well
5543 return 0;
5544}
5545
9f95a23c 5546void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
7c673cae 5547{
9f95a23c 5548 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
5549 string name(req->get_path2());
5550 bufferlist bl = req->get_data();
5551 string value (bl.c_str(), bl.length());
5552 dout(10) << "handle_set_vxattr " << name
5553 << " val " << value.length()
5554 << " bytes on " << *cur
5555 << dendl;
5556
94b18763 5557 CInode::mempool_inode *pip = nullptr;
7c673cae
FG
5558 string rest;
5559
5560 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5561 return;
5562 }
5563
adb31ebb 5564 bool adjust_realm = false;
7c673cae
FG
5565 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5566 if (!cur->is_dir()) {
f67539c2 5567 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5568 return;
5569 }
5570
9f95a23c
TL
5571 if (!xlock_policylock(mdr, cur, true))
5572 return;
5573
7c673cae
FG
5574 file_layout_t layout;
5575 if (cur->get_projected_inode()->has_layout())
5576 layout = cur->get_projected_inode()->layout;
9f95a23c
TL
5577 else if (mdr->dir_layout != file_layout_t())
5578 layout = mdr->dir_layout;
7c673cae
FG
5579 else
5580 layout = mdcache->default_file_layout;
5581
5582 rest = name.substr(name.find("layout"));
5583 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5584 return;
5585
f67539c2
TL
5586 auto pi = cur->project_inode(mdr);
5587 pi.inode->layout = layout;
b32b8144 5588 mdr->no_early_reply = true;
f67539c2 5589 pip = pi.inode.get();
7c673cae
FG
5590 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5591 if (!cur->is_file()) {
f67539c2 5592 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5593 return;
5594 }
5595 if (cur->get_projected_inode()->size ||
5596 cur->get_projected_inode()->truncate_seq > 1) {
f67539c2 5597 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
5598 return;
5599 }
5600 file_layout_t layout = cur->get_projected_inode()->layout;
5601 rest = name.substr(name.find("layout"));
5602 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5603 return;
5604
9f95a23c 5605 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5606 lov.add_xlock(&cur->filelock);
5607 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5608 return;
5609
f67539c2
TL
5610 auto pi = cur->project_inode(mdr);
5611 int64_t old_pool = pi.inode->layout.pool_id;
5612 pi.inode->add_old_pool(old_pool);
5613 pi.inode->layout = layout;
5614 pip = pi.inode.get();
7c673cae 5615 } else if (name.compare(0, 10, "ceph.quota") == 0) {
f67539c2
TL
5616 if (!cur->is_dir()) {
5617 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5618 return;
5619 }
5620
5621 quota_info_t quota = cur->get_projected_inode()->quota;
5622
5623 rest = name.substr(name.find("quota"));
5624 int r = parse_quota_vxattr(rest, value, &quota);
5625 if (r < 0) {
5626 respond_to_request(mdr, r);
5627 return;
5628 }
5629
9f95a23c 5630 if (quota.is_enable() && !cur->get_projected_srnode())
adb31ebb
TL
5631 adjust_realm = true;
5632
5633 if (!xlock_policylock(mdr, cur, false, adjust_realm))
5634 return;
11fdf7f2 5635
adb31ebb
TL
5636 if (cur->get_projected_inode()->quota == quota) {
5637 respond_to_request(mdr, 0);
7c673cae 5638 return;
adb31ebb 5639 }
7c673cae 5640
f67539c2
TL
5641 auto pi = cur->project_inode(mdr, false, adjust_realm);
5642 pi.inode->quota = quota;
94b18763 5643
adb31ebb
TL
5644 if (adjust_realm)
5645 pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
5646
b32b8144 5647 mdr->no_early_reply = true;
f67539c2 5648 pip = pi.inode.get();
28e407b8
AA
5649
5650 client_t exclude_ct = mdr->get_client();
a8e16298 5651 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
adb31ebb
TL
5652 } else if (name == "ceph.dir.subvolume"sv) {
5653 if (!cur->is_dir()) {
f67539c2 5654 respond_to_request(mdr, -CEPHFS_EINVAL);
adb31ebb
TL
5655 return;
5656 }
5657
5658 bool val;
5659 try {
5660 val = boost::lexical_cast<bool>(value);
5661 } catch (boost::bad_lexical_cast const&) {
5662 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
f67539c2 5663 respond_to_request(mdr, -CEPHFS_EINVAL);
adb31ebb
TL
5664 return;
5665 }
5666
5667 if (!xlock_policylock(mdr, cur, false, true))
5668 return;
5669
5670 SnapRealm *realm = cur->find_snaprealm();
5671 if (val) {
5672 inodeno_t subvol_ino = realm->get_subvolume_ino();
5673 // can't create subvolume inside another subvolume
5674 if (subvol_ino && subvol_ino != cur->ino()) {
f67539c2 5675 respond_to_request(mdr, -CEPHFS_EINVAL);
adb31ebb
TL
5676 return;
5677 }
5678 }
5679
5680 const auto srnode = cur->get_projected_srnode();
5681 if (val == (srnode && srnode->is_subvolume())) {
5682 respond_to_request(mdr, 0);
5683 return;
5684 }
5685
f67539c2 5686 auto pi = cur->project_inode(mdr, false, true);
adb31ebb
TL
5687 if (!srnode)
5688 pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
5689 if (val)
5690 pi.snapnode->mark_subvolume();
5691 else
5692 pi.snapnode->clear_subvolume();
5693
5694 mdr->no_early_reply = true;
f67539c2 5695 pip = pi.inode.get();
adb31ebb 5696 adjust_realm = true;
f6b5b4d7 5697 } else if (name == "ceph.dir.pin"sv) {
7c673cae 5698 if (!cur->is_dir() || cur->is_root()) {
f67539c2 5699 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5700 return;
5701 }
5702
5703 mds_rank_t rank;
5704 try {
5705 rank = boost::lexical_cast<mds_rank_t>(value);
5706 if (rank < 0) rank = MDS_RANK_NONE;
5707 } catch (boost::bad_lexical_cast const&) {
5708 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
f67539c2 5709 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5710 return;
5711 }
5712
9f95a23c 5713 if (!xlock_policylock(mdr, cur))
7c673cae
FG
5714 return;
5715
f67539c2 5716 auto pi = cur->project_inode(mdr);
7c673cae 5717 cur->set_export_pin(rank);
f67539c2 5718 pip = pi.inode.get();
f6b5b4d7
TL
5719 } else if (name == "ceph.dir.pin.random"sv) {
5720 if (!cur->is_dir() || cur->is_root()) {
f67539c2 5721 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5722 return;
5723 }
5724
5725 double val;
5726 try {
5727 val = boost::lexical_cast<double>(value);
5728 } catch (boost::bad_lexical_cast const&) {
5729 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
f67539c2 5730 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5731 return;
5732 }
5733
5734 if (val < 0.0 || 1.0 < val) {
f67539c2 5735 respond_to_request(mdr, -CEPHFS_EDOM);
f6b5b4d7
TL
5736 return;
5737 } else if (mdcache->export_ephemeral_random_max < val) {
f67539c2 5738 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5739 return;
5740 }
5741
5742 if (!xlock_policylock(mdr, cur))
5743 return;
5744
f67539c2 5745 auto pi = cur->project_inode(mdr);
f6b5b4d7 5746 cur->setxattr_ephemeral_rand(val);
f67539c2 5747 pip = pi.inode.get();
f6b5b4d7
TL
5748 } else if (name == "ceph.dir.pin.distributed"sv) {
5749 if (!cur->is_dir() || cur->is_root()) {
f67539c2 5750 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5751 return;
5752 }
5753
5754 bool val;
5755 try {
5756 val = boost::lexical_cast<bool>(value);
5757 } catch (boost::bad_lexical_cast const&) {
5758 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
f67539c2 5759 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5760 return;
5761 }
5762
5763 if (!xlock_policylock(mdr, cur))
5764 return;
5765
f67539c2 5766 auto pi = cur->project_inode(mdr);
f6b5b4d7 5767 cur->setxattr_ephemeral_dist(val);
f67539c2 5768 pip = pi.inode.get();
7c673cae
FG
5769 } else {
5770 dout(10) << " unknown vxattr " << name << dendl;
f67539c2 5771 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5772 return;
5773 }
5774
94b18763 5775 pip->change_attr++;
91327a77
AA
5776 pip->ctime = mdr->get_op_stamp();
5777 if (mdr->get_op_stamp() > pip->rstat.rctime)
5778 pip->rstat.rctime = mdr->get_op_stamp();
94b18763 5779 pip->version = cur->pre_dirty();
7c673cae 5780 if (cur->is_file())
94b18763 5781 pip->update_backtrace();
7c673cae
FG
5782
5783 // log + wait
5784 mdr->ls = mdlog->get_current_segment();
5785 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
5786 mdlog->start_entry(le);
5787 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5788 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5789 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5790
11fdf7f2 5791 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
adb31ebb 5792 false, false, adjust_realm));
7c673cae
FG
5793 return;
5794}
5795
9f95a23c 5796void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
7c673cae 5797{
9f95a23c 5798 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
5799 string name(req->get_path2());
5800
5801 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
5802
5803 if (name == "ceph.dir.layout") {
5804 if (!cur->is_dir()) {
f67539c2 5805 respond_to_request(mdr, -CEPHFS_ENODATA);
7c673cae
FG
5806 return;
5807 }
5808 if (cur->is_root()) {
5809 dout(10) << "can't remove layout policy on the root directory" << dendl;
f67539c2 5810 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5811 return;
5812 }
5813
5814 if (!cur->get_projected_inode()->has_layout()) {
f67539c2 5815 respond_to_request(mdr, -CEPHFS_ENODATA);
7c673cae
FG
5816 return;
5817 }
5818
9f95a23c 5819 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5820 lov.add_xlock(&cur->policylock);
5821 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5822 return;
5823
f67539c2
TL
5824 auto pi = cur->project_inode(mdr);
5825 pi.inode->clear_layout();
5826 pi.inode->version = cur->pre_dirty();
7c673cae
FG
5827
5828 // log + wait
5829 mdr->ls = mdlog->get_current_segment();
5830 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
5831 mdlog->start_entry(le);
5832 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5833 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5834 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5835
b32b8144 5836 mdr->no_early_reply = true;
7c673cae
FG
5837 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5838 return;
5839 } else if (name == "ceph.dir.layout.pool_namespace"
5840 || name == "ceph.file.layout.pool_namespace") {
5841 // Namespace is the only layout field that has a meaningful
5842 // null/none value (an empty string means the default layout). Removing
5843 // it is equivalent to a setxattr with an empty string: pass the empty
5844 // payload of the rmxattr request through to do this.
9f95a23c 5845 handle_set_vxattr(mdr, cur);
7c673cae
FG
5846 return;
5847 }
5848
f67539c2 5849 respond_to_request(mdr, -CEPHFS_ENODATA);
7c673cae
FG
5850}
5851
f67539c2
TL
5852const Server::XattrHandler Server::xattr_handlers[] = {
5853 {
5854 xattr_name: Server::DEFAULT_HANDLER,
5855 description: "default xattr handler",
5856 validate: &Server::default_xattr_validate,
5857 setxattr: &Server::default_setxattr_handler,
5858 removexattr: &Server::default_removexattr_handler,
5859 },
5860 {
5861 xattr_name: "ceph.mirror.info",
5862 description: "mirror info xattr handler",
5863 validate: &Server::mirror_info_xattr_validate,
5864 setxattr: &Server::mirror_info_setxattr_handler,
5865 removexattr: &Server::mirror_info_removexattr_handler
5866 },
5867};
7c673cae 5868
f67539c2
TL
5869const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
5870 const XattrHandler *default_xattr_handler = nullptr;
7c673cae 5871
f67539c2
TL
5872 for (auto &handler : xattr_handlers) {
5873 if (handler.xattr_name == Server::DEFAULT_HANDLER) {
5874 ceph_assert(default_xattr_handler == nullptr);
5875 default_xattr_handler = &handler;
5876 }
5877 if (handler.xattr_name == xattr_name) {
5878 dout(20) << "handler=" << handler.description << dendl;
5879 return &handler;
5880 }
5881 }
7c673cae 5882
f67539c2
TL
5883 ceph_assert(default_xattr_handler != nullptr);
5884 dout(20) << "handler=" << default_xattr_handler->description << dendl;
5885 return default_xattr_handler;
5886}
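
// Editor's aside: an illustrative usage sketch, not part of Server.cc. The
// handlers resolved above are member-function pointers, invoked through
// std::invoke in handle_client_setxattr()/handle_client_removexattr():
//
//   auto handler = get_xattr_or_default_handler(name);
//   XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
//   int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
//   if (r == 0)
//     std::invoke(handler->setxattr, this, cur, xattrs, xattr_op);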
7c673cae 5887
f67539c2
TL
5888int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
5889 const std::string &xattr_name, int op, int flags) {
5890 if (op == CEPH_MDS_OP_SETXATTR) {
5891 if (xattrs) {
5892 if ((flags & CEPH_XATTR_CREATE) && xattrs->count(mempool::mds_co::string(xattr_name))) {
5893 dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
5894 return -CEPHFS_EEXIST;
5895 }
5896 }
5897 if ((flags & CEPH_XATTR_REPLACE) && !(xattrs && xattrs->count(mempool::mds_co::string(xattr_name)))) {
5898 dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
5899 return -CEPHFS_ENODATA;
5900 }
5901
5902 return 0;
7c673cae 5903 }
f67539c2
TL
5904
5905 if (op == CEPH_MDS_OP_RMXATTR) {
5906 if (!xattrs || xattrs->count(mempool::mds_co::string(xattr_name)) == 0) {
5907 dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
5908 return -CEPHFS_ENODATA;
5909 }
5910
5911 return 0;
5912 }
5913
5914 derr << ": unhandled validation for: " << xattr_name << dendl;
5915 return -CEPHFS_EINVAL;
5916}
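
// Editor's aside: worked examples of the rules above (not part of Server.cc):
//  - setxattr "user.a" with CEPH_XATTR_CREATE while "user.a" exists
//      -> -CEPHFS_EEXIST
//  - setxattr "user.a" with CEPH_XATTR_REPLACE while "user.a" is absent
//      -> -CEPHFS_ENODATA
//  - rmxattr "user.a" while "user.a" is absent -> -CEPHFS_ENODATA
//  - any other combination -> 0 (allowed)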
5917
5918void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
5919 const bufferlist &xattr_value) {
5920 size_t len = xattr_value.length();
5921 bufferptr b = buffer::create(len);
5922 if (len) {
5923 xattr_value.begin().copy(len, b.c_str());
5924 }
5925 auto em = xattrs->emplace(std::piecewise_construct,
5926 std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
5927 std::forward_as_tuple(b));
5928 if (!em.second) {
5929 em.first->second = b;
5930 }
5931}
5932
5933void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
5934 xattrs->erase(mempool::mds_co::string(xattr_name));
5935}
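
// Editor's aside: an illustrative sketch, not part of Server.cc, of the
// emplace-or-assign semantics above; how the map is obtained is left as an
// assumption here.
//
//   InodeStoreBase::xattr_map_ptr xattrs = ...;  // projected xattr map
//   bufferlist val;
//   val.append("bar");
//   xattr_set(xattrs, "user.foo", val);  // inserts "user.foo" -> "bar"
//   xattr_set(xattrs, "user.foo", val);  // key exists: value is overwritten
//   xattr_rm(xattrs, "user.foo");        // erases; absent keys are a no-op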
5936
5937int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
5938 XattrOp *xattr_op) {
5939 return xattr_validate(cur, xattrs, xattr_op->xattr_name, xattr_op->op, xattr_op->flags);
5940}
5941
5942void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
5943 const XattrOp &xattr_op) {
5944 xattr_set(xattrs, xattr_op.xattr_name, xattr_op.xattr_value);
5945}
5946
5947void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
5948 const XattrOp &xattr_op) {
5949 xattr_rm(xattrs, xattr_op.xattr_name);
5950}
5951
5952// mirror info xattr handlers
5953const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX = "^cluster_id=([a-f0-9]{8}-" \
5954 "[a-f0-9]{4}-[a-f0-9]{4}-" \
5955 "[a-f0-9]{4}-[a-f0-9]{12})" \
5956 " fs_id=(\\d+)$";
5957const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
5958const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
5959int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
5960 std::string &cluster_id, std::string &fs_id) {
5961 dout(20) << "parsing name=" << name << ", value=" << value << dendl;
5962
5963 static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
5964 std::smatch match;
5965
5966 std::regex_search(value, match, regex);
5967 if (match.size() != 3) {
5968 derr << "mirror info parse error" << dendl;
5969 return -CEPHFS_EINVAL;
5970 }
5971
5972 cluster_id = match[1];
5973 fs_id = match[2];
5974 dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
5975 return 0;
5976}
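
// Editor's aside: a worked example of the regex above (not part of
// Server.cc). The value
//   "cluster_id=00000000-1111-2222-3333-444444444444 fs_id=7"
// produces match[1] = "00000000-1111-2222-3333-444444444444" (cluster_id)
// and match[2] = "7" (fs_id); any other shape leaves match.size() != 3 and
// fails with -CEPHFS_EINVAL.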
5977
5978int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
5979 XattrOp *xattr_op) {
5980 if (!cur->is_root()) {
5981 return -CEPHFS_EINVAL;
5982 }
5983
5984 int v1 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, xattr_op->op, xattr_op->flags);
5985 int v2 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, xattr_op->op, xattr_op->flags);
5986 if (v1 != v2) {
5987 derr << "inconsistent mirror info state (" << v1 << "," << v2 << ")" << dendl;
5988 return -CEPHFS_EINVAL;
5989 }
5990
5991 if (v1 < 0) {
5992 return v1;
5993 }
5994
5995 if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
5996 return 0;
5997 }
5998
5999 std::string cluster_id;
6000 std::string fs_id;
6001 int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
6002 cluster_id, fs_id);
6003 if (r < 0) {
6004 return r;
6005 }
6006
6007 xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
6008 return 0;
6009}
6010
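// The mirror info value is persisted as two separate internal xattrs
// (ceph.mirror.info.cluster_id and ceph.mirror.info.fs_id); the set and
// remove handlers below therefore always update both entries together.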
6011void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6012 const XattrOp &xattr_op) {
6013 auto mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
6014
6015 bufferlist bl;
6016 bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
6017 xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
6018
6019 bl.clear();
6020 bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
6021 xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
6022}
6023
6024void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6025 const XattrOp &xattr_op) {
6026 xattr_rm(xattrs, Server::MirrorXattrInfo::CLUSTER_ID);
6027 xattr_rm(xattrs, Server::MirrorXattrInfo::FS_ID);
6028}
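// Client-side illustration (hypothetical invocation; this xattr is normally
// managed by the cephfs mirroring tooling rather than set by hand):
//   setfattr -n ceph.mirror.info -v "cluster_id=<uuid> fs_id=<id>" <fs-root>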
7c673cae
FG
6029
6030void Server::handle_client_setxattr(MDRequestRef& mdr)
6031{
9f95a23c 6032 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 6033 string name(req->get_path2());
7c673cae 6034
f67539c2
TL
6035 // is a ceph virtual xattr?
6036 if (is_ceph_vxattr(name)) {
9f95a23c
TL
6037 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6038 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6039 if (!cur)
6040 return;
6041
6042 handle_set_vxattr(mdr, cur);
6043 return;
6044 }
6045
f67539c2
TL
6046 if (!is_allowed_ceph_xattr(name)) {
6047 respond_to_request(mdr, -CEPHFS_EINVAL);
6048 return;
6049 }
6050
9f95a23c 6051 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
6052 if (!cur)
6053 return;
6054
6055 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 6056 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
6057 return;
6058 }
6059
6060 int flags = req->head.args.setxattr.flags;
6061
9f95a23c 6062 MutationImpl::LockOpVec lov;
11fdf7f2
TL
6063 lov.add_xlock(&cur->xattrlock);
6064 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
6065 return;
6066
6067 if (!check_access(mdr, cur, MAY_WRITE))
6068 return;
6069
7c673cae
FG
6070 size_t len = req->get_data().length();
6071 size_t inc = len + name.length();
6072
f67539c2
TL
6073 auto handler = Server::get_xattr_or_default_handler(name);
6074 const auto& pxattrs = cur->get_projected_xattrs();
6075 if (pxattrs) {
6076 // check xattrs kv pairs size
6077 size_t cur_xattrs_size = 0;
6078 for (const auto& p : *pxattrs) {
6079 if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
6080 continue;
6081 }
6082 cur_xattrs_size += p.first.length() + p.second.length();
7c673cae 6083 }
7c673cae 6084
f67539c2
TL
6085 if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
6086 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6087 << cur_xattrs_size << ", inc " << inc << dendl;
6088 respond_to_request(mdr, -CEPHFS_ENOSPC);
6089 return;
6090 }
7c673cae
FG
6091 }
6092
f67539c2
TL
6093 XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
6094 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6095 if (r < 0) {
6096 respond_to_request(mdr, r);
7c673cae
FG
6097 return;
6098 }
6099
6100 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
6101
6102 // project update
f67539c2
TL
6103 auto pi = cur->project_inode(mdr, true);
6104 pi.inode->version = cur->pre_dirty();
6105 pi.inode->ctime = mdr->get_op_stamp();
6106 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6107 pi.inode->rstat.rctime = mdr->get_op_stamp();
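  // storing the fscrypt context xattr flags the inode as encrypted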
6108 if (name == "encryption.ctx"sv)
6109 pi.inode->fscrypt = true;
6110 pi.inode->change_attr++;
6111 pi.inode->xattr_version++;
6112
94b18763 6113 if ((flags & CEPH_XATTR_REMOVE)) {
f67539c2 6114 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
94b18763 6115 } else {
f67539c2 6116 std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
7c673cae
FG
6117 }
6118
6119 // log + wait
6120 mdr->ls = mdlog->get_current_segment();
6121 EUpdate *le = new EUpdate(mdlog, "setxattr");
6122 mdlog->start_entry(le);
6123 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6124 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6125 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6126
6127 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6128}
6129
6130void Server::handle_client_removexattr(MDRequestRef& mdr)
6131{
9f95a23c 6132 const cref_t<MClientRequest> &req = mdr->client_request;
94b18763 6133 std::string name(req->get_path2());
11fdf7f2 6134
f67539c2
TL
6135 // is a ceph virtual xattr?
6136 if (is_ceph_vxattr(name)) {
9f95a23c
TL
6137 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6138 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6139 if (!cur)
6140 return;
6141
6142 handle_remove_vxattr(mdr, cur);
6143 return;
6144 }
6145
f67539c2
TL
6146 if (!is_allowed_ceph_xattr(name)) {
6147 respond_to_request(mdr, -CEPHFS_EINVAL);
6148 return;
6149 }
6150
9f95a23c 6151 CInode* cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
6152 if (!cur)
6153 return;
6154
6155 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 6156 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
6157 return;
6158 }
6159
9f95a23c 6160 MutationImpl::LockOpVec lov;
11fdf7f2
TL
6161 lov.add_xlock(&cur->xattrlock);
6162 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
6163 return;
6164
f67539c2
TL
6165
6166 auto handler = Server::get_xattr_or_default_handler(name);
6167 bufferlist bl;
6168 XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
6169
6170 const auto& pxattrs = cur->get_projected_xattrs();
6171 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6172 if (r < 0) {
6173 respond_to_request(mdr, r);
7c673cae
FG
6174 return;
6175 }
6176
6177 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
6178
6179 // project update
f67539c2
TL
6180 auto pi = cur->project_inode(mdr, true);
6181 pi.inode->version = cur->pre_dirty();
6182 pi.inode->ctime = mdr->get_op_stamp();
6183 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6184 pi.inode->rstat.rctime = mdr->get_op_stamp();
6185 pi.inode->change_attr++;
6186 pi.inode->xattr_version++;
6187 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
7c673cae
FG
6188
6189 // log + wait
6190 mdr->ls = mdlog->get_current_segment();
6191 EUpdate *le = new EUpdate(mdlog, "removexattr");
6192 mdlog->start_entry(le);
6193 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6194 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6195 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6196
6197 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6198}
6199
6200
6201// =================================================================
6202// DIRECTORY and NAMESPACE OPS
6203
6204
6205// ------------------------------------------------
6206
6207// MKNOD
6208
6209class C_MDS_mknod_finish : public ServerLogContext {
6210 CDentry *dn;
6211 CInode *newi;
6212public:
6213 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
6214 ServerLogContext(s, r), dn(d), newi(ni) {}
6215 void finish(int r) override {
11fdf7f2 6216 ceph_assert(r == 0);
7c673cae
FG
6217
6218 // link the inode
6219 dn->pop_projected_linkage();
6220
6221 // be a bit hacky with the inode version here: we decrement it
6222 // just to keep mark_dirty() happy. (we didn't bother projecting
6223 // a new version of the inode since it's just been created)
f67539c2 6224 newi->mark_dirty(mdr->ls);
28e407b8 6225 newi->mark_dirty_parent(mdr->ls, true);
7c673cae
FG
6226
6227 // mkdir?
f67539c2 6228 if (newi->is_dir()) {
7c673cae 6229 CDir *dir = newi->get_dirfrag(frag_t());
11fdf7f2 6230 ceph_assert(dir);
f67539c2 6231 dir->mark_dirty(mdr->ls);
7c673cae
FG
6232 dir->mark_new(mdr->ls);
6233 }
6234
6235 mdr->apply();
6236
6237 MDRequestRef null_ref;
6238 get_mds()->mdcache->send_dentry_link(dn, null_ref);
6239
f67539c2 6240 if (newi->is_file()) {
7c673cae 6241 get_mds()->locker->share_inode_max_size(newi);
f67539c2 6242 } else if (newi->is_dir()) {
f6b5b4d7 6243 // We do this now so that the linkages on the new directory are stable.
f67539c2 6244 newi->maybe_ephemeral_rand();
f6b5b4d7 6245 }
7c673cae
FG
6246
6247 // hit pop
11fdf7f2 6248 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
7c673cae
FG
6249
6250 // reply
6251 server->respond_to_request(mdr, 0);
6252 }
6253};
6254
6255
6256void Server::handle_client_mknod(MDRequestRef& mdr)
6257{
9f95a23c 6258 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 6259 client_t client = mdr->get_client();
9f95a23c
TL
6260
6261 unsigned mode = req->head.args.mknod.mode;
6262 if ((mode & S_IFMT) == 0)
6263 mode |= S_IFREG;
6264
6265 mdr->disable_lock_cache();
6266 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
6267 if (!dn)
7c673cae
FG
6268 return;
6269
9f95a23c
TL
6270 CDir *dir = dn->get_dir();
6271 CInode *diri = dir->get_inode();
7c673cae
FG
6272 if (!check_access(mdr, diri, MAY_WRITE))
6273 return;
7c673cae
FG
6274 if (!check_fragment_space(mdr, dn->get_dir()))
6275 return;
6276
f67539c2
TL
6277 ceph_assert(dn->get_projected_linkage()->is_null());
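  // alternate_name is an opaque, client-supplied alternate form of the dentry
  // name (e.g. for encrypted names); the MDS only bounds and stores it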
6278 if (req->get_alternate_name().size() > alternate_name_max) {
6279 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6280 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6281 return;
6282 }
6283 dn->set_alternate_name(req->get_alternate_name());
6284
7c673cae
FG
6285 // set layout
6286 file_layout_t layout;
9f95a23c
TL
6287 if (mdr->dir_layout != file_layout_t())
6288 layout = mdr->dir_layout;
7c673cae
FG
6289 else
6290 layout = mdcache->default_file_layout;
6291
11fdf7f2
TL
6292 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6293 ceph_assert(newi);
7c673cae
FG
6294
6295 dn->push_projected_linkage(newi);
6296
f67539c2
TL
6297 auto _inode = newi->_get_inode();
6298 _inode->version = dn->pre_dirty();
6299 _inode->rdev = req->head.args.mknod.rdev;
6300 _inode->rstat.rfiles = 1;
6301 _inode->accounted_rstat = _inode->rstat;
7c673cae 6302 if (layout.pool_id != mdcache->default_file_layout.pool_id)
f67539c2
TL
6303 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
6304 _inode->update_backtrace();
7c673cae 6305
11fdf7f2
TL
6306 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6307 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6308 ceph_assert(follows >= realm->get_newest_seq());
6309
7c673cae
FG
6310 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6311 // want to write to it (e.g., if they are reexporting NFS)
f67539c2 6312 if (S_ISREG(_inode->mode)) {
7c673cae
FG
6313 // issue a cap on the file
6314 int cmode = CEPH_FILE_MODE_RDWR;
9f95a23c 6315 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
7c673cae
FG
6316 if (cap) {
6317 cap->set_wanted(0);
6318
6319 // put locks in excl mode
6320 newi->filelock.set_state(LOCK_EXCL);
6321 newi->authlock.set_state(LOCK_EXCL);
6322 newi->xattrlock.set_state(LOCK_EXCL);
a8e16298
TL
6323
6324 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
f67539c2
TL
6325 _inode->client_ranges[client].range.first = 0;
6326 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
6327 _inode->client_ranges[client].follows = follows;
f91f0fd5 6328 newi->mark_clientwriteable();
a8e16298 6329 cap->mark_clientwriteable();
7c673cae
FG
6330 }
6331 }
6332
11fdf7f2 6333 ceph_assert(dn->first == follows + 1);
7c673cae
FG
6334 newi->first = dn->first;
6335
f67539c2 6336 dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
7c673cae
FG
6337
6338 // prepare finisher
6339 mdr->ls = mdlog->get_current_segment();
6340 EUpdate *le = new EUpdate(mdlog, "mknod");
6341 mdlog->start_entry(le);
6342 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6343 journal_allocated_inos(mdr, &le->metablob);
6344
6345 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6346 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6347 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6348
6349 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
92f5a8d4 6350 mds->balancer->maybe_fragment(dn->get_dir(), false);
7c673cae
FG
6351}
6352
6353
6354
6355// MKDIR
6356/* This function takes responsibility for the passed mdr*/
6357void Server::handle_client_mkdir(MDRequestRef& mdr)
6358{
9f95a23c 6359 const cref_t<MClientRequest> &req = mdr->client_request;
91327a77 6360
9f95a23c
TL
6361 mdr->disable_lock_cache();
6362 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6363 if (!dn)
7c673cae 6364 return;
9f95a23c 6365
7c673cae
FG
6366 CDir *dir = dn->get_dir();
6367 CInode *diri = dir->get_inode();
7c673cae
FG
6368
6369 // mkdir check access
6370 if (!check_access(mdr, diri, MAY_WRITE))
6371 return;
6372
6373 if (!check_fragment_space(mdr, dir))
6374 return;
6375
f67539c2
TL
6376 ceph_assert(dn->get_projected_linkage()->is_null());
6377 if (req->get_alternate_name().size() > alternate_name_max) {
6378 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6379 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6380 return;
6381 }
6382 dn->set_alternate_name(req->get_alternate_name());
6383
7c673cae 6384 // new inode
7c673cae
FG
6385 unsigned mode = req->head.args.mkdir.mode;
6386 mode &= ~S_IFMT;
6387 mode |= S_IFDIR;
9f95a23c 6388 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
11fdf7f2 6389 ceph_assert(newi);
7c673cae
FG
6390
6391 // it's a directory.
6392 dn->push_projected_linkage(newi);
6393
f67539c2
TL
6394 auto _inode = newi->_get_inode();
6395 _inode->version = dn->pre_dirty();
6396 _inode->rstat.rsubdirs = 1;
6397 _inode->accounted_rstat = _inode->rstat;
6398 _inode->update_backtrace();
7c673cae 6399
11fdf7f2
TL
6400 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6401 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6402 ceph_assert(follows >= realm->get_newest_seq());
6403
7c673cae 6404 dout(12) << " follows " << follows << dendl;
11fdf7f2 6405 ceph_assert(dn->first == follows + 1);
7c673cae
FG
6406 newi->first = dn->first;
6407
6408 // ...and that new dir is empty.
6409 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
6410 newdir->state_set(CDir::STATE_CREATING);
6411 newdir->mark_complete();
f67539c2 6412 newdir->_get_fnode()->version = newdir->pre_dirty();
7c673cae
FG
6413
6414 // prepare finisher
6415 mdr->ls = mdlog->get_current_segment();
6416 EUpdate *le = new EUpdate(mdlog, "mkdir");
6417 mdlog->start_entry(le);
6418 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6419 journal_allocated_inos(mdr, &le->metablob);
6420 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6421 le->metablob.add_primary_dentry(dn, newi, true, true);
6422 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
6423
6424 // issue a cap on the directory
6425 int cmode = CEPH_FILE_MODE_RDWR;
9f95a23c 6426 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
7c673cae
FG
6427 if (cap) {
6428 cap->set_wanted(0);
6429
6430 // put locks in excl mode
6431 newi->filelock.set_state(LOCK_EXCL);
6432 newi->authlock.set_state(LOCK_EXCL);
6433 newi->xattrlock.set_state(LOCK_EXCL);
6434 }
6435
6436 // make sure this inode gets into the journal
6437 le->metablob.add_opened_ino(newi->ino());
7c673cae
FG
6438
6439 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
81eedcae
TL
6440
6441 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6442 // have overshot the split size (multiple mkdir in flight), so here is
6443 // an early chance to split the dir if this mkdir makes it oversized.
6444 mds->balancer->maybe_fragment(dir, false);
7c673cae
FG
6445}
6446
6447
6448// SYMLINK
6449
6450void Server::handle_client_symlink(MDRequestRef& mdr)
6451{
f67539c2
TL
6452 const auto& req = mdr->client_request;
6453
9f95a23c
TL
6454 mdr->disable_lock_cache();
6455 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6456 if (!dn)
7c673cae 6457 return;
9f95a23c 6458
7c673cae
FG
6459 CDir *dir = dn->get_dir();
6460 CInode *diri = dir->get_inode();
7c673cae
FG
6461
6462 if (!check_access(mdr, diri, MAY_WRITE))
9f95a23c 6463 return;
7c673cae
FG
6464 if (!check_fragment_space(mdr, dir))
6465 return;
6466
f67539c2
TL
6467 ceph_assert(dn->get_projected_linkage()->is_null());
6468 if (req->get_alternate_name().size() > alternate_name_max) {
6469 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6470 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
    return;
6471 }
6472 dn->set_alternate_name(req->get_alternate_name());
9f95a23c 6473
7c673cae 6474 unsigned mode = S_IFLNK | 0777;
9f95a23c 6475 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
11fdf7f2 6476 ceph_assert(newi);
7c673cae
FG
6477
6478 // it's a symlink
6479 dn->push_projected_linkage(newi);
6480
11fdf7f2 6481 newi->symlink = req->get_path2();
f67539c2
TL
6482 auto _inode = newi->_get_inode();
6483 _inode->version = dn->pre_dirty();
6484 _inode->size = newi->symlink.length();
6485 _inode->rstat.rbytes = _inode->size;
6486 _inode->rstat.rfiles = 1;
6487 _inode->accounted_rstat = _inode->rstat;
6488 _inode->update_backtrace();
7c673cae
FG
6489
6490 newi->first = dn->first;
6491
6492 // prepare finisher
6493 mdr->ls = mdlog->get_current_segment();
6494 EUpdate *le = new EUpdate(mdlog, "symlink");
6495 mdlog->start_entry(le);
6496 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6497 journal_allocated_inos(mdr, &le->metablob);
6498 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6499 le->metablob.add_primary_dentry(dn, newi, true, true);
6500
6501 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
92f5a8d4 6502 mds->balancer->maybe_fragment(dir, false);
7c673cae
FG
6503}
6504
6505
6506
6507
6508
6509// LINK
6510
6511void Server::handle_client_link(MDRequestRef& mdr)
6512{
9f95a23c 6513 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
6514
6515 dout(7) << "handle_client_link " << req->get_filepath()
6516 << " to " << req->get_filepath2()
6517 << dendl;
6518
9f95a23c 6519 mdr->disable_lock_cache();
7c673cae 6520
9f95a23c
TL
6521 CDentry *destdn;
6522 CInode *targeti;
6523
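  // a filepath2 with no dentry components names the link target purely by
  // inode number; otherwise resolve both paths and xlock the destination dentry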
6524 if (req->get_filepath2().depth() == 0) {
6525 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
6526 if (!targeti) {
f67539c2 6527 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
9f95a23c
TL
6528 mdcache->find_ino_peers(req->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr));
6529 return;
6530 }
6531 mdr->pin(targeti);
6532
6533 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
6534 CDentry *pdn = targeti->get_projected_parent_dn();
6535 if (!pdn) {
6536 dout(7) << "target has no parent dn, failing..." << dendl;
f67539c2 6537 respond_to_request(mdr, -CEPHFS_EINVAL);
9f95a23c
TL
6538 return;
6539 }
6540 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
6541 return;
6542 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
6543 }
6544
6545 destdn = rdlock_path_xlock_dentry(mdr, false);
6546 if (!destdn)
6547 return;
9f95a23c
TL
6548 } else {
6549 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
6550 destdn = ret.first;
6551 if (!destdn)
6552 return;
6553
6554 if (!destdn->get_projected_linkage()->is_null()) {
f67539c2 6555 respond_to_request(mdr, -CEPHFS_EEXIST);
9f95a23c
TL
6556 return;
6557 }
6558
6559 targeti = ret.second->get_projected_linkage()->get_inode();
6560 }
6561
f67539c2
TL
6562 ceph_assert(destdn->get_projected_linkage()->is_null());
6563 if (req->get_alternate_name().size() > alternate_name_max) {
6564 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6565 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6566 return;
6567 }
6568 destdn->set_alternate_name(req->get_alternate_name());
6569
9f95a23c
TL
6570 if (targeti->is_dir()) {
6571 dout(7) << "target is a dir, failing..." << dendl;
f67539c2 6572 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
6573 return;
6574 }
6575
9f95a23c
TL
6576 CDir *dir = destdn->get_dir();
6577 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
7c673cae 6578 dout(7) << "target is " << *targeti << dendl;
9f95a23c
TL
6579
6580 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6581 MutationImpl::LockOpVec lov;
6582 lov.add_xlock(&targeti->snaplock);
6583 lov.add_xlock(&targeti->linklock);
6584
6585 if (!mds->locker->acquire_locks(mdr, lov))
181888fb 6586 return;
7c673cae 6587
9f95a23c
TL
6588 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6589 }
7c673cae 6590
9f95a23c
TL
6591 if (targeti->get_projected_inode()->nlink == 0) {
6592 dout(7) << "target has no link, failing..." << dendl;
f67539c2 6593 respond_to_request(mdr, -CEPHFS_ENOENT);
9f95a23c 6594 }
7c673cae
FG
6595
6596 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6597 if (!check_access(mdr, targeti, MAY_WRITE))
6598 return;
6599
6600 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
6601 return;
6602
6603 if (!check_fragment_space(mdr, dir))
6604 return;
6605 }
6606
adb31ebb
TL
6607 CInode* target_pin = targeti->get_projected_parent_dir()->inode;
6608 SnapRealm *target_realm = target_pin->find_snaprealm();
6609 if (target_pin != dir->inode &&
6610 target_realm->get_subvolume_ino() !=
6611 dir->inode->find_snaprealm()->get_subvolume_ino()) {
6612 dout(7) << "target is in different subvolume, failing..." << dendl;
f67539c2 6613 respond_to_request(mdr, -CEPHFS_EXDEV);
adb31ebb
TL
6614 return;
6615 }
6616
7c673cae 6617 // go!
11fdf7f2 6618 ceph_assert(g_conf()->mds_kill_link_at != 1);
7c673cae
FG
6619
6620 // local or remote?
6621 if (targeti->is_auth())
adb31ebb 6622 _link_local(mdr, destdn, targeti, target_realm);
7c673cae 6623 else
9f95a23c 6624 _link_remote(mdr, true, destdn, targeti);
92f5a8d4 6625 mds->balancer->maybe_fragment(dir, false);
7c673cae
FG
6626}
6627
6628
6629class C_MDS_link_local_finish : public ServerLogContext {
6630 CDentry *dn;
6631 CInode *targeti;
6632 version_t dnpv;
6633 version_t tipv;
11fdf7f2 6634 bool adjust_realm;
7c673cae
FG
6635public:
6636 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
11fdf7f2 6637 version_t dnpv_, version_t tipv_, bool ar) :
7c673cae 6638 ServerLogContext(s, r), dn(d), targeti(ti),
11fdf7f2 6639 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
7c673cae 6640 void finish(int r) override {
11fdf7f2
TL
6641 ceph_assert(r == 0);
6642 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
7c673cae
FG
6643 }
6644};
6645
6646
adb31ebb 6647void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
7c673cae
FG
6648{
6649 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
6650
6651 mdr->ls = mdlog->get_current_segment();
6652
6653 // predirty NEW dentry
6654 version_t dnpv = dn->pre_dirty();
6655 version_t tipv = targeti->pre_dirty();
6656
6657 // project inode update
f67539c2
TL
6658 auto pi = targeti->project_inode(mdr);
6659 pi.inode->nlink++;
6660 pi.inode->ctime = mdr->get_op_stamp();
6661 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6662 pi.inode->rstat.rctime = mdr->get_op_stamp();
6663 pi.inode->change_attr++;
6664 pi.inode->version = tipv;
7c673cae 6665
11fdf7f2 6666 bool adjust_realm = false;
adb31ebb 6667 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
11fdf7f2
TL
6668 sr_t *newsnap = targeti->project_snaprealm();
6669 targeti->mark_snaprealm_global(newsnap);
adb31ebb 6670 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
11fdf7f2
TL
6671 adjust_realm = true;
6672 }
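  // (a hard-linked inode can have parents in different snap realms; marking
  // its snaprealm "global" makes snapshot inheritance follow the recorded
  // parent dentry rather than whichever path the inode is reached by)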
6673
7c673cae
FG
6674 // log + wait
6675 EUpdate *le = new EUpdate(mdlog, "link_local");
6676 mdlog->start_entry(le);
6677 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6678 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
6679 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
6680 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6681 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
6682
6683 // do this after predirty_*, to avoid funky extra dnl arg
6684 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6685
11fdf7f2
TL
6686 journal_and_reply(mdr, targeti, dn, le,
6687 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
7c673cae
FG
6688}
6689
6690void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
11fdf7f2 6691 version_t dnpv, version_t tipv, bool adjust_realm)
7c673cae
FG
6692{
6693 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
6694
6695 // link and unlock the NEW dentry
31f18b77
FG
6696 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6697 if (!dnl->get_inode())
6698 dn->link_remote(dnl, targeti);
7c673cae
FG
6699 dn->mark_dirty(dnpv, mdr->ls);
6700
6701 // target inode
7c673cae
FG
6702 mdr->apply();
6703
6704 MDRequestRef null_ref;
6705 mdcache->send_dentry_link(dn, null_ref);
6706
11fdf7f2
TL
6707 if (adjust_realm) {
6708 int op = CEPH_SNAP_OP_SPLIT;
6709 mds->mdcache->send_snap_update(targeti, 0, op);
6710 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6711 }
6712
7c673cae 6713 // bump target popularity
11fdf7f2
TL
6714 mds->balancer->hit_inode(targeti, META_POP_IWR);
6715 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7c673cae
FG
6716
6717 // reply
6718 respond_to_request(mdr, 0);
6719}
6720
6721
6722// link / unlink remote
6723
6724class C_MDS_link_remote_finish : public ServerLogContext {
6725 bool inc;
6726 CDentry *dn;
6727 CInode *targeti;
6728 version_t dpv;
6729public:
6730 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
6731 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
6732 dpv(d->get_projected_version()) {}
6733 void finish(int r) override {
11fdf7f2 6734 ceph_assert(r == 0);
7c673cae
FG
6735 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
6736 }
6737};
6738
6739void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
6740{
6741 dout(10) << "_link_remote "
6742 << (inc ? "link ":"unlink ")
6743 << *dn << " to " << *targeti << dendl;
6744
6745 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6746 mds_rank_t linkauth = targeti->authority().first;
6747 if (mdr->more()->witnessed.count(linkauth) == 0) {
6748 if (mds->is_cluster_degraded() &&
6749 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
6750 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
f67539c2 6751 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
6752 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
6753 return;
6754 }
6755
6756 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
6757 int op;
6758 if (inc)
f67539c2 6759 op = MMDSPeerRequest::OP_LINKPREP;
7c673cae 6760 else
f67539c2
TL
6761 op = MMDSPeerRequest::OP_UNLINKPREP;
6762 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, op);
7c673cae
FG
6763 targeti->set_object_info(req->get_object_info());
6764 req->op_stamp = mdr->get_op_stamp();
11fdf7f2
TL
6765 if (auto& desti_srnode = mdr->more()->desti_srnode)
6766 encode(*desti_srnode, req->desti_snapbl);
7c673cae
FG
6767 mds->send_message_mds(req, linkauth);
6768
f67539c2
TL
6769 ceph_assert(mdr->more()->waiting_on_peer.count(linkauth) == 0);
6770 mdr->more()->waiting_on_peer.insert(linkauth);
7c673cae
FG
6771 return;
6772 }
6773 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
6774
11fdf7f2
TL
6775 ceph_assert(g_conf()->mds_kill_link_at != 2);
6776
6777 if (auto& desti_srnode = mdr->more()->desti_srnode) {
6778 delete desti_srnode;
6779 desti_srnode = NULL;
6780 }
7c673cae
FG
6781
6782 mdr->set_mds_stamp(ceph_clock_now());
6783
6784 // add to event
6785 mdr->ls = mdlog->get_current_segment();
6786 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
6787 mdlog->start_entry(le);
6788 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6789 if (!mdr->more()->witnessed.empty()) {
f67539c2 6790 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7c673cae 6791 le->reqid = mdr->reqid;
f67539c2
TL
6792 le->had_peers = true;
6793 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7c673cae
FG
6794 }
6795
6796 if (inc) {
6797 dn->pre_dirty();
6798 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
6799 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6800 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6801 } else {
6802 dn->pre_dirty();
6803 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
6804 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6805 le->metablob.add_null_dentry(dn, true);
31f18b77 6806 dn->push_projected_linkage();
7c673cae
FG
6807 }
6808
9f95a23c
TL
6809 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
6810 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
7c673cae
FG
6811}
6812
6813void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
6814 CDentry *dn, CInode *targeti,
6815 version_t dpv)
6816{
6817 dout(10) << "_link_remote_finish "
6818 << (inc ? "link ":"unlink ")
6819 << *dn << " to " << *targeti << dendl;
6820
11fdf7f2 6821 ceph_assert(g_conf()->mds_kill_link_at != 3);
7c673cae
FG
6822
6823 if (!mdr->more()->witnessed.empty())
f67539c2 6824 mdcache->logged_leader_update(mdr->reqid);
7c673cae
FG
6825
6826 if (inc) {
6827 // link the new dentry
31f18b77
FG
6828 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6829 if (!dnl->get_inode())
6830 dn->link_remote(dnl, targeti);
7c673cae
FG
6831 dn->mark_dirty(dpv, mdr->ls);
6832 } else {
6833 // unlink main dentry
6834 dn->get_dir()->unlink_inode(dn);
31f18b77 6835 dn->pop_projected_linkage();
7c673cae
FG
6836 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
6837 }
6838
6839 mdr->apply();
6840
6841 MDRequestRef null_ref;
6842 if (inc)
6843 mdcache->send_dentry_link(dn, null_ref);
6844 else
6845 mdcache->send_dentry_unlink(dn, NULL, null_ref);
6846
6847 // bump target popularity
11fdf7f2
TL
6848 mds->balancer->hit_inode(targeti, META_POP_IWR);
6849 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7c673cae
FG
6850
6851 // reply
6852 respond_to_request(mdr, 0);
6853
6854 if (!inc)
6855 // removing a new dn?
6856 dn->get_dir()->try_remove_unlinked_dn(dn);
6857}
6858
6859
6860// remote linking/unlinking
6861
f67539c2 6862class C_MDS_PeerLinkPrep : public ServerLogContext {
7c673cae 6863 CInode *targeti;
11fdf7f2 6864 bool adjust_realm;
7c673cae 6865public:
f67539c2 6866 C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
11fdf7f2 6867 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
7c673cae 6868 void finish(int r) override {
11fdf7f2 6869 ceph_assert(r == 0);
f67539c2 6870 server->_logged_peer_link(mdr, targeti, adjust_realm);
7c673cae
FG
6871 }
6872};
6873
f67539c2 6874class C_MDS_PeerLinkCommit : public ServerContext {
7c673cae
FG
6875 MDRequestRef mdr;
6876 CInode *targeti;
6877public:
f67539c2 6878 C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
7c673cae
FG
6879 ServerContext(s), mdr(r), targeti(t) { }
6880 void finish(int r) override {
f67539c2 6881 server->_commit_peer_link(mdr, r, targeti);
7c673cae
FG
6882 }
6883};
6884
f67539c2 6885void Server::handle_peer_link_prep(MDRequestRef& mdr)
7c673cae 6886{
f67539c2
TL
6887 dout(10) << "handle_peer_link_prep " << *mdr
6888 << " on " << mdr->peer_request->get_object_info()
7c673cae
FG
6889 << dendl;
6890
11fdf7f2 6891 ceph_assert(g_conf()->mds_kill_link_at != 4);
7c673cae 6892
f67539c2 6893 CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
11fdf7f2 6894 ceph_assert(targeti);
7c673cae
FG
6895 dout(10) << "targeti " << *targeti << dendl;
6896 CDentry *dn = targeti->get_parent_dn();
6897 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2 6898 ceph_assert(dnl->is_primary());
7c673cae 6899
f67539c2 6900 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7c673cae
FG
6901
6902 mdr->auth_pin(targeti);
6903
f67539c2 6904 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
11fdf7f2 6905 ceph_assert(g_conf()->mds_kill_link_at != 5);
7c673cae
FG
6906
6907 // journal it
6908 mdr->ls = mdlog->get_current_segment();
f67539c2
TL
6909 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
6910 EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
7c673cae
FG
6911 mdlog->start_entry(le);
6912
f67539c2 6913 auto pi = dnl->get_inode()->project_inode(mdr);
7c673cae
FG
6914
6915 // update journaled target inode
6916 bool inc;
11fdf7f2
TL
6917 bool adjust_realm = false;
6918 bool realm_projected = false;
f67539c2 6919 if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
7c673cae 6920 inc = true;
f67539c2 6921 pi.inode->nlink++;
adb31ebb
TL
6922
6923 CDentry *target_pdn = targeti->get_projected_parent_dn();
6924 SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
6925 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
11fdf7f2
TL
6926 sr_t *newsnap = targeti->project_snaprealm();
6927 targeti->mark_snaprealm_global(newsnap);
adb31ebb 6928 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
11fdf7f2
TL
6929 adjust_realm = true;
6930 realm_projected = true;
6931 }
7c673cae
FG
6932 } else {
6933 inc = false;
f67539c2 6934 pi.inode->nlink--;
11fdf7f2 6935 if (targeti->is_projected_snaprealm_global()) {
f67539c2
TL
6936 ceph_assert(mdr->peer_request->desti_snapbl.length());
6937 auto p = mdr->peer_request->desti_snapbl.cbegin();
11fdf7f2
TL
6938
6939 sr_t *newsnap = targeti->project_snaprealm();
6940 decode(*newsnap, p);
6941
f67539c2 6942 if (pi.inode->nlink == 0)
11fdf7f2
TL
6943 ceph_assert(!newsnap->is_parent_global());
6944
6945 realm_projected = true;
6946 } else {
f67539c2 6947 ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
11fdf7f2 6948 }
7c673cae
FG
6949 }
6950
6951 link_rollback rollback;
6952 rollback.reqid = mdr->reqid;
6953 rollback.ino = targeti->ino();
f67539c2
TL
6954 rollback.old_ctime = targeti->get_inode()->ctime; // we hold versionlock xlock; no concurrent projections
6955 const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
7c673cae
FG
6956 rollback.old_dir_mtime = pf->fragstat.mtime;
6957 rollback.old_dir_rctime = pf->rstat.rctime;
6958 rollback.was_inc = inc;
11fdf7f2
TL
6959 if (realm_projected) {
6960 if (targeti->snaprealm) {
6961 encode(true, rollback.snapbl);
6962 targeti->encode_snap_blob(rollback.snapbl);
6963 } else {
6964 encode(false, rollback.snapbl);
6965 }
6966 }
6967 encode(rollback, le->rollback);
7c673cae
FG
6968 mdr->more()->rollback_bl = le->rollback;
6969
f67539c2
TL
6970 pi.inode->ctime = mdr->get_op_stamp();
6971 pi.inode->version = targeti->pre_dirty();
7c673cae 6972
f67539c2 6973 dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
7c673cae
FG
6974
6975 // commit case
6976 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
6977 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
f67539c2 6978 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7c673cae
FG
6979
6980 // set up commit waiter
f67539c2 6981 mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
7c673cae 6982
f67539c2
TL
6983 mdr->more()->peer_update_journaled = true;
6984 submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
7c673cae
FG
6985 mdr, __func__);
6986 mdlog->flush();
6987}
6988
f67539c2 6989void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
7c673cae 6990{
f67539c2 6991 dout(10) << "_logged_peer_link " << *mdr
7c673cae
FG
6992 << " " << *targeti << dendl;
6993
11fdf7f2 6994 ceph_assert(g_conf()->mds_kill_link_at != 6);
7c673cae
FG
6995
6996 // update the target
7c673cae
FG
6997 mdr->apply();
6998
6999 // hit pop
11fdf7f2 7000 mds->balancer->hit_inode(targeti, META_POP_IWR);
7c673cae
FG
7001
7002 // done.
f67539c2 7003 mdr->reset_peer_request();
7c673cae 7004
11fdf7f2
TL
7005 if (adjust_realm) {
7006 int op = CEPH_SNAP_OP_SPLIT;
7007 mds->mdcache->send_snap_update(targeti, 0, op);
7008 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7009 }
7010
7c673cae
FG
7011 // ack
7012 if (!mdr->aborted) {
f67539c2
TL
7013 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_LINKPREPACK);
7014 mds->send_message_mds(reply, mdr->peer_to_mds);
7c673cae
FG
7015 } else {
7016 dout(10) << " abort flag set, finishing" << dendl;
7017 mdcache->request_finish(mdr);
7018 }
7019}
7020
7021
f67539c2
TL
7022struct C_MDS_CommittedPeer : public ServerLogContext {
7023 C_MDS_CommittedPeer(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
7c673cae 7024 void finish(int r) override {
f67539c2 7025 server->_committed_peer(mdr);
7c673cae
FG
7026 }
7027};
7028
f67539c2 7029void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
7c673cae 7030{
f67539c2 7031 dout(10) << "_commit_peer_link " << *mdr
7c673cae
FG
7032 << " r=" << r
7033 << " " << *targeti << dendl;
7034
11fdf7f2 7035 ceph_assert(g_conf()->mds_kill_link_at != 7);
7c673cae
FG
7036
7037 if (r == 0) {
7038 // drop our pins, etc.
7039 mdr->cleanup();
7040
7041 // write a commit to the journal
f67539c2
TL
7042 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid, mdr->peer_to_mds,
7043 EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
7c673cae 7044 mdlog->start_entry(le);
f67539c2 7045 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7c673cae
FG
7046 mdlog->flush();
7047 } else {
f67539c2 7048 do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7c673cae
FG
7049 }
7050}
7051
f67539c2 7052void Server::_committed_peer(MDRequestRef& mdr)
7c673cae 7053{
f67539c2 7054 dout(10) << "_committed_peer " << *mdr << dendl;
7c673cae 7055
11fdf7f2 7056 ceph_assert(g_conf()->mds_kill_link_at != 8);
7c673cae 7057
f67539c2
TL
7058 bool assert_exist = mdr->more()->peer_update_journaled;
7059 mdcache->finish_uncommitted_peer(mdr->reqid, assert_exist);
7060 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_COMMITTED);
7061 mds->send_message_mds(req, mdr->peer_to_mds);
7c673cae
FG
7062 mdcache->request_finish(mdr);
7063}
7064
7065struct C_MDS_LoggedLinkRollback : public ServerLogContext {
7066 MutationRef mut;
9f95a23c 7067 map<client_t,ref_t<MClientSnap>> splits;
11fdf7f2 7068 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
9f95a23c 7069 map<client_t,ref_t<MClientSnap>>&& _splits) :
11fdf7f2
TL
7070 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
7071 }
7c673cae 7072 void finish(int r) override {
11fdf7f2 7073 server->_link_rollback_finish(mut, mdr, splits);
7c673cae
FG
7074 }
7075};
7076
f67539c2 7077void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7c673cae
FG
7078{
7079 link_rollback rollback;
11fdf7f2
TL
7080 auto p = rbl.cbegin();
7081 decode(rollback, p);
7c673cae
FG
7082
7083 dout(10) << "do_link_rollback on " << rollback.reqid
7084 << (rollback.was_inc ? " inc":" dec")
7085 << " ino " << rollback.ino
7086 << dendl;
7087
11fdf7f2 7088 ceph_assert(g_conf()->mds_kill_link_at != 9);
7c673cae 7089
f67539c2 7090 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
11fdf7f2 7091 ceph_assert(mdr || mds->is_resolve());
7c673cae
FG
7092
7093 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7094 mut->ls = mds->mdlog->get_current_segment();
7095
7096 CInode *in = mdcache->get_inode(rollback.ino);
11fdf7f2 7097 ceph_assert(in);
7c673cae 7098 dout(10) << " target is " << *in << dendl;
f67539c2 7099 ceph_assert(!in->is_projected()); // live peer request holds versionlock xlock.
7c673cae 7100
f67539c2
TL
7101 auto pi = in->project_inode(mut);
7102 pi.inode->version = in->pre_dirty();
7c673cae
FG
7103
7104 // parent dir rctime
7105 CDir *parent = in->get_projected_parent_dn()->get_dir();
f67539c2 7106 auto pf = parent->project_fnode(mut);
7c673cae 7107 pf->version = parent->pre_dirty();
f67539c2 7108 if (pf->fragstat.mtime == pi.inode->ctime) {
7c673cae 7109 pf->fragstat.mtime = rollback.old_dir_mtime;
f67539c2 7110 if (pf->rstat.rctime == pi.inode->ctime)
7c673cae
FG
7111 pf->rstat.rctime = rollback.old_dir_rctime;
7112 mut->add_updated_lock(&parent->get_inode()->filelock);
7113 mut->add_updated_lock(&parent->get_inode()->nestlock);
7114 }
7115
7116 // inode
f67539c2 7117 pi.inode->ctime = rollback.old_ctime;
7c673cae 7118 if (rollback.was_inc)
f67539c2 7119 pi.inode->nlink--;
7c673cae 7120 else
f67539c2 7121 pi.inode->nlink++;
7c673cae 7122
9f95a23c 7123 map<client_t,ref_t<MClientSnap>> splits;
11fdf7f2
TL
7124 if (rollback.snapbl.length() && in->snaprealm) {
7125 bool hadrealm;
7126 auto p = rollback.snapbl.cbegin();
7127 decode(hadrealm, p);
7128 if (hadrealm) {
7129 if (!mds->is_resolve()) {
7130 sr_t *new_srnode = new sr_t();
7131 decode(*new_srnode, p);
7132 in->project_snaprealm(new_srnode);
7133 } else {
7134 decode(in->snaprealm->srnode, p);
7135 }
7136 } else {
7137 SnapRealm *realm = parent->get_inode()->find_snaprealm();
7138 if (!mds->is_resolve())
7139 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
7140 in->project_snaprealm(NULL);
7141 }
7142 }
7143
7c673cae 7144 // journal it
f67539c2
TL
7145 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
7146 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
7c673cae
FG
7147 mdlog->start_entry(le);
7148 le->commit.add_dir_context(parent);
7149 le->commit.add_dir(parent, true);
7150 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
7151
11fdf7f2 7152 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
7c673cae
FG
7153 mdr, __func__);
7154 mdlog->flush();
7155}
7156
11fdf7f2 7157void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
9f95a23c 7158 map<client_t,ref_t<MClientSnap>>& splits)
7c673cae
FG
7159{
7160 dout(10) << "_link_rollback_finish" << dendl;
7161
11fdf7f2 7162 ceph_assert(g_conf()->mds_kill_link_at != 10);
7c673cae
FG
7163
7164 mut->apply();
11fdf7f2
TL
7165
7166 if (!mds->is_resolve())
7167 mdcache->send_snaps(splits);
7168
7c673cae
FG
7169 if (mdr)
7170 mdcache->request_finish(mdr);
7171
e306af50 7172 mdcache->finish_rollback(mut->reqid, mdr);
7c673cae
FG
7173
7174 mut->cleanup();
7175}
7176
7177
f67539c2 7178void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
7c673cae 7179{
f67539c2 7180 dout(10) << "handle_peer_link_prep_ack " << *mdr
7c673cae
FG
7181 << " " << *m << dendl;
7182 mds_rank_t from = mds_rank_t(m->get_source().num());
7183
11fdf7f2 7184 ceph_assert(g_conf()->mds_kill_link_at != 11);
7c673cae 7185
f67539c2
TL
7186 // note peer
7187 mdr->more()->peers.insert(from);
7c673cae
FG
7188
7189 // witnessed!
11fdf7f2 7190 ceph_assert(mdr->more()->witnessed.count(from) == 0);
7c673cae 7191 mdr->more()->witnessed.insert(from);
11fdf7f2 7192 ceph_assert(!m->is_not_journaled());
f67539c2 7193 mdr->more()->has_journaled_peers = true;
7c673cae
FG
7194
7195 // remove from waiting list
f67539c2
TL
7196 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7197 mdr->more()->waiting_on_peer.erase(from);
7c673cae 7198
f67539c2 7199 ceph_assert(mdr->more()->waiting_on_peer.empty());
7c673cae 7200
9f95a23c
TL
7201 dispatch_client_request(mdr); // go again!
7202}
7c673cae 7203
9f95a23c
TL
7204
7205
7206
7207
7208// UNLINK
7209
7210void Server::handle_client_unlink(MDRequestRef& mdr)
7211{
7212 const cref_t<MClientRequest> &req = mdr->client_request;
7213 client_t client = mdr->get_client();
7214
7215 // rmdir or unlink?
7216 bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
7217
7218 if (rmdir)
7219 mdr->disable_lock_cache();
7220 CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
7221 if (!dn)
7222 return;
7c673cae
FG
7223
7224 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
11fdf7f2 7225 ceph_assert(!dnl->is_null());
9f95a23c 7226 CInode *in = dnl->get_inode();
7c673cae
FG
7227
7228 if (rmdir) {
7229 dout(7) << "handle_client_rmdir on " << *dn << dendl;
7230 } else {
7231 dout(7) << "handle_client_unlink on " << *dn << dendl;
7232 }
7233 dout(7) << "dn links to " << *in << dendl;
7234
7235 // rmdir vs is_dir
7236 if (in->is_dir()) {
7237 if (rmdir) {
7238 // do empty directory checks
7239 if (_dir_is_nonempty_unlocked(mdr, in)) {
f67539c2 7240 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
7241 return;
7242 }
7243 } else {
7244 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
f67539c2 7245 respond_to_request(mdr, -CEPHFS_EISDIR);
7c673cae
FG
7246 return;
7247 }
7248 } else {
7249 if (rmdir) {
7250 // unlink
7251 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
f67539c2 7252 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
7253 return;
7254 }
7255 }
7256
9f95a23c
TL
7257 CInode *diri = dn->get_dir()->get_inode();
7258 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7259 if (!check_access(mdr, diri, MAY_WRITE))
7260 return;
7261 }
7262
7c673cae
FG
7263 // -- create stray dentry? --
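  // (a primary linkage owns the inode itself, so unlinking it must re-home
  // the inode under a stray dentry until it can be purged or reintegrated)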
7264 CDentry *straydn = NULL;
7265 if (dnl->is_primary()) {
7266 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
7267 if (!straydn)
7268 return;
7269 dout(10) << " straydn is " << *straydn << dendl;
7270 } else if (mdr->straydn) {
7271 mdr->unpin(mdr->straydn);
7272 mdr->straydn = NULL;
7273 }
7274
7275 // lock
9f95a23c
TL
7276 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7277 MutationImpl::LockOpVec lov;
11fdf7f2 7278
9f95a23c
TL
7279 lov.add_xlock(&in->linklock);
7280 lov.add_xlock(&in->snaplock);
7281 if (in->is_dir())
7282 lov.add_rdlock(&in->filelock); // to verify it's empty
7283
7284 if (straydn) {
7285 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7286 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7287 lov.add_xlock(&straydn->lock);
7288 }
11fdf7f2 7289
9f95a23c
TL
7290 if (!mds->locker->acquire_locks(mdr, lov))
7291 return;
7c673cae 7292
9f95a23c
TL
7293 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7294 }
7c673cae
FG
7295
7296 if (in->is_dir() &&
7297 _dir_is_nonempty(mdr, in)) {
f67539c2 7298 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
7299 return;
7300 }
7301
11fdf7f2
TL
7302 if (straydn)
7303 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7304
7305 if (!mdr->more()->desti_srnode) {
7306 if (in->is_projected_snaprealm_global()) {
7307 sr_t *new_srnode = in->prepare_new_srnode(0);
adb31ebb 7308 in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
11fdf7f2
TL
7309 // when dropping the last linkage, or the last remote linkage,
7310 // detach the inode from the global snaprealm
7311 auto nlink = in->get_projected_inode()->nlink;
7312 if (nlink == 1 ||
7313 (nlink == 2 && !dnl->is_primary() &&
7314 !in->get_projected_parent_dir()->inode->is_stray()))
7315 in->clear_snaprealm_global(new_srnode);
7316 mdr->more()->desti_srnode = new_srnode;
7317 } else if (dnl->is_primary()) {
f67539c2 7318 // prepare snaprealm blob for peer request
11fdf7f2
TL
7319 SnapRealm *realm = in->find_snaprealm();
7320 snapid_t follows = realm->get_newest_seq();
7321 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
7322 sr_t *new_srnode = in->prepare_new_srnode(follows);
7323 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7324 mdr->more()->desti_srnode = new_srnode;
7325 }
7326 }
7327 }
7328
7c673cae
FG
7329 // yay!
7330 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
7331 // subtree root auths need to be witnesses
7332 set<mds_rank_t> witnesses;
7333 in->list_replicas(witnesses);
7334 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7335
7336 for (set<mds_rank_t>::iterator p = witnesses.begin();
7337 p != witnesses.end();
7338 ++p) {
7339 if (mdr->more()->witnessed.count(*p)) {
7340 dout(10) << " already witnessed by mds." << *p << dendl;
f67539c2 7341 } else if (mdr->more()->waiting_on_peer.count(*p)) {
7c673cae
FG
7342 dout(10) << " already waiting on witness mds." << *p << dendl;
7343 } else {
9f95a23c 7344 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
7c673cae
FG
7345 return;
7346 }
7347 }
f67539c2 7348 if (!mdr->more()->waiting_on_peer.empty())
7c673cae
FG
7349 return; // we're waiting for a witness.
7350 }
7351
9f95a23c
TL
7352 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
7353 mds->locker->create_lock_cache(mdr, diri);
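  // (the lock cache lets subsequent unlinks in the same directory reuse
  // these locks instead of re-acquiring them)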
7354
7c673cae
FG
7355 // ok!
7356 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
7357 _link_remote(mdr, false, dn, dnl->get_inode());
7358 else
7359 _unlink_local(mdr, dn, straydn);
7360}
7361
7362class C_MDS_unlink_local_finish : public ServerLogContext {
7363 CDentry *dn;
7364 CDentry *straydn;
7365 version_t dnpv; // deleted dentry
7366public:
7367 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
7368 ServerLogContext(s, r), dn(d), straydn(sd),
7369 dnpv(d->get_projected_version()) {}
7370 void finish(int r) override {
11fdf7f2 7371 ceph_assert(r == 0);
7c673cae
FG
7372 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
7373 }
7374};
7375
7376void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7377{
7378 dout(10) << "_unlink_local " << *dn << dendl;
7379
7380 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7381 CInode *in = dnl->get_inode();
7382
7c673cae
FG
7383
7384 // ok, let's do it.
7385 mdr->ls = mdlog->get_current_segment();
7386
7387 // prepare log entry
7388 EUpdate *le = new EUpdate(mdlog, "unlink_local");
7389 mdlog->start_entry(le);
7390 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7391 if (!mdr->more()->witnessed.empty()) {
f67539c2 7392 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7c673cae 7393 le->reqid = mdr->reqid;
f67539c2
TL
7394 le->had_peers = true;
7395 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7c673cae
FG
7396 }
7397
7398 if (straydn) {
11fdf7f2 7399 ceph_assert(dnl->is_primary());
7c673cae 7400 straydn->push_projected_linkage(in);
7c673cae
FG
7401 }
7402
7403 // the unlinked dentry
7404 dn->pre_dirty();
7405
f67539c2 7406 auto pi = in->project_inode(mdr);
94b18763
FG
7407 {
7408 std::string t;
7409 dn->make_path_string(t, true);
f67539c2
TL
7410 pi.inode->stray_prior_path = std::move(t);
7411 }
7412 pi.inode->version = in->pre_dirty();
7413 pi.inode->ctime = mdr->get_op_stamp();
7414 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7415 pi.inode->rstat.rctime = mdr->get_op_stamp();
7416 pi.inode->change_attr++;
7417 pi.inode->nlink--;
7418 if (pi.inode->nlink == 0)
7c673cae
FG
7419 in->state_set(CInode::STATE_ORPHAN);
7420
11fdf7f2
TL
7421 if (mdr->more()->desti_srnode) {
7422 auto& desti_srnode = mdr->more()->desti_srnode;
7423 in->project_snaprealm(desti_srnode);
7424 desti_srnode = NULL;
7425 }
7426
7427 if (straydn) {
7428 // will manually pop projected inode
7429
7c673cae 7430 // primary link. add stray dentry.
7c673cae
FG
7431 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
7432 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7433
f67539c2 7434 pi.inode->update_backtrace();
7c673cae
FG
7435 le->metablob.add_primary_dentry(straydn, in, true, true);
7436 } else {
7437 // remote link. update remote inode.
7438 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
7439 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
7440 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
7441 }
7442
7443 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7444 le->metablob.add_null_dentry(dn, true);
7445
7446 if (in->is_dir()) {
7447 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7448 le->metablob.renamed_dirino = in->ino();
7449 }
7450
7451 dn->push_projected_linkage();
7452
11fdf7f2
TL
7453 if (straydn) {
7454 ceph_assert(in->first <= straydn->first);
7455 in->first = straydn->first;
7456 }
7457
7c673cae 7458 if (in->is_dir()) {
11fdf7f2 7459 ceph_assert(straydn);
7c673cae
FG
7460 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7461 }
7462
7463 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
7464}
7465
7466void Server::_unlink_local_finish(MDRequestRef& mdr,
7467 CDentry *dn, CDentry *straydn,
7468 version_t dnpv)
7469{
7470 dout(10) << "_unlink_local_finish " << *dn << dendl;
7471
7472 if (!mdr->more()->witnessed.empty())
f67539c2 7473 mdcache->logged_leader_update(mdr->reqid);
7c673cae 7474
11fdf7f2
TL
7475 CInode *strayin = NULL;
7476 bool hadrealm = false;
7477 if (straydn) {
7478 // if there is newly created snaprealm, need to split old snaprealm's
7479 // inodes_with_caps. So pop snaprealm before linkage changes.
7480 strayin = dn->get_linkage()->get_inode();
7481 hadrealm = strayin->snaprealm ? true : false;
7482 strayin->early_pop_projected_snaprealm();
7483 }
7484
7c673cae
FG
7485 // unlink main dentry
7486 dn->get_dir()->unlink_inode(dn);
7487 dn->pop_projected_linkage();
f67539c2 7488 dn->mark_dirty(dnpv, mdr->ls);
7c673cae
FG
7489
7490 // relink as stray? (i.e. was primary link?)
7c673cae
FG
7491 if (straydn) {
7492 dout(20) << " straydn is " << *straydn << dendl;
11fdf7f2 7493 straydn->pop_projected_linkage();
7c673cae
FG
7494 mdcache->touch_dentry_bottom(straydn);
7495 }
7496
7c673cae 7497 mdr->apply();
7c673cae
FG
7498
7499 mdcache->send_dentry_unlink(dn, straydn, mdr);
7500
11fdf7f2
TL
7501 if (straydn) {
7502 // update subtree map?
7503 if (strayin->is_dir())
7504 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
7505
7506 if (strayin->snaprealm && !hadrealm)
7507 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
7508 }
7c673cae
FG
7509
7510 // bump pop
11fdf7f2 7511 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7c673cae
FG
7512
7513 // reply
7514 respond_to_request(mdr, 0);
7515
7516 // removing a new dn?
7517 dn->get_dir()->try_remove_unlinked_dn(dn);
7518
7519 // clean up ?
7520 // respond_to_request() drops locks. So stray reintegration can race with us.
7521 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7522 // Tip off the MDCache that this dentry is a stray that
7523 // might be eligible for purge.
7524 mdcache->notify_stray(straydn);
7525 }
7526}

bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
{
  if (mds->is_cluster_degraded() &&
      !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
    dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
    if (mdr->more()->waiting_on_peer.empty())
      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
    return false;
  }

  dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
  auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREP);
  req->srcdnpath = filepath(trace.front()->get_dir()->ino());
  for (auto dn : trace)
    req->srcdnpath.push_dentry(dn->get_name());
  mdcache->encode_replica_stray(straydn, who, req->straybl);
  if (mdr->more()->desti_srnode)
    encode(*mdr->more()->desti_srnode, req->desti_snapbl);

  req->op_stamp = mdr->get_op_stamp();
  mds->send_message_mds(req, who);

  ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
  mdr->more()->waiting_on_peer.insert(who);
  return true;
}

struct C_MDS_PeerRmdirPrep : public ServerLogContext {
  CDentry *dn, *straydn;
  C_MDS_PeerRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
    : ServerLogContext(s, r), dn(d), straydn(st) {}
  void finish(int r) override {
    server->_logged_peer_rmdir(mdr, dn, straydn);
  }
};

struct C_MDS_PeerRmdirCommit : public ServerContext {
  MDRequestRef mdr;
  CDentry *straydn;
  C_MDS_PeerRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
    : ServerContext(s), mdr(r), straydn(sd) { }
  void finish(int r) override {
    server->_commit_peer_rmdir(mdr, r, straydn);
  }
};

void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_peer_rmdir_prep " << *mdr
           << " " << mdr->peer_request->srcdnpath
           << " to " << mdr->peer_request->destdnpath
           << dendl;

  vector<CDentry*> trace;
  filepath srcpath(mdr->peer_request->srcdnpath);
  dout(10) << " src " << srcpath << dendl;
  CInode *in;
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
  int r = mdcache->path_traverse(mdr, cf, srcpath,
                                 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
                                 &trace, &in);
  if (r > 0) return;
  if (r == -CEPHFS_ESTALE) {
    mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
                            mdr->peer_to_mds, true);
    return;
  }
  ceph_assert(r == 0);
  CDentry *dn = trace.back();
  dout(10) << " dn " << *dn << dendl;
  mdr->pin(dn);

  ceph_assert(mdr->straydn);
  CDentry *straydn = mdr->straydn;
  dout(10) << " straydn " << *straydn << dendl;

  mdr->set_op_stamp(mdr->peer_request->op_stamp);

  rmdir_rollback rollback;
  rollback.reqid = mdr->reqid;
  rollback.src_dir = dn->get_dir()->dirfrag();
  rollback.src_dname = dn->get_name();
  rollback.dest_dir = straydn->get_dir()->dirfrag();
  rollback.dest_dname = straydn->get_name();
  if (mdr->peer_request->desti_snapbl.length()) {
    if (in->snaprealm) {
      encode(true, rollback.snapbl);
      in->encode_snap_blob(rollback.snapbl);
    } else {
      encode(false, rollback.snapbl);
    }
  }
  encode(rollback, mdr->more()->rollback_bl);
  // FIXME: rollback snaprealm
  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
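  // Added note (editorial, not in the original source): the rollback blob
  // above uses the standard Ceph bufferlist encode/decode round trip, which
  // is what do_rmdir_rollback() relies on when it later decodes these same
  // bytes. A minimal sketch of that pattern, assuming only the encode/decode
  // helpers from include/encoding.h:
  //
  //   rmdir_rollback rb;        // filled in as above
  //   bufferlist bl;
  //   encode(rb, bl);           // serialize into the bufferlist
  //   auto it = bl.cbegin();
  //   rmdir_rollback rb2;
  //   decode(rb2, it);          // rb2 now mirrors rb (e.g. rb2.reqid == rb.reqid)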

  // set up commit waiter
  mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);

  straydn->push_projected_linkage(in);
  dn->push_projected_linkage();

  ceph_assert(straydn->first >= in->first);
  in->first = straydn->first;

  if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
    dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
    _logged_peer_rmdir(mdr, dn, straydn);
    return;
  }

  mdr->ls = mdlog->get_current_segment();
  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
                                    EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
  mdlog->start_entry(le);
  le->rollback = mdr->more()->rollback_bl;

  le->commit.add_dir_context(straydn->get_dir());
  le->commit.add_primary_dentry(straydn, in, true);
  // peer: no need to journal original dentry

  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
  le->commit.renamed_dirino = in->ino();

  mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
  mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);

  mdr->more()->peer_update_journaled = true;
  submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
                     mdr, __func__);
  mdlog->flush();
}
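// Added summary (editorial, not in the original source): the peer-side rmdir
// follows the usual MDS two-phase update. handle_peer_rmdir_prep() journals
// EPeerUpdate::OP_PREPARE together with a rollback blob; _logged_peer_rmdir()
// acks back to the leader; once the leader commits, _commit_peer_rmdir()
// journals EPeerUpdate::OP_COMMIT, otherwise do_rmdir_rollback() decodes the
// saved rollback blob and journals EPeerUpdate::OP_ROLLBACK.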

void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
  CInode *in = dn->get_linkage()->get_inode();

  bool new_realm;
  if (mdr->peer_request->desti_snapbl.length()) {
    new_realm = !in->snaprealm;
    in->decode_snap_blob(mdr->peer_request->desti_snapbl);
    ceph_assert(in->snaprealm);
  } else {
    new_realm = false;
  }

  // update our cache now, so we are consistent with what is in the journal
  // when we journal a subtree map
  dn->get_dir()->unlink_inode(dn);
  straydn->pop_projected_linkage();
  dn->pop_projected_linkage();

  mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->peer_update_journaled);

  if (new_realm)
    mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);

  // done.
  mdr->reset_peer_request();
  mdr->straydn = 0;

  if (!mdr->aborted) {
    auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREPACK);
    if (!mdr->more()->peer_update_journaled)
      reply->mark_not_journaled();
    mds->send_message_mds(reply, mdr->peer_to_mds);
  } else {
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}

void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
{
  dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
           << " " << *ack << dendl;

  mds_rank_t from = mds_rank_t(ack->get_source().num());

  mdr->more()->peers.insert(from);
  mdr->more()->witnessed.insert(from);
  if (!ack->is_not_journaled())
    mdr->more()->has_journaled_peers = true;

  // remove from waiting list
  ceph_assert(mdr->more()->waiting_on_peer.count(from));
  mdr->more()->waiting_on_peer.erase(from);

  if (mdr->more()->waiting_on_peer.empty())
    dispatch_client_request(mdr);  // go again!
  else
    dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
}

void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
{
  dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;

  if (r == 0) {
    if (mdr->more()->peer_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
        mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mdr->cleanup();

    if (mdr->more()->peer_update_journaled) {
      // write a commit to the journal
      EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
                                        mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
                                        EPeerUpdate::RMDIR);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      _committed_peer(mdr);
    }
  } else {
    // abort
    do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
  }
}

struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
  metareqid_t reqid;
  CDentry *dn;
  CDentry *straydn;
  C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
    : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
  void finish(int r) override {
    server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
  }
};

void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
{
  // unlike the other rollback methods, the rmdir rollback is only
  // needed to record the subtree changes in the journal for inode
  // replicas who are auth for empty dirfrags. no actual changes to
  // the file system are taking place here, so there is no Mutation.

  rmdir_rollback rollback;
  auto p = rbl.cbegin();
  decode(rollback, p);

  dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
  mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
  ceph_assert(mdr || mds->is_resolve());

  CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
  if (!dir)
    dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
  ceph_assert(dir);
  CDentry *dn = dir->lookup(rollback.src_dname);
  ceph_assert(dn);
  dout(10) << " dn " << *dn << dendl;
  CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
  ceph_assert(straydir);
  CDentry *straydn = straydir->lookup(rollback.dest_dname);
  ceph_assert(straydn);
  dout(10) << " straydn " << *straydn << dendl;
  CInode *in = straydn->get_linkage()->get_inode();

  dn->push_projected_linkage(in);
  straydn->push_projected_linkage();

  if (rollback.snapbl.length() && in->snaprealm) {
    bool hadrealm;
    auto p = rollback.snapbl.cbegin();
    decode(hadrealm, p);
    if (hadrealm) {
      decode(in->snaprealm->srnode, p);
    } else {
      in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
    }
  }

  if (mdr && !mdr->more()->peer_update_journaled) {
    ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));

    _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
    return;
  }

  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
                                    EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
  mdlog->start_entry(le);

  le->commit.add_dir_context(dn->get_dir());
  le->commit.add_primary_dentry(dn, in, true);
  // peer: no need to journal straydn

  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
  le->commit.renamed_dirino = in->ino();

  mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());

  submit_mdlog_entry(le,
                     new C_MDS_LoggedRmdirRollback(this, mdr, rollback.reqid,
                                                   dn, straydn),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_rmdir_rollback_finish " << reqid << dendl;

  straydn->get_dir()->unlink_inode(straydn);
  dn->pop_projected_linkage();
  straydn->pop_projected_linkage();

  CInode *in = dn->get_linkage()->get_inode();
  mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
                                       !mdr || mdr->more()->peer_update_journaled);

  if (mds->is_resolve()) {
    CDir *root = mdcache->get_subtree_root(straydn->get_dir());
    mdcache->try_trim_non_auth_subtree(root);
  }

  if (mdr)
    mdcache->request_finish(mdr);

  mdcache->finish_rollback(reqid, mdr);
}

/** _dir_is_nonempty[_unlocked]
 *
 * check if a directory is non-empty (i.e. we can rmdir it).
 *
 * the unlocked variant is a fastpath check; we can't really be
 * sure until we rdlock the filelock.
 */
bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
{
  dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
  ceph_assert(in->is_auth());

  if (in->filelock.is_cached())
    return false; // there can be pending async create/unlink. don't know.
  if (in->snaprealm && in->snaprealm->srnode.snaps.size())
    return true; // in a snapshot!

  auto&& ls = in->get_dirfrags();
  for (const auto& dir : ls) {
    // is the frag obviously non-empty?
    if (dir->is_auth()) {
      if (dir->get_projected_fnode()->fragstat.size()) {
        dout(10) << "dir_is_nonempty_unlocked dirstat has "
                 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
        return true;
      }
    }
  }

  return false;
}

bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
{
  dout(10) << "dir_is_nonempty " << *in << dendl;
  ceph_assert(in->is_auth());
  ceph_assert(in->filelock.can_read(mdr->get_client()));

  frag_info_t dirstat;
  version_t dirstat_version = in->get_projected_inode()->dirstat.version;

  auto&& ls = in->get_dirfrags();
  for (const auto& dir : ls) {
    const auto& pf = dir->get_projected_fnode();
    if (pf->fragstat.size()) {
      dout(10) << "dir_is_nonempty dirstat has "
               << pf->fragstat.size() << " items " << *dir << dendl;
      return true;
    }

    if (pf->accounted_fragstat.version == dirstat_version)
      dirstat.add(pf->accounted_fragstat);
    else
      dirstat.add(pf->fragstat);
  }

  return dirstat.size() != in->get_projected_inode()->dirstat.size();
}
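// Added note (editorial, not in the original source): the final comparison is
// a consistency check rather than a direct count. Each dirfrag with an empty
// fragstat contributes its accounted (or current) stats; if the summed
// per-frag size still differs from the inode-level dirstat size, some entries
// are only reflected in the aggregate, so the directory cannot safely be
// treated as empty. For example, if the inode's dirstat reports size() == 1
// while both frags sum to 0, the function returns true and the rmdir is
// refused until stats settle.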


// ======================================================


class C_MDS_rename_finish : public ServerLogContext {
  CDentry *srcdn;
  CDentry *destdn;
  CDentry *straydn;
public:
  C_MDS_rename_finish(Server *s, MDRequestRef& r,
                      CDentry *sdn, CDentry *ddn, CDentry *stdn) :
    ServerLogContext(s, r),
    srcdn(sdn), destdn(ddn), straydn(stdn) { }
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_rename_finish(mdr, srcdn, destdn, straydn);
  }
};


/** handle_client_rename
 *
 * rename leader is the destdn auth. this is because cached inodes
 * must remain connected. thus, any replica of srci must also
 * replicate destdn, and possibly straydn, so that srci (and
 * destdn->inode) remain connected during the rename.
 *
 * to do this, we freeze srci, then the leader (destdn auth) verifies that
 * all other nodes have also replicated destdn and straydn. note that
 * destdn replicas need not also replicate srci. this only works when
 * destdn is the leader.
 *
 * This function takes responsibility for the passed mdr.
 */
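// Added summary (editorial, not in the original source): in the cross-MDS
// case the flow implemented below is roughly
//
//   client request -> leader (destdn auth)
//     leader sends MMDSPeerRequest::OP_RENAMEPREP to each witness
//     witnesses journal a peer update (prepare) and ack
//     leader journals EUpdate("rename") and commits
//     witnesses then commit their peer updates, or roll back on failure
//
// with the srcdn auth deliberately contacted last (see "do srcdn auth last").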
void Server::handle_client_rename(MDRequestRef& mdr)
{
  const auto& req = mdr->client_request;
  dout(7) << "handle_client_rename " << *req << dendl;

  filepath destpath = req->get_filepath();
  filepath srcpath = req->get_filepath2();
  if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
    respond_to_request(mdr, -CEPHFS_EBUSY);
    return;
  }

  if (req->get_alternate_name().size() > alternate_name_max) {
    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
    return;
  }

  auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
  if (!destdn)
    return;

  dout(10) << " destdn " << *destdn << dendl;
  CDir *destdir = destdn->get_dir();
  ceph_assert(destdir->is_auth());
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();

  dout(10) << " srcdn " << *srcdn << dendl;
  CDir *srcdir = srcdn->get_dir();
  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  CInode *srci = srcdnl->get_inode();
  dout(10) << " srci " << *srci << dendl;

  // -- some sanity checks --
  if (destdn == srcdn) {
    dout(7) << "rename src=dest, noop" << dendl;
    respond_to_request(mdr, 0);
    return;
  }

  // dest a child of src?
  // e.g. mv /usr /usr/foo
  if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
    dout(7) << "cannot rename item to be a child of itself" << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  // is this a stray migration, reintegration or merge? (sanity checks!)
  if (mdr->reqid.name.is_mds() &&
      !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
        MDS_INO_IS_STRAY(destpath.get_ino())) &&
      !(destdnl->is_remote() &&
        destdnl->get_remote_ino() == srci->ino())) {
    respond_to_request(mdr, -CEPHFS_EINVAL);  // actually, this won't reply, but whatever.
    return;
  }

  CInode *oldin = 0;
  if (!destdnl->is_null()) {
    //dout(10) << "dest dn exists " << *destdn << dendl;
    oldin = mdcache->get_dentry_inode(destdn, mdr, true);
    if (!oldin) return;
    dout(10) << " oldin " << *oldin << dendl;

    // non-empty dir? do trivial fast unlocked check, do another check later with read locks
    if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
      respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
      return;
    }

    // mv /some/thing /to/some/existing_other_thing
    if (oldin->is_dir() && !srci->is_dir()) {
      respond_to_request(mdr, -CEPHFS_EISDIR);
      return;
    }
    if (!oldin->is_dir() && srci->is_dir()) {
      respond_to_request(mdr, -CEPHFS_ENOTDIR);
      return;
    }
    if (srci == oldin && !srcdir->inode->is_stray()) {
      respond_to_request(mdr, 0);  // no-op. POSIX makes no sense.
      return;
    }
    if (destdn->get_alternate_name() != req->get_alternate_name()) {
      /* the dentry exists but the alternate_names do not match, fail... */
      respond_to_request(mdr, -CEPHFS_EINVAL);
      return;
    }
  }

  vector<CDentry*>& srctrace = mdr->dn[1];
  vector<CDentry*>& desttrace = mdr->dn[0];

  // src+dest traces _must_ share a common ancestor for locking to prevent orphans
  if (destpath.get_ino() != srcpath.get_ino() &&
      !(req->get_source().is_mds() &&
        MDS_INO_IS_STRAY(srcpath.get_ino()))) {  // <-- mds 'rename' out of stray dir is ok!
    CInode *srcbase = srctrace[0]->get_dir()->get_inode();
    CInode *destbase = desttrace[0]->get_dir()->get_inode();
    // ok, extend srctrace toward root until it is an ancestor of desttrace.
    while (srcbase != destbase &&
           !srcbase->is_projected_ancestor_of(destbase)) {
      CDentry *pdn = srcbase->get_projected_parent_dn();
      srctrace.insert(srctrace.begin(), pdn);
      dout(10) << "rename prepending srctrace with " << *pdn << dendl;
      srcbase = pdn->get_dir()->get_inode();
    }

    // then, extend destpath until it shares the same parent inode as srcpath.
    while (destbase != srcbase) {
      CDentry *pdn = destbase->get_projected_parent_dn();
      desttrace.insert(desttrace.begin(), pdn);
      dout(10) << "rename prepending desttrace with " << *pdn << dendl;
      destbase = pdn->get_dir()->get_inode();
    }
    dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
  }
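  // Added note (editorial, not in the original source): the two loops above
  // implement a simple "walk up to the common ancestor" step. Conceptually:
  //
  //   while (srcbase is neither destbase nor an ancestor of destbase)
  //     prepend srcbase's parent dentry to srctrace and move up;
  //   while (destbase != srcbase)
  //     prepend destbase's parent dentry to desttrace and move up;
  //
  // afterwards both traces start at the same inode, so the lock set taken
  // below covers one connected hierarchy and cannot orphan either side.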


  bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
  if (linkmerge)
    dout(10) << " this is a link merge" << dendl;

  // -- create stray dentry? --
  CDentry *straydn = NULL;
  if (destdnl->is_primary() && !linkmerge) {
    straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
    if (!straydn)
      return;
    dout(10) << " straydn is " << *straydn << dendl;
  } else if (mdr->straydn) {
    mdr->unpin(mdr->straydn);
    mdr->straydn = NULL;
  }


  // -- locks --
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;

    // we need to update srci's ctime. xlock its least contended lock to do that...
    lov.add_xlock(&srci->linklock);
    lov.add_xlock(&srci->snaplock);

    if (oldin) {
      // xlock oldin (for nlink--)
      lov.add_xlock(&oldin->linklock);
      lov.add_xlock(&oldin->snaplock);
      if (oldin->is_dir()) {
        ceph_assert(srci->is_dir());
        lov.add_rdlock(&oldin->filelock);   // to verify it's empty

        // adjust locking order?
        int cmp = mdr->compare_paths();
        if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
          std::reverse(lov.begin(), lov.end());
      } else {
        ceph_assert(!srci->is_dir());
        // adjust locking order?
        if (srci->ino() > oldin->ino())
          std::reverse(lov.begin(), lov.end());
      }
    }

    // straydn?
    if (straydn) {
      lov.add_wrlock(&straydn->get_dir()->inode->filelock);
      lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
      lov.add_xlock(&straydn->lock);
    }

    CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
    if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
      return;

    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }
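  // Added note (editorial, not in the original source): the std::reverse()
  // calls above enforce a global lock ordering between the two inodes so
  // that two concurrent renames touching the same pair cannot deadlock.
  // In the plain-file case, for example, the vector is reversed whenever
  // srci->ino() > oldin->ino(), so locks are always taken lowest-ino first
  // for that pair; directories use compare_paths() instead.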

  if (linkmerge)
    ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());

  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
      return;

    if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
      return;

    if (!check_fragment_space(mdr, destdn->get_dir()))
      return;

    if (!check_access(mdr, srci, MAY_WRITE))
      return;
  }

  // with read lock, really verify oldin is empty
  if (oldin &&
      oldin->is_dir() &&
      _dir_is_nonempty(mdr, oldin)) {
    respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
    return;
  }

  /* project_snaprealm_past_parent() will do this job
   *
  // moving between snaprealms?
  if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
    SnapRealm *srcrealm = srci->find_snaprealm();
    SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
    if (srcrealm != destrealm &&
        (srcrealm->get_newest_seq() + 1 > srcdn->first ||
         destrealm->get_newest_seq() + 1 > srcdn->first)) {
      dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
      mdcache->snaprealm_create(mdr, srci);
      return;
    }
  }
  */

  SnapRealm *dest_realm = nullptr;
  SnapRealm *src_realm = nullptr;
  if (!linkmerge) {
    dest_realm = destdir->inode->find_snaprealm();
    if (srcdir->inode == destdir->inode)
      src_realm = dest_realm;
    else
      src_realm = srcdir->inode->find_snaprealm();
    if (src_realm != dest_realm &&
        src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
      respond_to_request(mdr, -CEPHFS_EXDEV);
      return;
    }
  }

  ceph_assert(g_conf()->mds_kill_rename_at != 1);

  // -- open all srcdn inode frags, if any --
  // we need these open so that auth can properly delegate from inode to dirfrags
  // after the inode is _ours_.
  if (srcdnl->is_primary() &&
      !srcdn->is_auth() &&
      srci->is_dir()) {
    dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
    mdr->set_stickydirs(srci);

    frag_vec_t leaves;
    srci->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = srci->get_dirfrag(leaf);
      if (!dir) {
        dout(10) << " opening " << leaf << " under " << *srci << dendl;
        mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
        return;
      }
    }
  }

  // -- prepare snaprealm ---

  if (linkmerge) {
    if (!mdr->more()->srci_srnode &&
        srci->get_projected_inode()->nlink == 1 &&
        srci->is_projected_snaprealm_global()) {
      sr_t *new_srnode = srci->prepare_new_srnode(0);
      srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);

      srci->clear_snaprealm_global(new_srnode);
      mdr->more()->srci_srnode = new_srnode;
    }
  } else {
    if (oldin && !mdr->more()->desti_srnode) {
      if (oldin->is_projected_snaprealm_global()) {
        sr_t *new_srnode = oldin->prepare_new_srnode(0);
        oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
        // dropping the last linkage or dropping the last remote linkage,
        // detach the inode from the global snaprealm
        auto nlink = oldin->get_projected_inode()->nlink;
        if (nlink == 1 ||
            (nlink == 2 && !destdnl->is_primary() &&
             !oldin->get_projected_parent_dir()->inode->is_stray()))
          oldin->clear_snaprealm_global(new_srnode);
        mdr->more()->desti_srnode = new_srnode;
      } else if (destdnl->is_primary()) {
        snapid_t follows = dest_realm->get_newest_seq();
        if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
          sr_t *new_srnode = oldin->prepare_new_srnode(follows);
          oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
          mdr->more()->desti_srnode = new_srnode;
        }
      }
    }
    if (!mdr->more()->srci_srnode) {
      if (srci->is_projected_snaprealm_global()) {
        sr_t *new_srnode = srci->prepare_new_srnode(0);
        srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
        mdr->more()->srci_srnode = new_srnode;
      } else if (srcdnl->is_primary()) {
        snapid_t follows = src_realm->get_newest_seq();
        if (src_realm != dest_realm &&
            (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
          sr_t *new_srnode = srci->prepare_new_srnode(follows);
          srci->record_snaprealm_past_parent(new_srnode, dest_realm);
          mdr->more()->srci_srnode = new_srnode;
        }
      }
    }
  }

  // -- prepare witnesses --

  /*
   * NOTE: we use _all_ replicas as witnesses.
   * this probably isn't totally necessary (esp for file renames),
   * but if/when we change that, we have to make sure rejoin is
   * sufficiently robust to handle strong rejoins from survivors
   * with totally wrong dentry->inode linkage.
   * (currently, it can ignore rename effects, because the resolve
   * stage will sort them out.)
   */
  set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
  if (srcdn->is_auth())
    srcdn->list_replicas(witnesses);
  else
    witnesses.insert(srcdn->authority().first);
  if (srcdnl->is_remote() && !srci->is_auth())
    witnesses.insert(srci->authority().first);
  destdn->list_replicas(witnesses);
  if (destdnl->is_remote() && !oldin->is_auth())
    witnesses.insert(oldin->authority().first);
  dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;

  if (!witnesses.empty()) {
    // Replicas can't see projected dentry linkages and will get confused.
    // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
    // can't project these inodes' linkages.
    bool need_flush = false;
    for (auto& dn : srctrace) {
      if (dn->is_projected()) {
        need_flush = true;
        break;
      }
    }
    if (!need_flush) {
      CDentry *dn = destdn;
      do {
        if (dn->is_projected()) {
          need_flush = true;
          break;
        }
        CInode *diri = dn->get_dir()->get_inode();
        dn = diri->get_projected_parent_dn();
      } while (dn);
    }
    if (need_flush) {
      mdlog->wait_for_safe(
          new MDSInternalContextWrapper(mds,
            new C_MDS_RetryRequest(mdcache, mdr)));
      mdlog->flush();
      return;
    }
  }
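  // Added note (editorial, not in the original source): the flush above exists
  // because witnesses only see *committed* dentry state. If any dentry on the
  // src or dest path still has a projected (journaled-but-unapplied) linkage,
  // the leader waits for the journal to become safe, retries the request, and
  // only then sends the prepare messages, so every witness observes the same
  // linkage the leader is operating on.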

  // do srcdn auth last
  mds_rank_t last = MDS_RANK_NONE;
  if (!srcdn->is_auth()) {
    last = srcdn->authority().first;
    mdr->more()->srcdn_auth_mds = last;
    // ask auth of srci to mark srci as ambiguous auth if more than two MDS
    // are involved in the rename operation.
    if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
      dout(10) << " preparing ambiguous auth for srci" << dendl;
      ceph_assert(mdr->more()->is_remote_frozen_authpin);
      ceph_assert(mdr->more()->rename_inode == srci);
      _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
      return;
    }
  }

  for (set<mds_rank_t>::iterator p = witnesses.begin();
       p != witnesses.end();
       ++p) {
    if (*p == last) continue;  // do it last!
    if (mdr->more()->witnessed.count(*p)) {
      dout(10) << " already witnessed by mds." << *p << dendl;
    } else if (mdr->more()->waiting_on_peer.count(*p)) {
      dout(10) << " already waiting on witness mds." << *p << dendl;
    } else {
      if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
        return;
    }
  }
  if (!mdr->more()->waiting_on_peer.empty())
    return;  // we're waiting for a witness.

  if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
    dout(10) << " preparing last witness (srcdn auth)" << dendl;
    ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
    _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
    return;
  }

  // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
  if (!mdr->more()->peers.empty() && !srci->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 3);
  if (!mdr->more()->peers.empty() && srci->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 4);

  // -- declare now --
  mdr->set_mds_stamp(ceph_clock_now());

  // -- prepare journal entry --
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rename");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
  if (!mdr->more()->witnessed.empty()) {
    dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;

    le->reqid = mdr->reqid;
    le->had_peers = true;

    mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
    // no need to send frozen auth pin to recovering auth MDS of srci
    mdr->more()->is_remote_frozen_authpin = false;
  }

  _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
  if (le->client_map.length())
    le->cmapv = mds->sessionmap.get_projected();

  // -- commit locally --
  C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);

  journal_and_reply(mdr, srci, destdn, le, fin);
  mds->balancer->maybe_fragment(destdn->get_dir(), false);
}


void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_finish " << *mdr << dendl;

  if (!mdr->more()->witnessed.empty())
    mdcache->logged_leader_update(mdr->reqid);

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  mdcache->send_dentry_link(destdn, mdr);

  CDentry::linkage_t *destdnl = destdn->get_linkage();
  CInode *in = destdnl->get_inode();
  bool need_eval = mdr->more()->cap_imports.count(in);

  // test hack: test peer commit
  if (!mdr->more()->peers.empty() && !in->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 5);
  if (!mdr->more()->peers.empty() && in->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 6);

  // bump popularity
  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
  if (destdnl->is_remote() && in->is_auth())
    mds->balancer->hit_inode(in, META_POP_IWR);

  // did we import srci? if so, explicitly ack that import before we unlock and reply.

  ceph_assert(g_conf()->mds_kill_rename_at != 7);

  // reply
  respond_to_request(mdr, 0);

  if (need_eval)
    mds->locker->eval(in, CEPH_CAP_LOCKS, true);

  // clean up?
  // respond_to_request() drops locks. So stray reintegration can race with us.
  if (straydn && !straydn->get_projected_linkage()->is_null()) {
    mdcache->notify_stray(straydn);
  }
}



// helpers

bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
                                     vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
{
  const auto& client_req = mdr->client_request;
  ceph_assert(client_req);

  if (mds->is_cluster_degraded() &&
      !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
    dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
    if (mdr->more()->waiting_on_peer.empty())
      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
    return false;
  }

  dout(10) << "_rename_prepare_witness mds." << who << dendl;
  auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);

  req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
  for (auto dn : srctrace)
    req->srcdnpath.push_dentry(dn->get_name());
  req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
  for (auto dn : dsttrace)
    req->destdnpath.push_dentry(dn->get_name());
  req->alternate_name = client_req->alternate_name;
  if (straydn)
    mdcache->encode_replica_stray(straydn, who, req->straybl);

  if (mdr->more()->srci_srnode)
    encode(*mdr->more()->srci_srnode, req->srci_snapbl);
  if (mdr->more()->desti_srnode)
    encode(*mdr->more()->desti_srnode, req->desti_snapbl);

  req->srcdn_auth = mdr->more()->srcdn_auth_mds;

  // srcdn auth will verify our current witness list is sufficient
  req->witnesses = witnesse;

  req->op_stamp = mdr->get_op_stamp();
  mds->send_message_mds(req, who);

  ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
  mdr->more()->waiting_on_peer.insert(who);
  return true;
}

version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
{
  version_t oldpv = mdr->more()->inode_import_v;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();

  /* import node */
  auto blp = mdr->more()->inode_import.cbegin();

  // imported caps
  map<client_t,entity_inst_t> client_map;
  map<client_t, client_metadata_t> client_metadata_map;
  decode(client_map, blp);
  decode(client_metadata_map, blp);
  prepare_force_open_sessions(client_map, client_metadata_map,
                              mdr->more()->imported_session_map);
  encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
  encode(client_metadata_map, *client_map_bl);

  list<ScatterLock*> updated_scatterlocks;
  mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
                                         mdr->more()->cap_imports, updated_scatterlocks);

  // hack: force back to !auth and clean, temporarily
  srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
  srcdnl->get_inode()->mark_clean();

  return oldpv;
}
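// Added note (editorial, not in the original source): _rename_prepare_import()
// decodes, inline, the same wire layout Migrator produces for an inode export:
// first the client/session maps, then the inode body via decode_import_inode().
// The "force back to !auth and clean" hack keeps the imported inode looking
// unowned while the rename journal entry is in flight; _rename_apply() sets
// CInode::STATE_AUTH back (the matching "hack: fix auth bit" below) once the
// rename is actually applied.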

bool Server::_need_force_journal(CInode *diri, bool empty)
{
  auto&& dirs = diri->get_dirfrags();

  bool force_journal = false;
  if (empty) {
    for (const auto& dir : dirs) {
      if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
        dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
        force_journal = true;
        break;
      } else
        dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
    }
  } else {
    // see if any children of our frags are auth subtrees.
    std::vector<CDir*> subtrees;
    mdcache->get_subtrees(subtrees);
    dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
    for (const auto& dir : dirs) {
      for (const auto& subtree : subtrees) {
        if (dir->contains(subtree)) {
          if (subtree->get_dir_auth().first == mds->get_nodeid()) {
            dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
                     << *subtree << dendl;
            force_journal = true;
            break;
          } else
            dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
        } else
          dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
      }
      if (force_journal)
        break;
    }
  }
  return force_journal;
}

void Server::_rename_prepare(MDRequestRef& mdr,
                             EMetaBlob *metablob, bufferlist *client_map_bl,
                             CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
                             CDentry *straydn)
{
  dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
  if (straydn)
    dout(10) << " straydn " << *straydn << dendl;

  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
  CInode *srci = srcdnl->get_inode();
  CInode *oldin = destdnl->get_inode();

  // primary+remote link merge?
  bool linkmerge = (srci == oldin);
  if (linkmerge)
    ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
  bool silent = srcdn->get_dir()->inode->is_stray();

  bool force_journal_dest = false;
  if (srci->is_dir() && !destdn->is_auth()) {
    if (srci->is_auth()) {
      // if we are auth for srci and exporting it, force journal because journal replay needs
      // the source inode to create auth subtrees.
      dout(10) << " we are exporting srci, will force journal destdn" << dendl;
      force_journal_dest = true;
    } else
      force_journal_dest = _need_force_journal(srci, false);
  }

  bool force_journal_stray = false;
  if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
    force_journal_stray = _need_force_journal(oldin, true);

  if (linkmerge)
    dout(10) << " merging remote and primary links to the same inode" << dendl;
  if (silent)
    dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
  if (force_journal_dest)
    dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
  if (force_journal_stray)
    dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;

  if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
    dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
    metablob->renamed_dirino = srci->ino();
  } else if (oldin && oldin->is_dir() && force_journal_stray) {
    dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
    metablob->renamed_dirino = oldin->ino();
  }

  // prepare
  CInode::mempool_inode *spi = 0;    // renamed inode
  CInode::mempool_inode *tpi = 0;    // target/overwritten inode

  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      ceph_assert(straydn);  // moving to straydn.
      // link--, and move.
      if (destdn->is_auth()) {
        auto pi = oldin->project_inode(mdr); //project_snaprealm
        pi.inode->version = straydn->pre_dirty(pi.inode->version);
        pi.inode->update_backtrace();
        tpi = pi.inode.get();
      }
      straydn->push_projected_linkage(oldin);
    } else if (destdnl->is_remote()) {
      // nlink-- targeti
      if (oldin->is_auth()) {
        auto pi = oldin->project_inode(mdr);
        pi.inode->version = oldin->pre_dirty();
        tpi = pi.inode.get();
      }
    }
  }

  // dest
  if (destdnl->is_null()) {
    /* handle_client_rename checks that alternate_name matches for existing destdn */
    destdn->set_alternate_name(alternate_name);
  }
  if (srcdnl->is_remote()) {
    if (!linkmerge) {
      // destdn
      if (destdn->is_auth())
        mdr->more()->pvmap[destdn] = destdn->pre_dirty();
      destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
      // srci
      if (srci->is_auth()) {
        auto pi = srci->project_inode(mdr);
        pi.inode->version = srci->pre_dirty();
        spi = pi.inode.get();
      }
    } else {
      dout(10) << " will merge remote onto primary link" << dendl;
      if (destdn->is_auth()) {
        auto pi = oldin->project_inode(mdr);
        pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
        spi = pi.inode.get();
      }
    }
  } else { // primary
    if (destdn->is_auth()) {
      version_t oldpv;
      if (srcdn->is_auth())
        oldpv = srci->get_projected_version();
      else {
        oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);

        // note which dirfrags have child subtrees in the journal
        // event, so that we can open those (as bounds) during replay.
        if (srci->is_dir()) {
          auto&& ls = srci->get_dirfrags();
          for (const auto& dir : ls) {
            if (!dir->is_auth())
              metablob->renamed_dir_frags.push_back(dir->get_frag());
          }
          dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
        }
      }
      auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
                                          // & srcdnl->snaprealm
      pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
      pi.inode->update_backtrace();
      spi = pi.inode.get();
    }
    destdn->push_projected_linkage(srci);
  }

  // src
  if (srcdn->is_auth())
    mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
  srcdn->push_projected_linkage();  // push null linkage

  if (!silent) {
    if (spi) {
      spi->ctime = mdr->get_op_stamp();
      if (mdr->get_op_stamp() > spi->rstat.rctime)
        spi->rstat.rctime = mdr->get_op_stamp();
      spi->change_attr++;
      if (linkmerge)
        spi->nlink--;
    }
    if (tpi) {
      tpi->ctime = mdr->get_op_stamp();
      if (mdr->get_op_stamp() > tpi->rstat.rctime)
        tpi->rstat.rctime = mdr->get_op_stamp();
      tpi->change_attr++;
      {
        std::string t;
        destdn->make_path_string(t, true);
        tpi->stray_prior_path = std::move(t);
      }
      tpi->nlink--;
      if (tpi->nlink == 0)
        oldin->state_set(CInode::STATE_ORPHAN);
    }
  }

  // prepare nesting, mtime updates
  int predirty_dir = silent ? 0:PREDIRTY_DIR;

  // guarantee stray dir is processed first during journal replay. unlink the old inode,
  // then link the source inode to destdn
  if (destdnl->is_primary()) {
    ceph_assert(straydn);
    if (straydn->is_auth()) {
      metablob->add_dir_context(straydn->get_dir());
      metablob->add_dir(straydn->get_dir(), true);
    }
  }

  if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
    CDir *oldin_dir = oldin->get_projected_parent_dir();
    if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
      mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
  }

  // sub off target
  if (destdn->is_auth() && !destdnl->is_null()) {
    mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
                                      (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
    if (destdnl->is_primary()) {
      ceph_assert(straydn);
      mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
                                        PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
    }
  }

  if (srcdnl->is_remote() && srci->is_auth()) {
    CDir *srci_dir = srci->get_projected_parent_dir();
    if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
      mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
  }

  // move srcdn
  int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
  int flags = predirty_dir | predirty_primary;
  if (srcdn->is_auth())
    mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
  if (destdn->is_auth())
    mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);

  // add it all to the metablob
  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      ceph_assert(straydn);
      if (destdn->is_auth()) {
        // project snaprealm, too
        if (auto& desti_srnode = mdr->more()->desti_srnode) {
          oldin->project_snaprealm(desti_srnode);
          if (tpi->nlink == 0)
            ceph_assert(!desti_srnode->is_parent_global());
          desti_srnode = NULL;
        }
        straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
        metablob->add_primary_dentry(straydn, oldin, true, true);
      } else if (force_journal_stray) {
        dout(10) << " forced journaling straydn " << *straydn << dendl;
        metablob->add_dir_context(straydn->get_dir());
        metablob->add_primary_dentry(straydn, oldin, true);
      }
    } else if (destdnl->is_remote()) {
      if (oldin->is_auth()) {
        sr_t *new_srnode = NULL;
        if (mdr->peer_request) {
          if (mdr->peer_request->desti_snapbl.length() > 0) {
            new_srnode = new sr_t();
            auto p = mdr->peer_request->desti_snapbl.cbegin();
            decode(*new_srnode, p);
          }
        } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
          new_srnode = desti_srnode;
          desti_srnode = NULL;
        }
        if (new_srnode) {
          oldin->project_snaprealm(new_srnode);
          if (tpi->nlink == 0)
            ceph_assert(!new_srnode->is_parent_global());
        }
        // auth for targeti
        CDentry *oldin_pdn = oldin->get_projected_parent_dn();
        mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
        metablob->add_primary_dentry(oldin_pdn, oldin, true);
      }
    }
  }

  // dest
  if (srcdnl->is_remote()) {
    ceph_assert(!linkmerge);
    if (destdn->is_auth() && !destdnl->is_null())
      mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
    else
      destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;

    if (destdn->is_auth())
      metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());

    if (srci->is_auth()) { // it's remote
      if (mdr->peer_request) {
        if (mdr->peer_request->srci_snapbl.length() > 0) {
          sr_t *new_srnode = new sr_t();
          auto p = mdr->peer_request->srci_snapbl.cbegin();
          decode(*new_srnode, p);
          srci->project_snaprealm(new_srnode);
        }
      } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
        srci->project_snaprealm(srci_srnode);
        srci_srnode = NULL;
      }

      CDentry *srci_pdn = srci->get_projected_parent_dn();
      mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
      metablob->add_primary_dentry(srci_pdn, srci, true);
    }
  } else if (srcdnl->is_primary()) {
    // project snap parent update?
    if (destdn->is_auth()) {
      if (auto& srci_srnode = mdr->more()->srci_srnode) {
        srci->project_snaprealm(srci_srnode);
        srci_srnode = NULL;
      }
    }

    if (destdn->is_auth() && !destdnl->is_null())
      mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);

    destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;

    if (destdn->is_auth())
      metablob->add_primary_dentry(destdn, srci, true, true);
    else if (force_journal_dest) {
      dout(10) << " forced journaling destdn " << *destdn << dendl;
      metablob->add_dir_context(destdn->get_dir());
      metablob->add_primary_dentry(destdn, srci, true);
      if (srcdn->is_auth() && srci->is_dir()) {
        // journal new subtrees root dirfrags
        auto&& ls = srci->get_dirfrags();
        for (const auto& dir : ls) {
          if (dir->is_auth())
            metablob->add_dir(dir, true);
        }
      }
    }
  }

  // src
  if (srcdn->is_auth()) {
    dout(10) << " journaling srcdn " << *srcdn << dendl;
    mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
    // also journal the inode in case we need to do a peer rename rollback. It is OK to add
    // both primary and NULL dentries, because during journal replay the null dentry is
    // processed after the primary dentry.
    if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
      metablob->add_primary_dentry(srcdn, srci, true);
    metablob->add_null_dentry(srcdn, true);
  } else
    dout(10) << " NOT journaling srcdn " << *srcdn << dendl;

  // make renamed inode first track the dn
  if (srcdnl->is_primary() && destdn->is_auth()) {
    ceph_assert(srci->first <= destdn->first);
    srci->first = destdn->first;
  }
  // make stray inode first track the straydn
  if (straydn && straydn->is_auth()) {
    ceph_assert(oldin->first <= straydn->first);
    oldin->first = straydn->first;
  }

  if (oldin && oldin->is_dir()) {
    ceph_assert(straydn);
    mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
  }
  if (srci->is_dir())
    mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());

}
8895
8896
8897void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8898{
8899 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8900 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
8901
8902 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8903 CDentry::linkage_t *destdnl = destdn->get_linkage();
8904
8905 CInode *oldin = destdnl->get_inode();
7c673cae
FG
8906
8907 // primary+remote link merge?
11fdf7f2
TL
8908 bool linkmerge = (srcdnl->get_inode() == oldin);
8909 if (linkmerge)
8910 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
8911
8912 bool new_in_snaprealm = false;
8913 bool new_oldin_snaprealm = false;
7c673cae
FG
8914
8915 // target inode
8916 if (!linkmerge) {
8917 if (destdnl->is_primary()) {
11fdf7f2 8918 ceph_assert(straydn);
7c673cae 8919 dout(10) << "straydn is " << *straydn << dendl;
11fdf7f2
TL
8920
8921 // if there is newly created snaprealm, need to split old snaprealm's
8922 // inodes_with_caps. So pop snaprealm before linkage changes.
8923 if (destdn->is_auth()) {
8924 bool hadrealm = (oldin->snaprealm ? true : false);
8925 oldin->early_pop_projected_snaprealm();
8926 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
8927 } else {
f67539c2
TL
8928 ceph_assert(mdr->peer_request);
8929 if (mdr->peer_request->desti_snapbl.length()) {
11fdf7f2 8930 new_oldin_snaprealm = !oldin->snaprealm;
f67539c2 8931 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
11fdf7f2 8932 ceph_assert(oldin->snaprealm);
11fdf7f2
TL
8933 }
8934 }
8935
31f18b77 8936 destdn->get_dir()->unlink_inode(destdn, false);
7c673cae
FG
8937
8938 straydn->pop_projected_linkage();
f67539c2 8939 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 8940 ceph_assert(!straydn->is_projected()); // no other projected
7c673cae
FG
8941
8942 // nlink-- targeti
11fdf7f2 8943 if (destdn->is_auth())
f67539c2 8944 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
11fdf7f2
TL
8945
8946 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
7c673cae 8947 } else if (destdnl->is_remote()) {
31f18b77 8948 destdn->get_dir()->unlink_inode(destdn, false);
11fdf7f2 8949 if (oldin->is_auth()) {
f67539c2
TL
8950 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
8951 } else if (mdr->peer_request) {
8952 if (mdr->peer_request->desti_snapbl.length() > 0) {
11fdf7f2 8953 ceph_assert(oldin->snaprealm);
f67539c2 8954 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
11fdf7f2
TL
8955 }
8956 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8957 delete desti_srnode;
8958 desti_srnode = NULL;
8959 }
7c673cae
FG
8960 }
8961 }
8962
8963 // unlink src before we relink it at dest
8964 CInode *in = srcdnl->get_inode();
11fdf7f2 8965 ceph_assert(in);
7c673cae
FG
8966
8967 bool srcdn_was_remote = srcdnl->is_remote();
11fdf7f2
TL
8968 if (!srcdn_was_remote) {
8969 // if there is newly created snaprealm, need to split old snaprealm's
8970 // inodes_with_caps. So pop snaprealm before linkage changes.
8971 if (destdn->is_auth()) {
8972 bool hadrealm = (in->snaprealm ? true : false);
8973 in->early_pop_projected_snaprealm();
8974 new_in_snaprealm = (in->snaprealm && !hadrealm);
8975 } else {
f67539c2
TL
8976 ceph_assert(mdr->peer_request);
8977 if (mdr->peer_request->srci_snapbl.length()) {
11fdf7f2 8978 new_in_snaprealm = !in->snaprealm;
f67539c2 8979 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
11fdf7f2 8980 ceph_assert(in->snaprealm);
11fdf7f2
TL
8981 }
8982 }
8983 }
8984
7c673cae
FG
8985 srcdn->get_dir()->unlink_inode(srcdn);
8986
8987 // dest
8988 if (srcdn_was_remote) {
8989 if (!linkmerge) {
8990 // destdn
8991 destdnl = destdn->pop_projected_linkage();
f67539c2 8992 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 8993 ceph_assert(!destdn->is_projected()); // no other projected
7c673cae
FG
8994
8995 destdn->link_remote(destdnl, in);
8996 if (destdn->is_auth())
8997 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
8998 // in
11fdf7f2 8999 if (in->is_auth()) {
f67539c2
TL
9000 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9001 } else if (mdr->peer_request) {
9002 if (mdr->peer_request->srci_snapbl.length() > 0) {
11fdf7f2 9003 ceph_assert(in->snaprealm);
f67539c2 9004 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
11fdf7f2
TL
9005 }
9006 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9007 delete srci_srnode;
9008 srci_srnode = NULL;
9009 }
7c673cae
FG
9010 } else {
9011 dout(10) << "merging remote onto primary link" << dendl;
f67539c2 9012 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
7c673cae
FG
9013 }
9014 } else { // primary
9015 if (linkmerge) {
9016 dout(10) << "merging primary onto remote link" << dendl;
31f18b77 9017 destdn->get_dir()->unlink_inode(destdn, false);
7c673cae
FG
9018 }
9019 destdnl = destdn->pop_projected_linkage();
f67539c2 9020 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 9021 ceph_assert(!destdn->is_projected()); // no other projected
7c673cae
FG
9022
9023 // srcdn inode import?
9024 if (!srcdn->is_auth() && destdn->is_auth()) {
11fdf7f2 9025 ceph_assert(mdr->more()->inode_import.length() > 0);
7c673cae
FG
9026
9027 map<client_t,Capability::Import> imported_caps;
9028
9029 // finish cap imports
28e407b8 9030 finish_force_open_sessions(mdr->more()->imported_session_map);
7c673cae
FG
9031 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
9032 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
28e407b8
AA
9033 mdr->more()->srcdn_auth_mds, true,
9034 mdr->more()->imported_session_map,
9035 mdr->more()->cap_imports[destdnl->get_inode()],
9036 imported_caps);
7c673cae
FG
9037 }
9038
9039 mdr->more()->inode_import.clear();
11fdf7f2 9040 encode(imported_caps, mdr->more()->inode_import);
7c673cae
FG
9041
9042 /* hack: add an auth pin for each xlock we hold. These were
9043 * remote xlocks previously, but now they're local and
9044 * we're going to try to unpin them when we xlock_finish. */
11fdf7f2
TL
9045
9046 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
9047 i != mdr->locks.end();
9048 ++i) {
9049 SimpleLock *lock = i->lock;
9050 if (lock->get_parent() != destdnl->get_inode())
9051 break;
9052 if (i->is_xlock() && !lock->is_locallock())
9053 mds->locker->xlock_import(lock);
9054 }
7c673cae
FG
9055
9056 // hack: fix auth bit
9057 in->state_set(CInode::STATE_AUTH);
7c673cae
FG
9058
9059 mdr->clear_ambiguous_auth();
9060 }
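 // srci migrated with the rename: client sessions were force-opened, caps
 // imported, the resulting cap map re-encoded into inode_import for the
 // commit phase, and our held xlocks given matching local auth pins.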
9061
11fdf7f2 9062 if (destdn->is_auth())
f67539c2 9063 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
7c673cae
FG
9064 }
9065
9066 // src
9067 if (srcdn->is_auth())
9068 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
9069 srcdn->pop_projected_linkage();
f67539c2 9070 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 9071 ceph_assert(!srcdn->is_projected()); // no other projected
7c673cae
FG
9072
9073 // apply remaining projected inodes (nested)
9074 mdr->apply();
9075
9076 // update subtree map?
11fdf7f2 9077 if (destdnl->is_primary() && in->is_dir())
224ce89b 9078 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
7c673cae
FG
9079
9080 if (straydn && oldin->is_dir())
9081 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
9082
11fdf7f2
TL
9083 if (new_oldin_snaprealm)
9084 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
9085 if (new_in_snaprealm)
9086 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
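 // a realm created by this rename splits its parent realm: clients holding
 // caps on inodes that moved under the new realm are notified so they track
 // snapshots against the correct realm.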
9087
7c673cae
FG
9088 // removing a new dn?
9089 if (srcdn->is_auth())
9090 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
9091}
9092
9093
9094
9095// ------------
f67539c2 9096// PEER
7c673cae 9097
f67539c2 9098class C_MDS_PeerRenamePrep : public ServerLogContext {
7c673cae
FG
9099 CDentry *srcdn, *destdn, *straydn;
9100public:
f67539c2 9101 C_MDS_PeerRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7c673cae
FG
9102 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
9103 void finish(int r) override {
f67539c2 9104 server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
7c673cae
FG
9105 }
9106};
9107
f67539c2 9108class C_MDS_PeerRenameCommit : public ServerContext {
7c673cae
FG
9109 MDRequestRef mdr;
9110 CDentry *srcdn, *destdn, *straydn;
9111public:
f67539c2 9112 C_MDS_PeerRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7c673cae
FG
9113 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
9114 void finish(int r) override {
f67539c2 9115 server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
7c673cae
FG
9116 }
9117};
9118
f67539c2 9119class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
7c673cae
FG
9120 MDRequestRef mdr;
9121public:
f67539c2 9122 C_MDS_PeerRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7c673cae
FG
9123 ServerContext(s), mdr(r) {}
9124 void finish(int r) override {
f67539c2 9125 server->_peer_rename_sessions_flushed(mdr);
7c673cae
FG
9126 }
9127};
9128
f67539c2 9129void Server::handle_peer_rename_prep(MDRequestRef& mdr)
7c673cae 9130{
f67539c2
TL
9131 dout(10) << "handle_peer_rename_prep " << *mdr
9132 << " " << mdr->peer_request->srcdnpath
9133 << " to " << mdr->peer_request->destdnpath
7c673cae 9134 << dendl;
31f18b77 9135
f67539c2
TL
9136 if (mdr->peer_request->is_interrupted()) {
9137 dout(10) << " peer request interrupted, sending noop reply" << dendl;
9138 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
31f18b77 9139 reply->mark_interrupted();
f67539c2
TL
9140 mds->send_message_mds(reply, mdr->peer_to_mds);
9141 mdr->reset_peer_request();
31f18b77
FG
9142 return;
9143 }
9144
7c673cae 9145 // discover destdn
f67539c2 9146 filepath destpath(mdr->peer_request->destdnpath);
7c673cae
FG
9147 dout(10) << " dest " << destpath << dendl;
9148 vector<CDentry*> trace;
f67539c2 9149 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
9f95a23c
TL
9150 int r = mdcache->path_traverse(mdr, cf, destpath,
9151 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
9152 &trace);
7c673cae 9153 if (r > 0) return;
f67539c2 9154 if (r == -CEPHFS_ESTALE) {
7c673cae 9155 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
f67539c2 9156 mdr->peer_to_mds, true);
7c673cae
FG
9157 return;
9158 }
11fdf7f2 9159 ceph_assert(r == 0); // we shouldn't get an error here!
7c673cae 9160
91327a77 9161 CDentry *destdn = trace.back();
7c673cae
FG
9162 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9163 dout(10) << " destdn " << *destdn << dendl;
9164 mdr->pin(destdn);
9165
9166 // discover srcdn
f67539c2 9167 filepath srcpath(mdr->peer_request->srcdnpath);
7c673cae
FG
9168 dout(10) << " src " << srcpath << dendl;
9169 CInode *srci = nullptr;
9f95a23c
TL
9170 r = mdcache->path_traverse(mdr, cf, srcpath,
9171 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
9172 &trace, &srci);
7c673cae 9173 if (r > 0) return;
11fdf7f2 9174 ceph_assert(r == 0);
7c673cae 9175
91327a77 9176 CDentry *srcdn = trace.back();
7c673cae
FG
9177 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9178 dout(10) << " srcdn " << *srcdn << dendl;
9179 mdr->pin(srcdn);
9180 mdr->pin(srci);
9181
9182 // stray?
11fdf7f2
TL
9183 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
9184 if (linkmerge)
9185 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
7c673cae
FG
9186 CDentry *straydn = mdr->straydn;
9187 if (destdnl->is_primary() && !linkmerge)
11fdf7f2 9188 ceph_assert(straydn);
7c673cae 9189
f67539c2 9190 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7c673cae
FG
9191 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
9192
9193 // set up commit waiter (early, to clean up any freezing etc we do)
f67539c2
TL
9194 if (!mdr->more()->peer_commit)
9195 mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);
7c673cae
FG
9196
9197 // am i srcdn auth?
9198 if (srcdn->is_auth()) {
9199 set<mds_rank_t> srcdnrep;
9200 srcdn->list_replicas(srcdnrep);
9201
9202 bool reply_witness = false;
9203 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
9204 // freeze?
9205 // we need this to
9206 // - avoid conflicting lock state changes
9207 // - avoid concurrent updates to the inode
9208 // (this could also be accomplished with the versionlock)
11fdf7f2 9209 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
7c673cae
FG
9210 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
9211 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
9212
9213 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9214 if (srcdnl->get_inode()->is_frozen_auth_pin())
9215 mdr->unfreeze_auth_pin();
9216
9217 if (!frozen_inode) {
9218 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
9219 return;
9220 }
9221
9222 /*
9223 * set ambiguous auth for srci
9224 * NOTE: we don't worry about ambiguous cache expire as we do
f67539c2 9225 * with subtree migrations because all peers will pin
7c673cae
FG
9226 * srcdn->get_inode() for duration of this rename.
9227 */
9228 mdr->set_ambiguous_auth(srcdnl->get_inode());
9229
9230 // just mark the source inode as ambiguous auth if more than two MDS are involved.
f67539c2
TL
9231 // the leader will send another OP_RENAMEPREP peer request later.
9232 if (mdr->peer_request->witnesses.size() > 1) {
7c673cae
FG
9233 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
9234 reply_witness = true;
9235 }
9236
9237 // make sure bystanders have received all lock related messages
9238 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
f67539c2 9239 if (*p == mdr->peer_to_mds ||
7c673cae
FG
9240 (mds->is_cluster_degraded() &&
9241 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
9242 continue;
f67539c2 9243 auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
7c673cae 9244 mds->send_message_mds(notify, *p);
f67539c2 9245 mdr->more()->waiting_on_peer.insert(*p);
7c673cae
FG
9246 }
9247
9248 // make sure clients have received all cap related messages
9249 set<client_t> export_client_set;
9250 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
9251
9252 MDSGatherBuilder gather(g_ceph_context);
9253 flush_client_sessions(export_client_set, gather);
9254 if (gather.has_subs()) {
f67539c2
TL
9255 mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
9256 gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
7c673cae
FG
9257 gather.activate();
9258 }
9259 }
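 // witnessing a primary rename therefore needs: srci frozen (no new auth
 // pins), RENAMENOTIFY acks from every bystander replica, and flushed
 // sessions for every client holding caps on srci.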
9260
9261 // is witness list sufficient?
9262 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
f67539c2
TL
9263 if (*p == mdr->peer_to_mds ||
9264 mdr->peer_request->witnesses.count(*p)) continue;
7c673cae
FG
9265 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
9266 reply_witness = true;
9267 break;
9268 }
9269
9270 if (reply_witness) {
11fdf7f2 9271 ceph_assert(!srcdnrep.empty());
f67539c2 9272 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
7c673cae 9273 reply->witnesses.swap(srcdnrep);
f67539c2
TL
9274 mds->send_message_mds(reply, mdr->peer_to_mds);
9275 mdr->reset_peer_request();
7c673cae
FG
9276 return;
9277 }
9278 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
f67539c2 9279 if (!mdr->more()->waiting_on_peer.empty()) {
7c673cae 9280 dout(10) << " still waiting for rename notify acks from "
f67539c2 9281 << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
9282 return;
9283 }
9284 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
9285 // set ambiguous auth for srci on witnesses
9286 mdr->set_ambiguous_auth(srcdnl->get_inode());
9287 }
9288
9289 // encode everything we'd need to roll this back... basically, just the original state.
9290 rename_rollback rollback;
9291
9292 rollback.reqid = mdr->reqid;
9293
9294 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
9295 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9296 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2 9297 rollback.orig_src.dname = srcdn->get_name();
7c673cae
FG
9298 if (srcdnl->is_primary())
9299 rollback.orig_src.ino = srcdnl->get_inode()->ino();
9300 else {
11fdf7f2 9301 ceph_assert(srcdnl->is_remote());
7c673cae
FG
9302 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
9303 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
9304 }
9305
9306 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
9307 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9308 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2 9309 rollback.orig_dest.dname = destdn->get_name();
7c673cae
FG
9310 if (destdnl->is_primary())
9311 rollback.orig_dest.ino = destdnl->get_inode()->ino();
9312 else if (destdnl->is_remote()) {
9313 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
9314 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
9315 }
9316
9317 if (straydn) {
9318 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
9319 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
9320 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2
TL
9321 rollback.stray.dname = straydn->get_name();
9322 }
f67539c2 9323 if (mdr->peer_request->desti_snapbl.length()) {
11fdf7f2
TL
9324 CInode *oldin = destdnl->get_inode();
9325 if (oldin->snaprealm) {
9326 encode(true, rollback.desti_snapbl);
9327 oldin->encode_snap_blob(rollback.desti_snapbl);
9328 } else {
9329 encode(false, rollback.desti_snapbl);
9330 }
9331 }
f67539c2 9332 if (mdr->peer_request->srci_snapbl.length()) {
11fdf7f2
TL
9333 if (srci->snaprealm) {
9334 encode(true, rollback.srci_snapbl);
9335 srci->encode_snap_blob(rollback.srci_snapbl);
9336 } else {
9337 encode(false, rollback.srci_snapbl);
9338 }
7c673cae 9339 }
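 // sketch of the record serialized below (fields as used in this file; the
 // actual struct definition lives elsewhere in the MDS sources):
 //
 // rename_rollback {
 // metareqid_t reqid;
 // drec orig_src, orig_dest, stray; // dirfrag, dname, ino/remote_ino,
 // // old mtime/rctime, remote d_type
 // utime_t ctime;
 // bufferlist srci_snapbl, desti_snapbl; // optional realm snapshots
 // }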
11fdf7f2
TL
9340 encode(rollback, mdr->more()->rollback_bl);
9341 // FIXME: rollback snaprealm
7c673cae
FG
9342 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
9343
9344 // journal.
9345 mdr->ls = mdlog->get_current_segment();
f67539c2
TL
9346 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
9347 EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
7c673cae
FG
9348 mdlog->start_entry(le);
9349 le->rollback = mdr->more()->rollback_bl;
9350
f67539c2
TL
9351 bufferlist blah; // inode import data... obviously not used if we're the peer
9352 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);
7c673cae
FG
9353
9354 if (le->commit.empty()) {
9355 dout(10) << " empty metablob, skipping journal" << dendl;
9356 mdlog->cancel_entry(le);
9357 mdr->ls = NULL;
f67539c2 9358 _logged_peer_rename(mdr, srcdn, destdn, straydn);
7c673cae 9359 } else {
f67539c2
TL
9360 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
9361 mdr->more()->peer_update_journaled = true;
9362 submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
7c673cae
FG
9363 mdr, __func__);
9364 mdlog->flush();
9365 }
9366}
9367
f67539c2 9368void Server::_logged_peer_rename(MDRequestRef& mdr,
7c673cae
FG
9369 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9370{
f67539c2 9371 dout(10) << "_logged_peer_rename " << *mdr << dendl;
7c673cae
FG
9372
9373 // prepare ack
f67539c2 9374 ref_t<MMDSPeerRequest> reply;
7c673cae 9375 if (!mdr->aborted) {
f67539c2
TL
9376 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9377 if (!mdr->more()->peer_update_journaled)
7c673cae
FG
9378 reply->mark_not_journaled();
9379 }
9380
9381 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7c673cae
FG
9382 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
9383
9384 // export srci?
9385 if (srcdn->is_auth() && srcdnl->is_primary()) {
9386 // set export bounds for CInode::encode_export()
11fdf7f2 9387 if (reply) {
9f95a23c 9388 std::vector<CDir*> bounds;
11fdf7f2
TL
9389 if (srcdnl->get_inode()->is_dir()) {
9390 srcdnl->get_inode()->get_dirfrags(bounds);
9f95a23c
TL
9391 for (const auto& bound : bounds) {
9392 bound->state_set(CDir::STATE_EXPORTBOUND);
9393 }
11fdf7f2 9394 }
7c673cae 9395
11fdf7f2
TL
9396 map<client_t,entity_inst_t> exported_client_map;
9397 map<client_t, client_metadata_t> exported_client_metadata_map;
9398 bufferlist inodebl;
9399 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
9400 exported_client_map,
9401 exported_client_metadata_map);
7c673cae 9402
9f95a23c
TL
9403 for (const auto& bound : bounds) {
9404 bound->state_clear(CDir::STATE_EXPORTBOUND);
9405 }
7c673cae 9406
11fdf7f2
TL
9407 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
9408 encode(exported_client_metadata_map, reply->inode_export);
7c673cae 9409 reply->inode_export.claim_append(inodebl);
f67539c2 9410 reply->inode_export_v = srcdnl->get_inode()->get_version();
7c673cae
FG
9411 }
9412
9413 // remove mdr auth pin
9414 mdr->auth_unpin(srcdnl->get_inode());
9415 mdr->more()->is_inode_exporter = true;
9416
9417 if (srcdnl->get_inode()->is_dirty())
9418 srcdnl->get_inode()->mark_clean();
9419
9420 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
9421 }
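 // srci has now been exported: the prep ack carries the encoded inode plus
 // the client cap and metadata maps, the local replica is marked clean, and
 // the mdr's auth pin on srci is dropped.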
9422
9423 // apply
9424 _rename_apply(mdr, srcdn, destdn, straydn);
11fdf7f2
TL
9425
9426 CDentry::linkage_t *destdnl = destdn->get_linkage();
7c673cae
FG
9427
9428 // bump popularity
11fdf7f2 9429 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
7c673cae 9430 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
11fdf7f2 9431 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
7c673cae
FG
9432
9433 // done.
f67539c2 9434 mdr->reset_peer_request();
7c673cae
FG
9435 mdr->straydn = 0;
9436
9437 if (reply) {
f67539c2 9438 mds->send_message_mds(reply, mdr->peer_to_mds);
7c673cae 9439 } else {
11fdf7f2 9440 ceph_assert(mdr->aborted);
7c673cae
FG
9441 dout(10) << " abort flag set, finishing" << dendl;
9442 mdcache->request_finish(mdr);
9443 }
9444}
9445
f67539c2 9446void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
7c673cae
FG
9447 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9448{
f67539c2 9449 dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;
7c673cae 9450
f64942e4
AA
9451 CInode *in = destdn->get_linkage()->get_inode();
9452
9453 inodeno_t migrated_stray;
9454 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
9455 migrated_stray = in->ino();
7c673cae 9456
11fdf7f2 9457 MDSContext::vec finished;
7c673cae
FG
9458 if (r == 0) {
9459 // unfreeze+singleauth inode
9460 // hmm, do i really need to delay this?
9461 if (mdr->more()->is_inode_exporter) {
7c673cae
FG
9462 // drop our pins
9463 // we exported, clear out any xlocks that we moved to another MDS
7c673cae 9464
11fdf7f2
TL
9465 for (auto i = mdr->locks.lower_bound(&in->versionlock);
9466 i != mdr->locks.end(); ) {
9467 SimpleLock *lock = i->lock;
9468 if (lock->get_parent() != in)
9469 break;
7c673cae 9470 // we only care about xlocks on the exported inode
11fdf7f2
TL
9471 if (i->is_xlock() && !lock->is_locallock())
9472 mds->locker->xlock_export(i++, mdr.get());
9473 else
9474 ++i;
7c673cae
FG
9475 }
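 // xlocks that moved with the inode are exported rather than released here;
 // the importer took matching local auth pins for them via xlock_import in
 // _rename_apply.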
9476
9477 map<client_t,Capability::Import> peer_imported;
11fdf7f2
TL
9478 auto bp = mdr->more()->inode_import.cbegin();
9479 decode(peer_imported, bp);
7c673cae 9480
f64942e4 9481 dout(10) << " finishing inode export on " << *in << dendl;
f67539c2 9482 mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
7c673cae
FG
9483 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
9484
9485 // unfreeze
11fdf7f2 9486 ceph_assert(in->is_frozen_inode());
f64942e4 9487 in->unfreeze_inode(finished);
7c673cae
FG
9488 }
9489
9490 // singleauth
9491 if (mdr->more()->is_ambiguous_auth) {
9492 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9493 mdr->more()->is_ambiguous_auth = false;
9494 }
9495
f67539c2 9496 if (straydn && mdr->more()->peer_update_journaled) {
31f18b77
FG
9497 CInode *strayin = straydn->get_projected_linkage()->get_inode();
9498 if (strayin && !strayin->snaprealm)
9499 mdcache->clear_dirty_bits_for_stray(strayin);
9500 }
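 // on success, a journaled stray can drop its dirty bits right away unless
 // the unlinked inode has a snaprealm (snapshots may still reference its
 // data).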
7c673cae
FG
9501
9502 mds->queue_waiters(finished);
9503 mdr->cleanup();
9504
f67539c2 9505 if (mdr->more()->peer_update_journaled) {
7c673cae 9506 // write a commit to the journal
f67539c2
TL
9507 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
9508 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
9509 EPeerUpdate::RENAME);
7c673cae 9510 mdlog->start_entry(le);
f67539c2 9511 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7c673cae
FG
9512 mdlog->flush();
9513 } else {
f67539c2 9514 _committed_peer(mdr);
7c673cae
FG
9515 }
9516 } else {
9517
9518 // abort
9519 // rollback_bl may be empty if we froze the inode but had to provide an expanded
f67539c2 9520 // witness list from the leader, and they failed before we tried prep again.
7c673cae
FG
9521 if (mdr->more()->rollback_bl.length()) {
9522 if (mdr->more()->is_inode_exporter) {
f64942e4
AA
9523 dout(10) << " reversing inode export of " << *in << dendl;
9524 in->abort_export();
7c673cae 9525 }
f67539c2
TL
9526 if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
9527 mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
9528 // rollback but preserve the peer request
9529 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
7c673cae
FG
9530 mdr->more()->rollback_bl.clear();
9531 } else
f67539c2 9532 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
7c673cae 9533 } else {
f67539c2 9534 dout(10) << " rollback_bl empty, not rolling back rename (leader failed after getting extra witnesses?)" << dendl;
7c673cae
FG
9535 // singleauth
9536 if (mdr->more()->is_ambiguous_auth) {
9537 if (srcdn->is_auth())
9538 mdr->more()->rename_inode->unfreeze_inode(finished);
9539
9540 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9541 mdr->more()->is_ambiguous_auth = false;
9542 }
9543 mds->queue_waiters(finished);
9544 mdcache->request_finish(mdr);
9545 }
9546 }
f64942e4
AA
9547
9548 if (migrated_stray && mds->is_stopping())
9549 mdcache->shutdown_export_stray_finish(migrated_stray);
7c673cae
FG
9550}
9551
f67539c2
TL
9552static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
9553 rename_rollback::drec &r, utime_t ctime,
9554 bool isdir, const nest_info_t &rstat)
7c673cae 9555{
f67539c2 9556 auto pf = dir->project_fnode(mut);
7c673cae
FG
9557 pf->version = dir->pre_dirty();
9558
9559 if (isdir) {
f67539c2 9560 pf->fragstat.nsubdirs += 1;
7c673cae 9561 } else {
f67539c2 9562 pf->fragstat.nfiles += 1;
7c673cae
FG
9563 }
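 // re-add the dentry's contribution to the fragment: one subdir or file in
 // fragstat, plus (for a primary link) the inode's accounted rstat below.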
9564 if (r.ino) {
f67539c2
TL
9565 pf->rstat.rbytes += rstat.rbytes;
9566 pf->rstat.rfiles += rstat.rfiles;
9567 pf->rstat.rsubdirs += rstat.rsubdirs;
9568 pf->rstat.rsnaps += rstat.rsnaps;
7c673cae
FG
9569 }
9570 if (pf->fragstat.mtime == ctime) {
9571 pf->fragstat.mtime = r.dirfrag_old_mtime;
9572 if (pf->rstat.rctime == ctime)
9573 pf->rstat.rctime = r.dirfrag_old_rctime;
9574 }
9575 mut->add_updated_lock(&dir->get_inode()->filelock);
9576 mut->add_updated_lock(&dir->get_inode()->nestlock);
9577}
9578
9579struct C_MDS_LoggedRenameRollback : public ServerLogContext {
9580 MutationRef mut;
9581 CDentry *srcdn;
9582 version_t srcdnpv;
9583 CDentry *destdn;
9584 CDentry *straydn;
9f95a23c 9585 map<client_t,ref_t<MClientSnap>> splits[2];
7c673cae
FG
9586 bool finish_mdr;
9587 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
11fdf7f2 9588 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
9f95a23c 9589 map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
7c673cae 9590 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
11fdf7f2
TL
9591 straydn(st), finish_mdr(f) {
9592 splits[0].swap(_splits[0]);
9593 splits[1].swap(_splits[1]);
9594 }
7c673cae
FG
9595 void finish(int r) override {
9596 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
11fdf7f2 9597 destdn, straydn, splits, finish_mdr);
7c673cae
FG
9598 }
9599};
9600
f67539c2 9601void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
7c673cae
FG
9602 bool finish_mdr)
9603{
9604 rename_rollback rollback;
11fdf7f2
TL
9605 auto p = rbl.cbegin();
9606 decode(rollback, p);
7c673cae
FG
9607
9608 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
9609 // need to finish this update before sending resolve to claim the subtree
f67539c2 9610 mdcache->add_rollback(rollback.reqid, leader);
7c673cae
FG
9611
9612 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
9613 mut->ls = mds->mdlog->get_current_segment();
9614
9615 CDentry *srcdn = NULL;
9616 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
9617 if (!srcdir)
9618 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
9619 if (srcdir) {
9620 dout(10) << " srcdir " << *srcdir << dendl;
9621 srcdn = srcdir->lookup(rollback.orig_src.dname);
9622 if (srcdn) {
9623 dout(10) << " srcdn " << *srcdn << dendl;
11fdf7f2 9624 ceph_assert(srcdn->get_linkage()->is_null());
7c673cae
FG
9625 } else
9626 dout(10) << " srcdn not found" << dendl;
9627 } else
9628 dout(10) << " srcdir not found" << dendl;
9629
9630 CDentry *destdn = NULL;
9631 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
9632 if (!destdir)
9633 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
9634 if (destdir) {
9635 dout(10) << " destdir " << *destdir << dendl;
9636 destdn = destdir->lookup(rollback.orig_dest.dname);
9637 if (destdn)
9638 dout(10) << " destdn " << *destdn << dendl;
9639 else
9640 dout(10) << " destdn not found" << dendl;
9641 } else
9642 dout(10) << " destdir not found" << dendl;
9643
9644 CInode *in = NULL;
9645 if (rollback.orig_src.ino) {
9646 in = mdcache->get_inode(rollback.orig_src.ino);
9647 if (in && in->is_dir())
11fdf7f2 9648 ceph_assert(srcdn && destdn);
7c673cae
FG
9649 } else
9650 in = mdcache->get_inode(rollback.orig_src.remote_ino);
9651
9652 CDir *straydir = NULL;
9653 CDentry *straydn = NULL;
9654 if (rollback.stray.dirfrag.ino) {
9655 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
9656 if (straydir) {
9657 dout(10) << "straydir " << *straydir << dendl;
9658 straydn = straydir->lookup(rollback.stray.dname);
9659 if (straydn) {
9660 dout(10) << " straydn " << *straydn << dendl;
11fdf7f2 9661 ceph_assert(straydn->get_linkage()->is_primary());
7c673cae
FG
9662 } else
9663 dout(10) << " straydn not found" << dendl;
9664 } else
9665 dout(10) << "straydir not found" << dendl;
9666 }
9667
9668 CInode *target = NULL;
9669 if (rollback.orig_dest.ino) {
9670 target = mdcache->get_inode(rollback.orig_dest.ino);
9671 if (target)
11fdf7f2 9672 ceph_assert(destdn && straydn);
7c673cae
FG
9673 } else if (rollback.orig_dest.remote_ino)
9674 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
9675
9676 // can't use is_auth() in the resolve stage
9677 mds_rank_t whoami = mds->get_nodeid();
f67539c2 9678 // peer
11fdf7f2
TL
9679 ceph_assert(!destdn || destdn->authority().first != whoami);
9680 ceph_assert(!straydn || straydn->authority().first != whoami);
7c673cae
FG
9681
9682 bool force_journal_src = false;
9683 bool force_journal_dest = false;
9684 if (in && in->is_dir() && srcdn->authority().first != whoami)
9685 force_journal_src = _need_force_journal(in, false);
9686 if (in && target && target->is_dir())
9687 force_journal_dest = _need_force_journal(in, true);
9688
9689 version_t srcdnpv = 0;
9690 // repair src
9691 if (srcdn) {
9692 if (srcdn->authority().first == whoami)
9693 srcdnpv = srcdn->pre_dirty();
9694 if (rollback.orig_src.ino) {
11fdf7f2 9695 ceph_assert(in);
7c673cae
FG
9696 srcdn->push_projected_linkage(in);
9697 } else
9698 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
9699 rollback.orig_src.remote_d_type);
9700 }
9701
9f95a23c 9702 map<client_t,ref_t<MClientSnap>> splits[2];
11fdf7f2 9703
f67539c2 9704 const CInode::mempool_inode *pip = nullptr;
7c673cae 9705 if (in) {
11fdf7f2 9706 bool projected;
f67539c2
TL
9707 CDir *pdir = in->get_projected_parent_dir();
9708 if (pdir->authority().first == whoami) {
9709 auto pi = in->project_inode(mut);
9710 pi.inode->version = in->pre_dirty();
9711 if (pdir != srcdir) {
9712 auto pf = pdir->project_fnode(mut);
9713 pf->version = pdir->pre_dirty();
9714 }
9715 if (pi.inode->ctime == rollback.ctime)
9716 pi.inode->ctime = rollback.orig_src.old_ctime;
11fdf7f2
TL
9717 projected = true;
9718 } else {
f67539c2
TL
9719 if (in->get_inode()->ctime == rollback.ctime) {
9720 auto _inode = CInode::allocate_inode(*in->get_inode());
9721 _inode->ctime = rollback.orig_src.old_ctime;
9722 in->reset_inode(_inode);
9723 }
11fdf7f2
TL
9724 projected = false;
9725 }
f67539c2 9726 pip = in->get_projected_inode().get();
11fdf7f2
TL
9727
9728 if (rollback.srci_snapbl.length() && in->snaprealm) {
9729 bool hadrealm;
9730 auto p = rollback.srci_snapbl.cbegin();
9731 decode(hadrealm, p);
9732 if (hadrealm) {
9733 if (projected && !mds->is_resolve()) {
9734 sr_t *new_srnode = new sr_t();
9735 decode(*new_srnode, p);
9736 in->project_snaprealm(new_srnode);
9737 } else
9738 decode(in->snaprealm->srnode, p);
9739 } else {
9740 SnapRealm *realm;
9741 if (rollback.orig_src.ino) {
9742 ceph_assert(srcdir);
9743 realm = srcdir->get_inode()->find_snaprealm();
9744 } else {
9745 realm = in->snaprealm->parent;
9746 }
9747 if (!mds->is_resolve())
9748 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
9749 if (projected)
9750 in->project_snaprealm(NULL);
9751 else
9752 in->snaprealm->merge_to(realm);
9753 }
9754 }
7c673cae
FG
9755 }
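 // srci's snaprealm is now back to its pre-rename state: either the old
 // srnode was reinstated, or a realm created by the rename was merged back
 // into the parent realm (client notifications queued in splits[0]).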
9756
7c673cae
FG
9757 // repair dest
9758 if (destdn) {
9759 if (rollback.orig_dest.ino && target) {
9760 destdn->push_projected_linkage(target);
9761 } else if (rollback.orig_dest.remote_ino) {
9762 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
9763 rollback.orig_dest.remote_d_type);
9764 } else {
9765 // the dentry will be trimmed soon; it's ok to have the wrong linkage
9766 if (rollback.orig_dest.ino)
11fdf7f2 9767 ceph_assert(mds->is_resolve());
7c673cae
FG
9768 destdn->push_projected_linkage();
9769 }
9770 }
9771
9772 if (straydn)
9773 straydn->push_projected_linkage();
9774
9775 if (target) {
11fdf7f2 9776 bool projected;
f67539c2
TL
9777 CInode::inode_ptr ti;
9778 CDir *pdir = target->get_projected_parent_dir();
9779 if (pdir->authority().first == whoami) {
9780 auto pi = target->project_inode(mut);
9781 pi.inode->version = target->pre_dirty();
9782 if (pdir != srcdir) {
9783 auto pf = pdir->project_fnode(mut);
9784 pf->version = pdir->pre_dirty();
9785 }
9786 ti = pi.inode;
11fdf7f2
TL
9787 projected = true;
9788 } else {
f67539c2 9789 ti = CInode::allocate_inode(*target->get_inode());
11fdf7f2
TL
9790 projected = false;
9791 }
f67539c2 9792
7c673cae 9793 if (ti->ctime == rollback.ctime)
91327a77 9794 ti->ctime = rollback.orig_dest.old_ctime;
7c673cae
FG
9795 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
9796 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
11fdf7f2 9797 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
7c673cae 9798 else
11fdf7f2 9799 ceph_assert(rollback.orig_dest.remote_ino &&
7c673cae
FG
9800 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
9801 } else
9802 ti->nlink++;
11fdf7f2 9803
f67539c2
TL
9804 if (!projected)
9805 target->reset_inode(ti);
9806
11fdf7f2
TL
9807 if (rollback.desti_snapbl.length() && target->snaprealm) {
9808 bool hadrealm;
9809 auto p = rollback.desti_snapbl.cbegin();
9810 decode(hadrealm, p);
9811 if (hadrealm) {
9812 if (projected && !mds->is_resolve()) {
9813 sr_t *new_srnode = new sr_t();
9814 decode(*new_srnode, p);
9815 target->project_snaprealm(new_srnode);
9816 } else
9817 decode(target->snaprealm->srnode, p);
9818 } else {
9819 SnapRealm *realm;
9820 if (rollback.orig_dest.ino) {
9821 ceph_assert(destdir);
9822 realm = destdir->get_inode()->find_snaprealm();
9823 } else {
9824 realm = target->snaprealm->parent;
9825 }
9826 if (!mds->is_resolve())
9827 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
9828 if (projected)
9829 target->project_snaprealm(NULL);
9830 else
9831 target->snaprealm->merge_to(realm);
9832 }
9833 }
7c673cae
FG
9834 }
9835
f67539c2
TL
9836 if (srcdn && srcdn->authority().first == whoami) {
9837 nest_info_t blah;
9838 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
9839 in && in->is_dir(), pip ? pip->accounted_rstat : blah);
9840 }
9841
7c673cae
FG
9842 if (srcdn)
9843 dout(0) << " srcdn back to " << *srcdn << dendl;
9844 if (in)
9845 dout(0) << " srci back to " << *in << dendl;
9846 if (destdn)
9847 dout(0) << " destdn back to " << *destdn << dendl;
9848 if (target)
9849 dout(0) << " desti back to " << *target << dendl;
9850
9851 // journal it
f67539c2
TL
9852 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
9853 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
7c673cae
FG
9854 mdlog->start_entry(le);
9855
9856 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
9857 le->commit.add_dir_context(srcdir);
9858 if (rollback.orig_src.ino)
9859 le->commit.add_primary_dentry(srcdn, 0, true);
9860 else
9861 le->commit.add_remote_dentry(srcdn, true);
9862 }
9863
9864 if (!rollback.orig_src.ino && // remote linkage
9865 in && in->authority().first == whoami) {
9866 le->commit.add_dir_context(in->get_projected_parent_dir());
9867 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9868 }
9869
9870 if (force_journal_dest) {
11fdf7f2 9871 ceph_assert(rollback.orig_dest.ino);
7c673cae
FG
9872 le->commit.add_dir_context(destdir);
9873 le->commit.add_primary_dentry(destdn, 0, true);
9874 }
9875
f67539c2 9876 // peer: no need to journal straydn
7c673cae
FG
9877
9878 if (target && target != in && target->authority().first == whoami) {
11fdf7f2 9879 ceph_assert(rollback.orig_dest.remote_ino);
7c673cae
FG
9880 le->commit.add_dir_context(target->get_projected_parent_dir());
9881 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
9882 }
9883
9884 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
9885 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
9886 le->commit.renamed_dirino = in->ino();
9887 if (srcdn->authority().first == whoami) {
9f95a23c
TL
9888 auto&& ls = in->get_dirfrags();
9889 for (const auto& dir : ls) {
7c673cae
FG
9890 if (!dir->is_auth())
9891 le->commit.renamed_dir_frags.push_back(dir->get_frag());
9892 }
9893 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
9894 }
9895 } else if (force_journal_dest) {
9896 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
9897 le->commit.renamed_dirino = target->ino();
9898 }
9899
9900 if (target && target->is_dir()) {
11fdf7f2 9901 ceph_assert(destdn);
7c673cae
FG
9902 mdcache->project_subtree_rename(target, straydir, destdir);
9903 }
9904
9905 if (in && in->is_dir()) {
11fdf7f2 9906 ceph_assert(srcdn);
7c673cae
FG
9907 mdcache->project_subtree_rename(in, destdir, srcdir);
9908 }
9909
f67539c2 9910 if (mdr && !mdr->more()->peer_update_journaled) {
11fdf7f2 9911 ceph_assert(le->commit.empty());
7c673cae
FG
9912 mdlog->cancel_entry(le);
9913 mut->ls = NULL;
11fdf7f2 9914 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
7c673cae 9915 } else {
11fdf7f2 9916 ceph_assert(!le->commit.empty());
7c673cae 9917 if (mdr)
f67539c2 9918 mdr->more()->peer_update_journaled = false;
11fdf7f2
TL
9919 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
9920 srcdn, srcdnpv, destdn, straydn,
9921 splits, finish_mdr);
7c673cae
FG
9922 submit_mdlog_entry(le, fin, mdr, __func__);
9923 mdlog->flush();
9924 }
9925}
9926
9927void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
11fdf7f2 9928 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
9f95a23c 9929 map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
7c673cae
FG
9930{
9931 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
9932
9933 if (straydn) {
9934 straydn->get_dir()->unlink_inode(straydn);
9935 straydn->pop_projected_linkage();
9936 }
9937 if (destdn) {
9938 destdn->get_dir()->unlink_inode(destdn);
9939 destdn->pop_projected_linkage();
9940 }
9941 if (srcdn) {
9942 srcdn->pop_projected_linkage();
11fdf7f2 9943 if (srcdn->authority().first == mds->get_nodeid()) {
7c673cae 9944 srcdn->mark_dirty(srcdnpv, mut->ls);
11fdf7f2
TL
9945 if (srcdn->get_linkage()->is_primary())
9946 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
9947 }
7c673cae
FG
9948 }
9949
9950 mut->apply();
9951
9952 if (srcdn && srcdn->get_linkage()->is_primary()) {
9953 CInode *in = srcdn->get_linkage()->get_inode();
7c673cae 9954 if (in && in->is_dir()) {
11fdf7f2 9955 ceph_assert(destdn);
7c673cae
FG
9956 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
9957 }
9958 }
9959
9960 if (destdn) {
9961 CInode *oldin = destdn->get_linkage()->get_inode();
9962 // update subtree map?
9963 if (oldin && oldin->is_dir()) {
11fdf7f2 9964 ceph_assert(straydn);
7c673cae
FG
9965 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
9966 }
9967 }
9968
9969 if (mds->is_resolve()) {
9970 CDir *root = NULL;
9971 if (straydn)
9972 root = mdcache->get_subtree_root(straydn->get_dir());
9973 else if (destdn)
9974 root = mdcache->get_subtree_root(destdn->get_dir());
9975 if (root)
9976 mdcache->try_trim_non_auth_subtree(root);
11fdf7f2
TL
9977 } else {
9978 mdcache->send_snaps(splits[1]);
9979 mdcache->send_snaps(splits[0]);
7c673cae
FG
9980 }
9981
9982 if (mdr) {
11fdf7f2 9983 MDSContext::vec finished;
7c673cae
FG
9984 if (mdr->more()->is_ambiguous_auth) {
9985 if (srcdn->is_auth())
9986 mdr->more()->rename_inode->unfreeze_inode(finished);
9987
9988 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9989 mdr->more()->is_ambiguous_auth = false;
9990 }
9991 mds->queue_waiters(finished);
9992 if (finish_mdr || mdr->aborted)
9993 mdcache->request_finish(mdr);
9994 else
f67539c2 9995 mdr->more()->peer_rolling_back = false;
7c673cae
FG
9996 }
9997
e306af50 9998 mdcache->finish_rollback(mut->reqid, mdr);
7c673cae
FG
9999
10000 mut->cleanup();
10001}
10002
f67539c2 10003void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7c673cae 10004{
f67539c2 10005 dout(10) << "handle_peer_rename_prep_ack " << *mdr
7c673cae
FG
10006 << " witnessed by " << ack->get_source()
10007 << " " << *ack << dendl;
10008 mds_rank_t from = mds_rank_t(ack->get_source().num());
10009
f67539c2
TL
10010 // note peer
10011 mdr->more()->peers.insert(from);
7c673cae
FG
10012 if (mdr->more()->srcdn_auth_mds == from &&
10013 mdr->more()->is_remote_frozen_authpin &&
10014 !mdr->more()->is_ambiguous_auth) {
10015 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
10016 }
10017
10018 // witnessed? or add extra witnesses?
11fdf7f2 10019 ceph_assert(mdr->more()->witnessed.count(from) == 0);
31f18b77 10020 if (ack->is_interrupted()) {
f67539c2 10021 dout(10) << " peer request interrupted, noop" << dendl;
31f18b77 10022 } else if (ack->witnesses.empty()) {
7c673cae
FG
10023 mdr->more()->witnessed.insert(from);
10024 if (!ack->is_not_journaled())
f67539c2 10025 mdr->more()->has_journaled_peers = true;
7c673cae
FG
10026 } else {
10027 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
11fdf7f2 10028 mdr->more()->extra_witnesses = ack->witnesses;
7c673cae
FG
10029 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
10030 }
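 // an ack that carries witnesses is a refusal: the srcdn auth wants the
 // listed replicas added as witnesses first, so the leader will re-issue
 // OP_RENAMEPREP with the expanded set.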
10031
10032 // srci import?
10033 if (ack->inode_export.length()) {
10034 dout(10) << " got srci import" << dendl;
11fdf7f2 10035 mdr->more()->inode_import.share(ack->inode_export);
7c673cae
FG
10036 mdr->more()->inode_import_v = ack->inode_export_v;
10037 }
10038
10039 // remove from waiting list
f67539c2
TL
10040 ceph_assert(mdr->more()->waiting_on_peer.count(from));
10041 mdr->more()->waiting_on_peer.erase(from);
7c673cae 10042
f67539c2 10043 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
10044 dispatch_client_request(mdr); // go again!
10045 else
f67539c2 10046 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
10047}
10048
f67539c2 10049void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7c673cae 10050{
f67539c2 10051 dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
7c673cae 10052 << ack->get_source() << dendl;
f67539c2 10053 ceph_assert(mdr->is_peer());
7c673cae
FG
10054 mds_rank_t from = mds_rank_t(ack->get_source().num());
10055
f67539c2
TL
10056 if (mdr->more()->waiting_on_peer.count(from)) {
10057 mdr->more()->waiting_on_peer.erase(from);
7c673cae 10058
f67539c2
TL
10059 if (mdr->more()->waiting_on_peer.empty()) {
10060 if (mdr->peer_request)
10061 dispatch_peer_request(mdr);
7c673cae
FG
10062 } else
10063 dout(10) << " still waiting for rename notify acks from "
f67539c2 10064 << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
10065 }
10066}
10067
f67539c2 10068void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
7c673cae 10069{
f67539c2 10070 dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
7c673cae 10071
f67539c2
TL
10072 if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
10073 mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);
7c673cae 10074
f67539c2
TL
10075 if (mdr->more()->waiting_on_peer.empty()) {
10076 if (mdr->peer_request)
10077 dispatch_peer_request(mdr);
7c673cae
FG
10078 } else
10079 dout(10) << " still waiting for rename notify acks from "
f67539c2 10080 << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
10081 }
10082}
10083
10084// snaps
10085/* This function takes responsibility for the passed mdr*/
10086void Server::handle_client_lssnap(MDRequestRef& mdr)
10087{
9f95a23c 10088 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
10089
10090 // traverse to path
9f95a23c
TL
10091 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10092 if (!diri)
7c673cae 10093 return;
9f95a23c 10094
7c673cae 10095 if (!diri->is_dir()) {
f67539c2 10096 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
10097 return;
10098 }
10099 dout(10) << "lssnap on " << *diri << dendl;
10100
10101 // lock snap
9f95a23c 10102 if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
7c673cae
FG
10103 return;
10104
10105 if (!check_access(mdr, diri, MAY_READ))
10106 return;
10107
10108 SnapRealm *realm = diri->find_snaprealm();
11fdf7f2 10109 map<snapid_t,const SnapInfo*> infomap;
7c673cae
FG
10110 realm->get_snap_info(infomap, diri->get_oldest_snap());
10111
10112 unsigned max_entries = req->head.args.readdir.max_entries;
10113 if (!max_entries)
10114 max_entries = infomap.size();
10115 int max_bytes = req->head.args.readdir.max_bytes;
10116 if (!max_bytes)
10117 // make sure at least one item can be encoded
11fdf7f2 10118 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
7c673cae
FG
10119
10120 __u64 last_snapid = 0;
10121 string offset_str = req->get_path2();
10122 if (!offset_str.empty())
10123 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
10124
11fdf7f2 10125 // Empty DirStat
7c673cae 10126 bufferlist dirbl;
11fdf7f2
TL
10127 static DirStat empty;
10128 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
7c673cae
FG
10129
10130 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
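 // reply layout assembled below: [DirStat][u32 num][u16 flags] followed by
 // per-entry [name][LeaseStat][InodeStat]; the adjustment above charges the
 // DirStat header and the num/flags trailer against the byte budget.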
10131
10132 __u32 num = 0;
10133 bufferlist dnbl;
11fdf7f2 10134 auto p = infomap.upper_bound(last_snapid);
7c673cae
FG
10135 for (; p != infomap.end() && num < max_entries; ++p) {
10136 dout(10) << p->first << " -> " << *p->second << dendl;
10137
10138 // entry name: plain for this dir's own snaps, long form for inherited ones
10139 string snap_name;
10140 if (p->second->ino == diri->ino())
11fdf7f2 10141 snap_name = p->second->name;
7c673cae 10142 else
11fdf7f2 10143 snap_name = p->second->get_long_name();
7c673cae
FG
10144
10145 unsigned start_len = dnbl.length();
10146 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
10147 break;
10148
11fdf7f2
TL
10149 encode(snap_name, dnbl);
10150 // infinite lease
9f95a23c 10151 LeaseStat e(CEPH_LEASE_VALID, -1, 0);
11fdf7f2
TL
10152 mds->locker->encode_lease(dnbl, mdr->session->info, e);
10153 dout(20) << "encode_infinite_lease" << dendl;
7c673cae
FG
10154
10155 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
10156 if (r < 0) {
10157 bufferlist keep;
10158 keep.substr_of(dnbl, 0, start_len);
10159 dnbl.swap(keep);
10160 break;
10161 }
10162 ++num;
10163 }
10164
11fdf7f2 10165 encode(num, dirbl);
7c673cae
FG
10166 __u16 flags = 0;
10167 if (p == infomap.end()) {
10168 flags = CEPH_READDIR_FRAG_END;
10169 if (last_snapid == 0)
10170 flags |= CEPH_READDIR_FRAG_COMPLETE;
10171 }
11fdf7f2 10172 encode(flags, dirbl);
7c673cae
FG
10173 dirbl.claim_append(dnbl);
10174
10175 mdr->reply_extra_bl = dirbl;
10176 mdr->tracei = diri;
10177 respond_to_request(mdr, 0);
10178}
10179
10180
10181// MKSNAP
10182
10183struct C_MDS_mksnap_finish : public ServerLogContext {
10184 CInode *diri;
10185 SnapInfo info;
10186 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
10187 ServerLogContext(s, r), diri(di), info(i) {}
10188 void finish(int r) override {
10189 server->_mksnap_finish(mdr, diri, info);
10190 }
10191};
10192
10193/* This function takes responsibility for the passed mdr*/
10194void Server::handle_client_mksnap(MDRequestRef& mdr)
10195{
9f95a23c 10196 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2
TL
10197 // make sure we have as new a map as the client
10198 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
10199 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
10200 return;
10201 }
7c673cae
FG
10202 if (!mds->mdsmap->allows_snaps()) {
10203 // snapshot creation is disabled until the mdsmap allows_snaps flag is set
f67539c2 10204 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
10205 return;
10206 }
10207
9f95a23c
TL
10208 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10209 if (!diri)
7c673cae 10210 return;
7c673cae
FG
10211
10212 // dir only
10213 if (!diri->is_dir()) {
f67539c2 10214 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
10215 return;
10216 }
10217 if (diri->is_system() && !diri->is_root()) {
10218 // no snaps in system dirs (root is ok)
f67539c2 10219 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
10220 return;
10221 }
10222
11fdf7f2 10223 std::string_view snapname = req->get_filepath().last_dentry();
7c673cae 10224
11fdf7f2 10225 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
7c673cae 10226 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
f67539c2 10227 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
10228 return;
10229 }
10230
10231 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
10232
10233 // lock snap
9f95a23c
TL
10234 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10235 MutationImpl::LockOpVec lov;
10236 lov.add_xlock(&diri->snaplock);
10237 if (!mds->locker->acquire_locks(mdr, lov))
10238 return;
7c673cae 10239
9f95a23c
TL
10240 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10241 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10242 return;
10243 }
10244 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10245 }
7c673cae 10246
9f95a23c 10247 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
7c673cae
FG
10248 return;
10249
adb31ebb
TL
10250 if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
10251 (subvol_ino && subvol_ino != diri->ino())) {
f67539c2 10252 respond_to_request(mdr, -CEPHFS_EPERM);
adb31ebb
TL
10253 return;
10254 }
10255
9f95a23c
TL
10256 // check if we can create any more snapshots
10257 // we don't allow any more if we are already at or beyond the limit
10258 if (diri->snaprealm &&
10259 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
f67539c2 10260 respond_to_request(mdr, -CEPHFS_EMLINK);
7c673cae 10261 return;
9f95a23c 10262 }
7c673cae
FG
10263
10264 // make sure name is unique
10265 if (diri->snaprealm &&
10266 diri->snaprealm->exists(snapname)) {
f67539c2 10267 respond_to_request(mdr, -CEPHFS_EEXIST);
7c673cae
FG
10268 return;
10269 }
10270 if (snapname.length() == 0 ||
10271 snapname[0] == '_') {
f67539c2 10272 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
10273 return;
10274 }
10275
10276 // allocate a snapid
10277 if (!mdr->more()->stid) {
10278 // prepare an stid
10279 mds->snapclient->prepare_create(diri->ino(), snapname,
10280 mdr->get_mds_stamp(),
10281 &mdr->more()->stid, &mdr->more()->snapidbl,
10282 new C_MDS_RetryRequest(mdcache, mdr));
10283 return;
10284 }
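 // second pass: the snap table has reserved an stid/snapid for us; the
 // table transaction only commits (in _mksnap_finish) after the journal
 // entry below is safely logged.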
10285
10286 version_t stid = mdr->more()->stid;
10287 snapid_t snapid;
11fdf7f2
TL
10288 auto p = mdr->more()->snapidbl.cbegin();
10289 decode(snapid, p);
7c673cae
FG
10290 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
10291
11fdf7f2
TL
10292 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10293
f67539c2
TL
10294 SnapPayload payload;
10295 if (req->get_data().length()) {
10296 try {
10297 auto iter = req->get_data().cbegin();
10298 decode(payload, iter);
10299 } catch (const ceph::buffer::error &e) {
10300 // backward compat -- client sends xattr bufferlist. however,
10301 // that is not used anywhere -- so (log and) ignore.
10302 dout(20) << ": no metadata in payload (old client?)" << dendl;
10303 }
10304 }
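 // SnapPayload carries optional key/value metadata for the new snapshot;
 // only its metadata map is consumed here (copied into SnapInfo below).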
10305
7c673cae
FG
10306 // journal
10307 SnapInfo info;
10308 info.ino = diri->ino();
10309 info.snapid = snapid;
11fdf7f2 10310 info.name = snapname;
7c673cae 10311 info.stamp = mdr->get_op_stamp();
f67539c2 10312 info.metadata = payload.metadata;
7c673cae 10313
f67539c2
TL
10314 auto pi = diri->project_inode(mdr, false, true);
10315 pi.inode->ctime = info.stamp;
10316 if (info.stamp > pi.inode->rstat.rctime)
10317 pi.inode->rstat.rctime = info.stamp;
10318 pi.inode->rstat.rsnaps++;
10319 pi.inode->version = diri->pre_dirty();
7c673cae
FG
10320
10321 // project the snaprealm
94b18763
FG
10322 auto &newsnap = *pi.snapnode;
10323 newsnap.created = snapid;
10324 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
10325 if (!em.second)
10326 em.first->second = info;
10327 newsnap.seq = snapid;
10328 newsnap.last_created = snapid;
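 // the projected snaprealm now records the new snap; bumping seq to the
 // allocated snapid is what lets clients notice the realm changed.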
7c673cae
FG
10329
10330 // journal the inode changes
10331 mdr->ls = mdlog->get_current_segment();
10332 EUpdate *le = new EUpdate(mdlog, "mksnap");
10333 mdlog->start_entry(le);
10334
10335 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10336 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10337 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10338 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10339
10340 // journal the snaprealm changes
10341 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
10342 mdr, __func__);
10343 mdlog->flush();
10344}
10345
10346void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
10347{
10348 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
10349
10350 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
10351
7c673cae
FG
10352 mdr->apply();
10353
10354 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
10355
10356 // create snap
10357 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10358
11fdf7f2
TL
10359 // notify other mds
10360 mdcache->send_snap_update(diri, mdr->more()->stid, op);
10361
7c673cae
FG
10362 mdcache->do_realm_invalidate_and_update_notify(diri, op);
10363
10364 // yay
10365 mdr->in[0] = diri;
10366 mdr->snapid = info.snapid;
10367 mdr->tracei = diri;
10368 respond_to_request(mdr, 0);
10369}
10370
10371
10372// RMSNAP
10373
10374struct C_MDS_rmsnap_finish : public ServerLogContext {
10375 CInode *diri;
10376 snapid_t snapid;
10377 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10378 ServerLogContext(s, r), diri(di), snapid(sn) {}
10379 void finish(int r) override {
10380 server->_rmsnap_finish(mdr, diri, snapid);
10381 }
10382};
10383
10384/* This function takes responsibility for the passed mdr*/
10385void Server::handle_client_rmsnap(MDRequestRef& mdr)
10386{
9f95a23c 10387 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 10388
9f95a23c
TL
10389 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10390 if (!diri)
7c673cae 10391 return;
9f95a23c 10392
7c673cae 10393 if (!diri->is_dir()) {
f67539c2 10394 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
10395 return;
10396 }
10397
11fdf7f2 10398 std::string_view snapname = req->get_filepath().last_dentry();
7c673cae 10399
11fdf7f2 10400 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
7c673cae 10401 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
f67539c2 10402 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
10403 return;
10404 }
10405
10406 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
10407
10408 // does snap exist?
10409 if (snapname.length() == 0 || snapname[0] == '_') {
f67539c2 10410 respond_to_request(mdr, -CEPHFS_EINVAL); // can't prune a parent snap, currently.
7c673cae
FG
10411 return;
10412 }
10413 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
f67539c2 10414 respond_to_request(mdr, -CEPHFS_ENOENT);
7c673cae
FG
10415 return;
10416 }
10417 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
10418 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
10419
9f95a23c
TL
10420 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10421 MutationImpl::LockOpVec lov;
10422 lov.add_xlock(&diri->snaplock);
10423 if (!mds->locker->acquire_locks(mdr, lov))
10424 return;
10425 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10426 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10427 return;
10428 }
10429 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10430 }
7c673cae 10431
11fdf7f2 10432 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
7c673cae
FG
10433 return;
10434
10435 // prepare
10436 if (!mdr->more()->stid) {
10437 mds->snapclient->prepare_destroy(diri->ino(), snapid,
10438 &mdr->more()->stid, &mdr->more()->snapidbl,
10439 new C_MDS_RetryRequest(mdcache, mdr));
10440 return;
10441 }
10442 version_t stid = mdr->more()->stid;
11fdf7f2 10443 auto p = mdr->more()->snapidbl.cbegin();
7c673cae 10444 snapid_t seq;
11fdf7f2 10445 decode(seq, p);
7c673cae
FG
10446 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
10447
11fdf7f2
TL
10448 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10449
7c673cae 10450 // journal
f67539c2
TL
10451 auto pi = diri->project_inode(mdr, false, true);
10452 pi.inode->version = diri->pre_dirty();
10453 pi.inode->ctime = mdr->get_op_stamp();
10454 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
10455 pi.inode->rstat.rctime = mdr->get_op_stamp();
10456 pi.inode->rstat.rsnaps--;
7c673cae
FG
10457
10458 mdr->ls = mdlog->get_current_segment();
10459 EUpdate *le = new EUpdate(mdlog, "rmsnap");
10460 mdlog->start_entry(le);
10461
10462 // project the snaprealm
94b18763
FG
10463 auto &newnode = *pi.snapnode;
10464 newnode.snaps.erase(snapid);
10465 newnode.seq = seq;
10466 newnode.last_destroyed = seq;
7c673cae
FG
10467
10468 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10469 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10470 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10471 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10472
10473 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
10474 mdr, __func__);
10475 mdlog->flush();
10476}
10477
void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
  snapid_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);

  mdr->apply();

  mds->snapclient->commit(stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);

  // yay
  mdr->in[0] = diri;
  respond_to_request(mdr, 0);

  // purge snapshot data
  diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
}

struct C_MDS_renamesnap_finish : public ServerLogContext {
  CInode *diri;
  snapid_t snapid;
  C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_renamesnap_finish(mdr, diri, snapid);
  }
};

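// From the client's point of view a snapshot is renamed by renaming it
// inside the virtual .snap directory (e.g. "mv dir/.snap/old dir/.snap/new"),
// which arrives as CEPH_MDS_OP_RENAMESNAP with the new name in filepath and
// the old name in filepath2; both paths must refer to the same inode.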
/* This function takes responsibility for the passed mdr */
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) { // dir only
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  std::string_view dstname = req->get_filepath().last_dentry();
  std::string_view srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

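  // Names beginning with '_' denote snapshots inherited from an ancestor
  // realm ("parent" snaps) and can't be renamed here; the source name must
  // exist in this realm and the destination must be a fresh, valid name.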
  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);   // can't rename a parent snap.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return;
  }
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -CEPHFS_EEXIST);
    return;
  }

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
        return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

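  // Same two-phase pattern as rmsnap: prepare_update() reserves the rename
  // in the snap table and re-drives the request via C_MDS_RetryRequest;
  // once stid is set we fall through, journal the change, and commit the
  // table transaction in _renamesnap_finish().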
  // prepare
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
                                    &mdr->more()->stid,
                                    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  dout(10) << " stid is " << stid << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->version = diri->pre_dirty();

  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  auto it = newsnap.snaps.find(snapid);
  ceph_assert(it != newsnap.snaps.end());
  it->second.name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}

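/* Journal-commit completion for renamesnap: apply the projected
 * inode/snaprealm, commit the snap table transaction, notify peer MDSs and
 * client sessions, and reply with the renamed snapshot in the trace. */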
void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);

  // yay
  mdr->in[0] = diri;
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
}

/**
 * Return true if server is in state RECONNECT and this
 * client has not yet reconnected.
 */
bool Server::waiting_for_reconnect(client_t c) const
{
  return client_reconnect_gather.count(c) > 0;
}

void Server::dump_reconnect_status(Formatter *f) const
{
  f->open_object_section("reconnect_status");
  f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
  f->close_section();
}
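// Illustrative only (assumes a JSON formatter; actual contents vary):
// dump_reconnect_status() serializes roughly as
//   {"reconnect_status": {"client_reconnect_gather": "4135,4141"}}
// where the value lists the client ids we are still waiting on.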