// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/ceph_assert.h"  // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/range/adaptor/reversed.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"
#include "MetricsHandler.h"
#include "cephfs_features.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "events/EUpdate.h"
#include "events/EPeerUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"
#include "events/EPurged.h"

#include "include/stringify.h"
#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"
#include "fscrypt.h"

#include <errno.h>

#include <list>
#include <regex>
#include <string_view>
#include <functional>

#include "common/config.h"

#include "msg/Message.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "

using namespace std;
class ServerContext : public MDSContext {
 protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

 public:
  explicit ServerContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
};

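// Batch_Getattr_Lookup coalesces concurrent lookup/getattr requests that
// target the same dentry (CEPH_MDS_OP_LOOKUP) or inode (getattr). The first
// request becomes the batch "front"; later duplicates are parked via
// add_request() and, when the front completes, _respond() replies to all of
// them with the front's trace and result (or _forward() forwards them all).
// If the front is killed, find_new_head() promotes the next live request.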
class Batch_Getattr_Lookup : public BatchOp {
protected:
  Server* server;
  ceph::ref_t<MDRequestImpl> mdr;
  std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
  int res = 0;
public:
  Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
    : server(s), mdr(r) {
    if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP)
      mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
    else
      mdr->batch_op_map = &mdr->in[0]->batch_ops;
  }
  void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
    batch_reqs.push_back(r);
  }
  ceph::ref_t<MDRequestImpl> find_new_head() override {
    while (!batch_reqs.empty()) {
      auto r = std::move(batch_reqs.back());
      batch_reqs.pop_back();
      if (r->killed)
        continue;

      r->batch_op_map = mdr->batch_op_map;
      mdr->batch_op_map = nullptr;
      mdr = r;
      return mdr;
    }
    return nullptr;
  }
  void _forward(mds_rank_t t) override {
    MDCache* mdcache = server->mdcache;
    mdcache->mds->forward_message_mds(mdr, t);
    mdr->set_mds_stamp(ceph_clock_now());
    for (auto& m : batch_reqs) {
      if (!m->killed)
        mdcache->request_forward(m, t);
    }
    batch_reqs.clear();
  }
  void _respond(int r) override {
    mdr->set_mds_stamp(ceph_clock_now());
    for (auto& m : batch_reqs) {
      if (!m->killed) {
        m->tracei = mdr->tracei;
        m->tracedn = mdr->tracedn;
        server->respond_to_request(m, r);
      }
    }
    batch_reqs.clear();
    server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
  }
  void print(std::ostream& o) const override {
    o << "[batch front=" << *mdr << "]";
  }
};

class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    ceph_assert(server != NULL);
  }
};

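// The counters registered below are exposed through the MDS admin socket
// (e.g. `ceph daemon mds.<id> perf dump`); the PRIO_* levels determine which
// counters monitoring tools surface by default.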
void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);

  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_peer_request, "handle_peer_request",
                      "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_client_session,
                      "handle_client_session", "Client session messages", "hcs",
                      PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
                      "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
                      "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
                      PerfCountersBuilder::PRIO_INTERESTING);

  // fop latencies are useful
  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
                   "Request type lookup hash of inode latency");
  plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
                   "Request type lookup inode latency");
  plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
                   "Request type lookup parent latency");
  plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
                   "Request type lookup name latency");
  plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
                   "Request type lookup latency");
  plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
                   "Request type lookup snapshot latency");
  plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
                   "Request type get attribute latency");
  plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
                   "Request type set attribute latency");
  plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
                   "Request type set file layout latency");
  plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
                   "Request type set directory layout latency");
  plb.add_time_avg(l_mdss_req_getvxattr_latency, "req_getvxattr_latency",
                   "Request type get virtual extended attribute latency");
  plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
                   "Request type set extended attribute latency");
  plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
                   "Request type remove extended attribute latency");
  plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
                   "Request type read directory latency");
  plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
                   "Request type set file lock latency");
  plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
                   "Request type get file lock latency");
  plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
                   "Request type create latency");
  plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
                   "Request type open latency");
  plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
                   "Request type make node latency");
  plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
                   "Request type link latency");
  plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
                   "Request type unlink latency");
  plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
                   "Request type remove directory latency");
  plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
                   "Request type rename latency");
  plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
                   "Request type make directory latency");
  plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
                   "Request type symbolic link latency");
  plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
                   "Request type list snapshot latency");
  plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
                   "Request type make snapshot latency");
  plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
                   "Request type remove snapshot latency");
  plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
                   "Request type rename snapshot latency");
  plb.add_time_avg(l_mdss_req_snapdiff_latency, "req_snapdiff_latency",
                   "Request type snapshot difference latency");

  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
                      "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_peer_request, "dispatch_server_request",
                      "Server requests dispatched");

  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}

Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  inject_rename_corrupt_dentry_first(g_conf().get_val<double>("mds_inject_rename_corrupt_dentry_first")),
  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
  metrics_handler(metrics_handler)
{
  forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
  replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
  cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
  max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
  delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
  max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
  max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
  caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
  dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
  bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
  supported_metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
}

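// Top-level entry point for messages routed to the Server: client session
// control, client requests, client reclaim, and peer (inter-MDS) requests.
// Reconnects are handled unconditionally; other client requests may be
// queued for replay or deferred until this rank is sufficiently active.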
void Server::dispatch(const cref_t<Message> &m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(ref_cast<MClientReconnect>(m));
    return;
  }

  /*
   * In the reconnect phase, clients may have sent unsafe requests to the MDS
   * before the reconnect message. Setting sessionclosed_isok handles
   * scenarios like this:
   *
   * 1. In the reconnect phase, a client sent unsafe requests to the MDS.
   * 2. The reconnect timeout was reached. All sessions that did not send the
   *    reconnect message in time, some of which may have sent unsafe
   *    requests, are marked as closed.
   *    (Another situation is #31668, which denies all client reconnect
   *    messages to speed up reboot.)
   * 3. These unsafe requests, from sessions that did not send the reconnect
   *    message in time or that were denied, can then be handled in the
   *    clientreplay phase.
   */
  bool sessionclosed_isok = replay_unsafe_with_closed_session;
  // active?
  // handle_peer_request()/handle_client_session() will wait if necessary
  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
    const auto &req = ref_cast<MClientRequest>(m);
    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
      Session *session = mds->get_session(req);
      if (!session || (!session->is_open() && !sessionclosed_isok)) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        return;
      }
      bool queue_replay = false;
      if (req->is_replay() || req->is_async()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
        if (req->head.ino &&
            !session->have_completed_request(req->get_reqid().tid, nullptr)) {
          inodeno_t ino(req->head.ino);
          mdcache->add_replay_ino_alloc(ino);
          if (replay_unsafe_with_closed_session &&
              session->free_prealloc_inos.contains(ino)) {
            // don't purge inodes that will be created by later replay
            session->free_prealloc_inos.erase(ino);
            session->delegated_inos.insert(ino);
          }
        }
      } else if (req->get_retry_attempt()) {
        // Process completed requests in the clientreplay stage. A completed
        // request might have created a new file/directory. This guarantees
        // the MDS sends a reply to the client before another request
        // modifies the new file/directory.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (mds->is_stopping()) {
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      if (req->is_queued_for_replay()) {
        wait_for_active = false;
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(ref_cast<MClientRequest>(m));
    return;
  case CEPH_MSG_CLIENT_RECLAIM:
    handle_client_reclaim(ref_cast<MClientReclaim>(m));
    return;
  case MSG_MDS_PEER_REQUEST:
    handle_peer_request(ref_cast<MMDSPeerRequest>(m));
    return;
  default:
    derr << "Server unknown message " << m->get_type() << " from peer type " << m->get_connection()->get_peer_type() << dendl;
    ceph_abort_msg("server unknown message " + to_string(m->get_type()) + " from peer type " + to_string(m->get_connection()->get_peer_type()));
  }
}


// ----------------------------------------------------------
// SESSION management

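// Journal-completion context for session state changes: once the ESession
// event is safely logged, _session_logged() applies the open/close, releases
// or purges any preallocated inos, and finally runs the optional
// caller-supplied Context.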
class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos_to_free;
  version_t inotablev;
  interval_set<inodeno_t> inos_to_purge;
  LogSegment *ls = nullptr;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = nullptr) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
                       const interval_set<inodeno_t>& to_free, version_t iv,
                       const interval_set<inodeno_t>& to_purge, LogSegment *_ls, Context *fin_ = nullptr) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv),
    inos_to_free(to_free), inotablev(iv), inos_to_purge(to_purge), ls(_ls), fin(fin_) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos_to_free, inotablev, inos_to_purge, ls);
    if (fin) {
      fin->complete(r);
    }
  }
};

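// Look up a session by the client-supplied "uuid" metadata key. Two sessions
// may carry the same uuid while one is reclaiming the other; in that case the
// reclaiming side wins, and the asserts check the reclaim link is consistent.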
Session* Server::find_session_by_uuid(std::string_view uuid)
{
  Session* session = nullptr;
  for (auto& it : mds->sessionmap.get_sessions()) {
    auto& metadata = it.second->info.client_metadata;

    auto p = metadata.find("uuid");
    if (p == metadata.end() || p->second != uuid)
      continue;

    if (!session) {
      session = it.second;
    } else if (!session->reclaiming_from) {
      ceph_assert(it.second->reclaiming_from == session);
      session = it.second;
    } else {
      ceph_assert(session->reclaiming_from == it.second);
    }
  }
  return session;
}

void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
{
  if (!session->is_open() && !session->is_stale()) {
    dout(10) << "session not open, dropping this req" << dendl;
    return;
  }

  auto reply = make_message<MClientReclaimReply>(0);
  if (m->get_uuid().empty()) {
    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
    reply->set_result(-CEPHFS_EINVAL);
    mds->send_message_client(reply, session);
    return;
  }

  unsigned flags = m->get_flags();
  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
    dout(10) << __func__ << " unsupported flags" << dendl;
    reply->set_result(-CEPHFS_EINVAL);
    mds->send_message_client(reply, session);
    return;
  }

  Session* target = find_session_by_uuid(m->get_uuid());
  if (target) {
    if (session->info.auth_name != target->info.auth_name) {
      dout(10) << __func__ << " session auth_name " << session->info.auth_name
               << " != target auth_name " << target->info.auth_name << dendl;
      reply->set_result(-CEPHFS_EPERM);
      mds->send_message_client(reply, session);
    }

    ceph_assert(!target->reclaiming_from);
    ceph_assert(!session->reclaiming_from);
    session->reclaiming_from = target;
    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
  }

  if (flags & CEPH_RECLAIM_RESET) {
    finish_reclaim_session(session, reply);
  } else ceph_assert(0); /* no other flags are handled at this time */
}

void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
{
  Session *target = session->reclaiming_from;
  if (target) {
    session->reclaiming_from = nullptr;

    Context *send_reply;
    if (reply) {
      int64_t session_id = session->get_client().v;
      send_reply = new LambdaContext([this, session_id, reply](int r) {
        ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
        Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
        if (!session) {
          return;
        }
        auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
        reply->set_epoch(epoch);
        mds->send_message_client(reply, session);
      });
    } else {
      send_reply = nullptr;
    }

    bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
        return map.is_blocklisted(target->info.inst.addr);
      });

    if (blocklisted || !g_conf()->mds_session_blocklist_on_evict) {
      kill_session(target, send_reply);
    } else {
      CachedStackStringStream css;
      mds->evict_client(target->get_client().v, false, true, *css, send_reply);
    }
  } else if (reply) {
    mds->send_message_client(reply, session);
  }
}

void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
{
  Session *session = mds->get_session(m);
  uint32_t flags = m->get_flags();
  dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  ceph_assert(m->is_a_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    return;
  }

  std::string_view fs_name = mds->mdsmap->get_fs_name();
  if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
    dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
    return;
  }

  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (flags & MClientReclaim::FLAG_FINISH) {
    if (flags ^ MClientReclaim::FLAG_FINISH) {
      dout(0) << __func__ << " client specified FLAG_FINISH with other flags."
                 " Other flags:" << flags << dendl;
      auto reply = make_message<MClientReclaimReply>(0);
      reply->set_result(-CEPHFS_EINVAL);
      mds->send_message_client(reply, session);
      return;
    }
    finish_reclaim_session(session);
  } else {
    reclaim_session(session, m);
  }
}

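// Session control-plane handler. A CEPH_SESSION_REQUEST_OPEN must pass, in
// order: the refuse_client_session flag, session-state checks, the OSDMap
// blocklist, required feature bits, validation of the claimed "root" against
// the session's caps, and uuid uniqueness; only then is the open journaled
// via ESession and acknowledged with CEPH_SESSION_OPEN.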
void Server::handle_client_session(const cref_t<MClientSession> &m)
{
  version_t pv;
  Session *session = mds->get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  ceph_assert(m->is_a_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
    reply->metadata["error_string"] = "sessionless";
    mds->send_message(reply, m->get_connection());
    return;
  }

  std::string_view fs_name = mds->mdsmap->get_fs_name();
  if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
    dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
    reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
                                      std::string(fs_name) + "\"";
    mds->send_message(std::move(reply), m->get_connection());
    return;
  }

  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
    // close requests need to be handled when mds is active
    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  } else {
    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    if (mds->mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
      dout(0) << "new sessions are not permitted, enable again via "
                 "`ceph fs set <fs_name> refuse_client_session false`" << dendl;
      auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
      reply->metadata["error_string"] = "new sessions are not permitted,"
                                        " enable again via `ceph fs set"
                                        " <fs_name> refuse_client_session false`";
      mds->send_message(reply, m->get_connection());
      return;
    }
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing() ||
        terminating_sessions) {
      if (m->supported_features.test(CEPHFS_FEATURE_NOTIFY_SESSION_STATE)) {
        if (session->is_open() && !mds->is_stopping()) {
          dout(10) << "currently already opened" << dendl;

          auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN,
                                                    session->get_push_seq());
          if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
            reply->supported_features = supported_features;
          mds->send_message_client(reply, session);
          if (mdcache->is_readonly()) {
            auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
            mds->send_message_client(m, session);
          }
        }
      }
      dout(10) << "currently " << session->get_state_name()
               << ", dropping this req" << dendl;
      return;
    }
    ceph_assert(session->is_closed() || session->is_closing());

    if (mds->is_stopping()) {
      dout(10) << "mds is stopping, dropping open req" << dendl;
      return;
    }

    {
      auto& addr = session->info.inst.addr;
      session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
      auto& client_metadata = session->info.client_metadata;

      auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
        auto now = ceph_clock_now();
        auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
        auto elapsed = now - m->get_recv_stamp();
        CachedStackStringStream css;
        *css << "New client session:"
             << " addr=\"" << session->info.inst.addr << "\""
             << ",elapsed=" << elapsed
             << ",throttled=" << throttle_elapsed
             << ",status=\"" << status << "\"";
        if (!err.empty()) {
          *css << ",error=\"" << err << "\"";
        }
        const auto& metadata = session->info.client_metadata;
        if (auto it = metadata.find("root"); it != metadata.end()) {
          *css << ",root=\"" << it->second << "\"";
        }
        dout(2) << css->strv() << dendl;
      };

      auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str, unsigned flags=0) {
        auto m = make_message<MClientSession>(CEPH_SESSION_REJECT, 0, flags);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          m->metadata["error_string"] = err_str;
        mds->send_message_client(m, session);
        log_session_status("REJECTED", err_str);
      };

      bool blocklisted = mds->objecter->with_osdmap(
          [&addr](const OSDMap &osd_map) -> bool {
            return osd_map.is_blocklisted(addr);
          });

      if (blocklisted) {
        dout(10) << "rejecting blocklisted client " << addr << dendl;
        // This goes on the wire and the "blacklisted" substring is
        // depended upon by the kernel client for detecting whether it
        // has been blocklisted. If mounted with recover_session=clean
        // (since 5.4), it tries to automatically recover itself from
        // blocklisting.
        unsigned flags = 0;
        flags |= MClientSession::SESSION_BLOCKLISTED;
        send_reject_message("blocklisted (blacklisted)", flags);
        session->clear();
        break;
      }

      if (client_metadata.features.empty())
        infer_supported_features(session, client_metadata);

      dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
      dout(20) << " features: '" << client_metadata.features << "'" << dendl;
      dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
      for (const auto& p : client_metadata) {
        dout(20) << " " << p.first << ": " << p.second << dendl;
      }

      feature_bitset_t missing_features = required_client_features;
      missing_features -= client_metadata.features;
      if (!missing_features.empty()) {
        CachedStackStringStream css;
        *css << "missing required features '" << missing_features << "'";
        send_reject_message(css->strv());
        mds->clog->warn() << "client session (" << session->info.inst
                          << ") lacks required features " << missing_features
                          << "; client supports " << client_metadata.features;
        session->clear();
        break;
      }

      // Special case for the 'root' metadata path; validate that the claimed
      // root is actually within the caps of the session
      if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
        auto claimed_root = it->second;
        CachedStackStringStream css;
        bool denied = false;
        // claimed_root has a leading "/" which we strip before passing
        // into caps check
        if (claimed_root.empty() || claimed_root[0] != '/') {
          denied = true;
          *css << "invalid root '" << claimed_root << "'";
        } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
          denied = true;
          *css << "non-allowable root '" << claimed_root << "'";
        }

        if (denied) {
          // Tell the client we're rejecting their open
          send_reject_message(css->strv());
          mds->clog->warn() << "client session with " << css->strv()
                            << " denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
        if (find_session_by_uuid(it->second)) {
          send_reject_message("duplicated session uuid");
          mds->clog->warn() << "client session with duplicated session uuid '"
                            << it->second << "' denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (session->is_closed()) {
        mds->sessionmap.add_session(session);
      }

      pv = mds->sessionmap.mark_projected(session);
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      mds->sessionmap.touch_session(session);
      auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
        ceph_assert(r == 0);
        log_session_status("ACCEPTED", "");
      });
      mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
                                new C_MDS_session_finish(this, session, sseq, true, pv, fin));
      mdlog->flush();
    }
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() || session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
      mds->send_message_client(reply, session);
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        return;
      }
      ceph_assert(session->is_open() ||
                  session->is_stale() ||
                  session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle the same as any other seq error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    if (mds->is_active())
      mdlog->flush();
    break;

  default:
    auto m = make_message<MClientSession>(CEPH_SESSION_REJECT);
    mds->send_message_client(m, session);
    derr << "Server received unknown message " << m->get_type() << ", closing session and blocklisting the client " << session->get_client() << dendl;
    CachedStackStringStream css;
    mds->evict_client(session->get_client().v, false, true, *css, nullptr);
  }
}

void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
  if (!session->is_open() ||
      !session->get_connection() ||
      !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
    return;
  }

  version_t seq = session->wait_for_flush(gather.new_sub());
  mds->send_message_client(
    make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
}

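// Callers drive these flushes through an MDSGatherBuilder. A minimal sketch
// (the finisher below is illustrative only; callers supply whatever
// continuation they need):
//
//   MDSGatherBuilder gather(g_ceph_context);
//   flush_client_sessions(clients, gather);
//   if (gather.has_subs()) {
//     gather.set_finisher(new C_MDS_RetryMessage(mds, m));  // example only
//     gather.activate();
//   }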
void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
  for (const auto& client : client_set) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
    ceph_assert(session);
    flush_session(session, gather);
  }
}

void Server::finish_flush_session(Session *session, version_t seq)
{
  MDSContext::vec finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}

void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             const interval_set<inodeno_t>& inos_to_free, version_t piv,
                             const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
{
  dout(10) << "_session_logged " << session->info.inst
           << " state_seq " << state_seq
           << " " << (open ? "open":"close") << " " << pv
           << " inos_to_free " << inos_to_free << " inotablev " << piv
           << " inos_to_purge " << inos_to_purge << dendl;

  if (!open) {
    if (inos_to_purge.size()) {
      ceph_assert(ls);
      session->info.prealloc_inos.subtract(inos_to_purge);
      ls->purging_inodes.insert(inos_to_purge);
      if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())
        mdcache->purge_inodes(inos_to_purge, ls);
    }

    if (inos_to_free.size()) {
      ceph_assert(piv);
      ceph_assert(session->is_closing() || session->is_killing() ||
                  session->is_opening()); // re-open closing session
      session->info.prealloc_inos.subtract(inos_to_free);
      mds->inotable->apply_release_ids(inos_to_free);
      ceph_assert(mds->inotable->get_version() == piv);
    }
    session->free_prealloc_inos = session->info.prealloc_inos;
    session->delegated_inos.clear();
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    ceph_assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    metrics_handler->add_session(session);
    ceph_assert(session->get_connection());
    auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
      reply->supported_features = supported_features;
      reply->metric_spec = supported_metric_spec;
    }
    mds->send_message_client(reply, session);
    if (mdcache->is_readonly()) {
      auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
      mds->send_message_client(m, session);
    }
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    bool killing = session->is_killing();
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, cap, killing);
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }
    if (client_reclaim_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reclaim set" << dendl;
      if (client_reclaim_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
        mds->maybe_clientreplay_done();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->get_connection()) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->get_connection()->mark_disposable();
      }

      // reset session
      mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      metrics_handler->remove_session(session);
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->get_connection()) {
        session->get_connection()->mark_down();
        mds->sessionmap.set_state(session, Session::STATE_CLOSED);
        session->set_connection(nullptr);
      }
      metrics_handler->remove_session(session);
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t,client_metadata_t>& cmm,
                                              map<client_t, pair<Session*,uint64_t> >& smap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;

  mds->objecter->with_osdmap(
      [this, &cm, &cmm](const OSDMap &osd_map) {
        for (auto p = cm.begin(); p != cm.end(); ) {
          if (osd_map.is_blocklisted(p->second.addr)) {
            dout(10) << " ignoring blocklisted client." << p->first
                     << " (" << p->second.addr << ")" << dendl;
            cmm.erase(p->first);
            cm.erase(p++);
          } else {
            ++p;
          }
        }
      });

  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    uint64_t sseq;
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing()) {
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      auto q = cmm.find(p->first);
      if (q != cmm.end())
        session->info.client_metadata.merge(q->second);
    } else {
      ceph_assert(session->is_open() ||
                  session->is_opening() ||
                  session->is_stale());
      sseq = 0;
    }
    smap[p->first] = make_pair(session, sseq);
    session->inc_importing();
  }
  return pv;
}

void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  for (auto &it : smap) {
    Session *session = it.second.first;
    uint64_t sseq = it.second.second;
    if (sseq > 0) {
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);
        metrics_handler->add_session(session);

        auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
          reply->supported_features = supported_features;
          reply->metric_spec = supported_metric_spec;
        }
        mds->send_message_client(reply, session);

        if (mdcache->is_readonly())
          mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      ceph_assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
 public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(5) << "terminating all sessions..." << dendl;

  terminating_sessions = true;

  // kill them off.  clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}

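// Periodic staleness scan. Open sessions that have not renewed caps within
// session_timeout are marked STALE (or evicted outright once
// session_autoclose has passed, or when the client supplied its own
// "timeout" metadata); STALE sessions older than session_autoclose are then
// evicted, with blocklisting governed by mds_session_blocklist_on_timeout.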
void Server::find_idle_sessions()
{
  auto now = clock::now();
  auto last_cleared_laggy = mds->last_cleared_laggy();

  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;

  // timeout/stale
  //  (caps go stale, lease die)
  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();

  // don't kick clients if we've been laggy
  if (last_cleared_laggy < cutoff) {
    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
             << "), not marking any client stale" << dendl;
    return;
  }

  std::vector<Session*> to_evict;

  bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
    std::vector<Session*> new_stale;

    for (auto session : *(sessions_p1->second)) {
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "laggiest active session is " << session->info.inst
                 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
        break;
      }

      if (session->last_seen > session->last_cap_renew) {
        last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
        if (last_cap_renew_span < cutoff) {
          dout(20) << "laggiest active session is " << session->info.inst
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }
      }

      if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
        dout(20) << "evicting session " << session->info.inst << " since autoclose "
                    "has arrived" << dendl;
        // evict session without marking it stale
        to_evict.push_back(session);
        continue;
      }

      if (defer_session_stale &&
          !session->is_any_flush_waiter() &&
          !mds->locker->is_revoking_any_caps_from(session->get_client())) {
        dout(20) << "deferring marking session " << session->info.inst << " stale "
                    "since it holds no caps" << dendl;
        continue;
      }

      auto it = session->info.client_metadata.find("timeout");
      if (it != session->info.client_metadata.end()) {
        unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
        if (timeout == 0) {
          dout(10) << "skipping session " << session->info.inst
                   << ", infinite timeout specified" << dendl;
          continue;
        }
        double cutoff = queue_max_age + timeout;
        if (last_cap_renew_span < cutoff) {
          dout(10) << "skipping session " << session->info.inst
                   << ", timeout (" << timeout << ") specified"
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }

        // do not go through stale, evict it directly.
        to_evict.push_back(session);
      } else {
        dout(10) << "new stale session " << session->info.inst
                 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
        new_stale.push_back(session);
      }
    }

    for (auto session : new_stale) {
      mds->sessionmap.set_state(session, Session::STATE_STALE);
      if (mds->locker->revoke_stale_caps(session)) {
        mds->locker->remove_stale_leases(session);
        finish_flush_session(session, session->get_push_seq());
        auto m = make_message<MClientSession>(CEPH_SESSION_STALE);
        mds->send_message_client(m, session);
      } else {
        to_evict.push_back(session);
      }
    }
  }

  // autoclose
  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();

  // Collect a list of sessions exceeding the autoclose threshold
  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
    for (auto session : *(sessions_p2->second)) {
      ceph_assert(session->is_stale());
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "oldest stale session is " << session->info.inst
                 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
        break;
      }
      to_evict.push_back(session);
    }
  }

  for (auto session : to_evict) {
    if (session->is_importing()) {
      dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
      continue;
    }

    auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << last_cap_renew_span << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst
             << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;

    if (g_conf()->mds_session_blocklist_on_timeout) {
      CachedStackStringStream css;
      mds->evict_client(session->get_client().v, false, true, *css, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}

void Server::evict_cap_revoke_non_responders() {
  if (!cap_revoke_eviction_timeout) {
    return;
  }

  auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);

  for (auto const &client : to_evict) {
    mds->clog->warn() << "client id " << client << " has not responded to"
                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
                      << " seconds, evicting";
    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
            << client << dendl;

    CachedStackStringStream css;
    bool evicted = mds->evict_client(client.v, false,
                                     g_conf()->mds_session_blocklist_on_evict,
                                     *css, nullptr);
    if (evicted && logger) {
      logger->inc(l_mdss_cap_revoke_eviction);
    }
  }
}

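// Applies runtime configuration changes to the cached values below. For
// example, the cap-revoke eviction timeout can be tuned on a live system
// (value here is illustrative):
//
//   ceph config set mds mds_cap_revoke_eviction_timeout 300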
void Server::handle_conf_change(const std::set<std::string>& changed) {
  if (changed.count("mds_forward_all_requests_to_auth")) {
    forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
  }
  if (changed.count("mds_cap_revoke_eviction_timeout")) {
    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
    dout(20) << __func__ << " cap revoke eviction timeout changed to "
             << cap_revoke_eviction_timeout << dendl;
  }
  if (changed.count("mds_recall_max_decay_rate")) {
    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
  }
  if (changed.count("mds_max_snaps_per_dir")) {
    max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
    dout(20) << __func__ << " max snapshots per directory changed to "
             << max_snaps_per_dir << dendl;
  }
  if (changed.count("mds_client_delegate_inos_pct")) {
    delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
  }
  if (changed.count("mds_max_caps_per_client")) {
    max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  }
  if (changed.count("mds_session_cap_acquisition_throttle")) {
    cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
  }
  if (changed.count("mds_session_max_caps_throttle_ratio")) {
    max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
  }
  if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
    caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
  }
  if (changed.count("mds_alternate_name_max")) {
    alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
  }
  if (changed.count("mds_fscrypt_last_block_max_size")) {
    fscrypt_last_block_max_size = g_conf().get_val<Option::size_t>("mds_fscrypt_last_block_max_size");
  }
  if (changed.count("mds_dir_max_entries")) {
    dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
    dout(20) << __func__ << " max entries per directory changed to "
             << dir_max_entries << dendl;
  }
  if (changed.count("mds_bal_fragment_size_max")) {
    bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
    dout(20) << __func__ << " max fragment size changed to "
             << bal_fragment_size_max << dendl;
  }
  if (changed.count("mds_inject_rename_corrupt_dentry_first")) {
    inject_rename_corrupt_dentry_first = g_conf().get_val<double>("mds_inject_rename_corrupt_dentry_first");
  }
}

/*
 * XXX bump in the interface here, not using an MDSContext here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe)
{
  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    if (session->is_closing() ||
        session->is_killing()) {
      if (on_safe)
        mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
    } else {
      ceph_assert(session->is_closed() ||
                  session->is_importing());
      if (on_safe)
        on_safe->complete(0);
    }
  }
}

size_t Server::apply_blocklist()
{
  std::vector<Session*> victims;
  const auto& sessions = mds->sessionmap.get_sessions();
  mds->objecter->with_osdmap(
    [&](const OSDMap& o) {
      for (const auto& p : sessions) {
        if (!p.first.is_client()) {
          // Do not apply OSDMap blocklist to MDS daemons, we find out
          // about their death via MDSMap.
          continue;
        }
        if (o.is_blocklisted(p.second->info.inst.addr)) {
          victims.push_back(p.second);
        }
      }
    });

  for (const auto& s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blocklist: killed " << victims.size() << dendl;

  return victims.size();
}

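// Journal a session close (or kill): project the release of the session's
// pending/free preallocated inos, submit an ESession event whose
// C_MDS_session_finish applies the change once safe, then kill any requests
// the session still has in flight.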
f67539c2 1385void Server::journal_close_session(Session *session, int state, Context *on_safe)
7c673cae 1386{
9f95a23c 1387 dout(10) << __func__ << " : "
9f95a23c 1388 << session->info.inst
f67539c2
TL
1389 << " pending_prealloc_inos " << session->pending_prealloc_inos
1390 << " free_prealloc_inos " << session->free_prealloc_inos
1391 << " delegated_inos " << session->delegated_inos << dendl;
9f95a23c 1392
7c673cae
FG
1393 uint64_t sseq = mds->sessionmap.set_state(session, state);
1394 version_t pv = mds->sessionmap.mark_projected(session);
1395 version_t piv = 0;
1396
1397 // release alloc and pending-alloc inos for this session
1398 // and wipe out session state, in case the session close aborts for some reason
f67539c2
TL
1399 interval_set<inodeno_t> inos_to_free;
1400 inos_to_free.insert(session->pending_prealloc_inos);
1401 inos_to_free.insert(session->free_prealloc_inos);
1402 if (inos_to_free.size()) {
1403 mds->inotable->project_release_ids(inos_to_free);
7c673cae
FG
1404 piv = mds->inotable->get_projected_version();
1405 } else
1406 piv = 0;
9f95a23c 1407
f67539c2
TL
1408 auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
1409 auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
1410 session->delegated_inos, mdlog->get_current_segment(), on_safe);
1411 mdlog->start_submit_entry(le, fin);
7c673cae
FG
1412 mdlog->flush();
1413
1414 // clean up requests, too
f67539c2
TL
1415 while (!session->requests.empty()) {
1416 auto mdr = MDRequestRef(*session->requests.begin());
7c673cae
FG
1417 mdcache->request_kill(mdr);
1418 }
1419
1420 finish_flush_session(session, session->get_push_seq());
1421}
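// Editorial sketch: the interval_set populated above coalesces the two
// preallocated-ino ranges into extents, e.g. with assumed values:
//
//   interval_set<inodeno_t> s;
//   s.insert(inodeno_t(100), 10);  // inos 100..109
//   s.insert(inodeno_t(110), 5);   // merges into one extent 100..114
//   // s.size() == 15 inos released in a single inotable projection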
1422
11fdf7f2 1423void Server::reconnect_clients(MDSContext *reconnect_done_)
7c673cae
FG
1424{
1425 reconnect_done = reconnect_done_;
28e407b8 1426
11fdf7f2 1427 auto now = clock::now();
28e407b8
AA
1428 set<Session*> sessions;
1429 mds->sessionmap.get_client_session_set(sessions);
1430 for (auto session : sessions) {
11fdf7f2
TL
1431 if (session->is_open()) {
1432 client_reconnect_gather.insert(session->get_client());
92f5a8d4 1433 session->set_reconnecting(true);
11fdf7f2
TL
1434 session->last_cap_renew = now;
1435 }
28e407b8 1436 }
7c673cae
FG
1437
1438 if (client_reconnect_gather.empty()) {
1439 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
1440 reconnect_gather_finish();
1441 return;
1442 }
1443
1444 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1445
11fdf7f2 1446 reconnect_start = now;
7c673cae
FG
1447 dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
1448 mds->sessionmap.dump();
1449}
1450
9f95a23c 1451void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
7c673cae 1452{
11fdf7f2
TL
1453 dout(7) << "handle_client_reconnect " << m->get_source()
1454 << (m->has_more() ? " (more)" : "") << dendl;
7c673cae 1455 client_t from = m->get_source().num();
94b18763 1456 Session *session = mds->get_session(m);
92f5a8d4
TL
1457 if (!session) {
1458 dout(0) << " ignoring sessionless msg " << *m << dendl;
9f95a23c 1459 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
92f5a8d4
TL
1460 reply->metadata["error_string"] = "sessionless";
1461 mds->send_message(reply, m->get_connection());
81eedcae 1462 return;
92f5a8d4
TL
1463 }
1464
1e59de90
TL
1465 if (mds->mdsmap->test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION)) {
1466 mds->clog->warn() << "client could not reconnect as"
1467 " file system flag refuse_client_session is set";
1468 dout(0) << "client cannot reconnect when file system flag"
1469 " refuse_client_session is set" << dendl;
1470 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1471 reply->metadata["error_string"] = "client cannot reconnect when file system flag"
1472 " refuse_client_session is set";
1473 mds->send_message(reply, m->get_connection());
1474 return;
1475 }
1476
92f5a8d4
TL
1477 if (!session->is_open()) {
1478 dout(0) << " ignoring msg from not-open session" << *m << dendl;
9f95a23c 1479 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
92f5a8d4
TL
1480 mds->send_message(reply, m->get_connection());
1481 return;
1482 }
7c673cae 1483
f67539c2
TL
1484 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1485
7c673cae
FG
1486 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1487 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1488 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1489 return;
1490 }
1491
f64942e4 1492 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
7c673cae
FG
1493 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1494
1495 bool deny = false;
f67539c2 1496 if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
7c673cae 1497 // XXX maybe in the future we can do better than this?
f67539c2
TL
1498 if (reconnect_all_deny) {
1499 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
1500 } else {
1501 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1502 }
7c673cae
FG
1503 mds->clog->info() << "denied reconnect attempt (mds is "
1504 << ceph_mds_state_name(mds->get_state())
1505 << ") from " << m->get_source_inst()
11fdf7f2 1506 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
7c673cae 1507 deny = true;
11fdf7f2
TL
1508 } else {
1509 std::string error_str;
1510 if (!session->is_open()) {
1511 error_str = "session is closed";
1512 } else if (mdcache->is_readonly()) {
1513 error_str = "mds is readonly";
1514 } else {
1515 if (session->info.client_metadata.features.empty())
1516 infer_supported_features(session, session->info.client_metadata);
1517
1518 feature_bitset_t missing_features = required_client_features;
1519 missing_features -= session->info.client_metadata.features;
1520 if (!missing_features.empty()) {
f67539c2
TL
1521 CachedStackStringStream css;
1522 *css << "missing required features '" << missing_features << "'";
1523 error_str = css->strv();
11fdf7f2
TL
1524 }
1525 }
1526
1527 if (!error_str.empty()) {
1528 deny = true;
1529 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1530 mds->clog->info() << "denied reconnect attempt from "
1531 << m->get_source_inst() << " (" << error_str << ")";
1532 }
7c673cae
FG
1533 }
1534
1535 if (deny) {
9f95a23c 1536 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
11fdf7f2 1537 mds->send_message_client(r, session);
f67539c2
TL
1538 if (session->is_open()) {
1539 client_reconnect_denied.insert(session->get_client());
1540 }
7c673cae
FG
1541 return;
1542 }
1543
11fdf7f2 1544 if (!m->has_more()) {
f67539c2 1545 metrics_handler->add_session(session);
11fdf7f2 1546 // notify client of success with an OPEN
9f95a23c 1547 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
33c7a0ef 1548 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
11fdf7f2 1549 reply->supported_features = supported_features;
33c7a0ef
TL
1550 reply->metric_spec = supported_metric_spec;
1551 }
11fdf7f2
TL
1552 mds->send_message_client(reply, session);
1553 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1554 }
1555
91327a77 1556 session->last_cap_renew = clock::now();
7c673cae
FG
1557
1558 // snaprealms
11fdf7f2
TL
1559 for (const auto &r : m->realms) {
1560 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
7c673cae
FG
1561 if (in && in->state_test(CInode::STATE_PURGING))
1562 continue;
1563 if (in) {
11fdf7f2
TL
1564 if (in->snaprealm) {
1565 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
7c673cae 1566 } else {
11fdf7f2
TL
1567 // this can happen if we are non-auth or we rolled back the snaprealm
1568 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
7c673cae 1569 }
11fdf7f2 1570 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
7c673cae 1571 } else {
11fdf7f2
TL
1572 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1573 << " seq " << r.realm.seq << dendl;
1574 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
7c673cae
FG
1575 }
1576 }
1577
1578 // caps
11fdf7f2 1579 for (const auto &p : m->caps) {
7c673cae 1580 // make sure our last_cap_id is MAX over all issued caps
11fdf7f2
TL
1581 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1582 mdcache->last_cap_id = p.second.capinfo.cap_id;
7c673cae 1583
11fdf7f2 1584 CInode *in = mdcache->get_inode(p.first);
7c673cae
FG
1585 if (in && in->state_test(CInode::STATE_PURGING))
1586 continue;
1587 if (in && in->is_auth()) {
1588 // we recovered it, and it's ours. take note.
11fdf7f2 1589 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
7c673cae 1590 << " on " << *in << dendl;
11fdf7f2
TL
1591 in->reconnect_cap(from, p.second, session);
1592 mdcache->add_reconnected_cap(from, p.first, p.second);
1593 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
7c673cae
FG
1594 continue;
1595 }
1596
1597 if (in && !in->is_auth()) {
1598 // not mine.
1599 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1600 // add to cap export list.
11fdf7f2
TL
1601 mdcache->rejoin_export_caps(p.first, from, p.second,
1602 in->authority().first, true);
7c673cae
FG
1603 } else {
1604 // don't know if the inode is mine
11fdf7f2
TL
1605 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1606 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
7c673cae
FG
1607 }
1608 }
1609
f64942e4
AA
1610 reconnect_last_seen = clock::now();
1611
11fdf7f2
TL
1612 if (!m->has_more()) {
1613 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1614
1615 // remove from gather set
1616 client_reconnect_gather.erase(from);
92f5a8d4 1617 session->set_reconnecting(false);
11fdf7f2
TL
1618 if (client_reconnect_gather.empty())
1619 reconnect_gather_finish();
1620 }
1621}
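// Editorial note: MClientReconnect may be split across several messages
// (has_more() is true on all but the last chunk). The snaprealm and cap
// tables are merged chunk by chunk, but only the final chunk sends the
// CEPH_SESSION_OPEN reply and erases the client from client_reconnect_gather.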
1622
1623void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1624{
1625 int supported = -1;
1626 auto it = client_metadata.find("ceph_version");
1627 if (it != client_metadata.end()) {
1628 // user space client
1629 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1630 supported = CEPHFS_FEATURE_LUMINOUS;
1631 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1632 supported = CEPHFS_FEATURE_KRAKEN;
1633 } else {
1634 it = client_metadata.find("kernel_version");
1635 if (it != client_metadata.end()) {
1636 // kernel client
1637 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1638 supported = CEPHFS_FEATURE_LUMINOUS;
1639 }
1640 }
1641 if (supported == -1 &&
1642 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1643 supported = CEPHFS_FEATURE_JEWEL;
7c673cae 1644
11fdf7f2
TL
1645 if (supported >= 0) {
1646 unsigned long value = (1UL << (supported + 1)) - 1;
1647 client_metadata.features = feature_bitset_t(value);
1648 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1649 }
7c673cae
FG
1650}
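// Editorial sketch: the mask computed above sets every feature bit up to
// and including 'supported'. For example, with supported == 5:
//
//   (1UL << (5 + 1)) - 1 == 0x3f == 0b111111  // bits 0..5 set
//
// so a client inferred at a given release is credited with all feature
// bits up to and including that release's bit.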
1651
11fdf7f2
TL
1652void Server::update_required_client_features()
1653{
f67539c2 1654 required_client_features = mds->mdsmap->get_required_client_features();
11fdf7f2
TL
1655 dout(7) << "required_client_features: " << required_client_features << dendl;
1656
1657 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1658 set<Session*> sessions;
1659 mds->sessionmap.get_client_session_set(sessions);
1660 for (auto session : sessions) {
1661 feature_bitset_t missing_features = required_client_features;
1662 missing_features -= session->info.client_metadata.features;
1663 if (!missing_features.empty()) {
f67539c2 1664 bool blocklisted = mds->objecter->with_osdmap(
11fdf7f2 1665 [session](const OSDMap &osd_map) -> bool {
f67539c2 1666 return osd_map.is_blocklisted(session->info.inst.addr);
11fdf7f2 1667 });
f67539c2 1668 if (blocklisted)
11fdf7f2 1669 continue;
7c673cae 1670
11fdf7f2
TL
1671 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1672 << missing_features << "'";
f67539c2 1673 CachedStackStringStream css;
11fdf7f2 1674 mds->evict_client(session->get_client().v, false,
f67539c2 1675 g_conf()->mds_session_blocklist_on_evict, *css);
11fdf7f2
TL
1676 }
1677 }
1678 }
1679}
7c673cae
FG
1680
1681void Server::reconnect_gather_finish()
1682{
1683 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
11fdf7f2
TL
1684 ceph_assert(reconnect_done);
1685
1686 if (!mds->snapclient->is_synced()) {
1687 // make sure snaptable cache is populated. snaprealms will be
1688 // extensively used in rejoin stage.
1689 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1690 mds->snapclient->wait_for_sync(reconnect_done);
1691 } else {
1692 reconnect_done->complete(0);
1693 }
7c673cae
FG
1694 reconnect_done = NULL;
1695}
1696
1697void Server::reconnect_tick()
1698{
f67539c2 1699 bool reject_all_reconnect = false;
31f18b77 1700 if (reconnect_evicting) {
f64942e4 1701 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
31f18b77
FG
1702 return;
1703 }
1704
f67539c2
TL
1705 /*
1706 * Set mds_deny_all_reconnect to reject all reconnect requests, so that
1707 * less metadata is loaded in the rejoin phase. This shortens reboot time.
1708 * Moreover, loading less metadata increases the chance that a standby with less memory can fail over.
1709 *
1710 * Why not shorten the reconnect period instead?
1711 * Clients may send unsafe or retried requests, which had not been
1712 * completed before the old mds stopped, to the new mds. These requests may
1713 * need to be processed during the new mds's clientreplay phase,
1714 * see: https://github.com/ceph/ceph/pull/29059.
1715 */
1716 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
f64942e4
AA
1717 if (client_reconnect_gather.empty())
1718 return;
31f18b77 1719
f67539c2
TL
1720 if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
1721 reject_all_reconnect = true;
1722
f64942e4
AA
1723 auto now = clock::now();
1724 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
f67539c2 1725 if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
f64942e4 1726 return;
31f18b77 1727
f64942e4
AA
1728 vector<Session*> remaining_sessions;
1729 remaining_sessions.reserve(client_reconnect_gather.size());
1730 for (auto c : client_reconnect_gather) {
1731 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1732 ceph_assert(session);
1733 remaining_sessions.push_back(session);
1734 // client re-sends cap flush messages before the reconnect message
1735 if (session->last_seen > reconnect_last_seen)
1736 reconnect_last_seen = session->last_seen;
1737 }
31f18b77 1738
f64942e4 1739 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
f67539c2 1740 if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
f64942e4
AA
1741 dout(7) << "reconnect_tick: last seen " << elapse2
1742 << " seconds ago, extending reconnect interval" << dendl;
1743 return;
1744 }
1745
1746 dout(7) << "reconnect timed out, " << remaining_sessions.size()
f67539c2 1747 << " clients have not reconnected in time" << dendl;
f64942e4 1748
f67539c2 1749 // If we're doing blocklist evictions, use this to wait for them before
f64942e4
AA
1750 // proceeding to reconnect_gather_finish
1751 MDSGatherBuilder gather(g_ceph_context);
1752
1753 for (auto session : remaining_sessions) {
11fdf7f2
TL
1754 // Keep sessions that have specified a timeout. These sessions will prevent
1755 // the mds from going active; it goes active only after they all have been
1756 // killed or reclaimed.
1757 if (session->info.client_metadata.find("timeout") !=
1758 session->info.client_metadata.end()) {
1759 dout(1) << "reconnect keeps " << session->info.inst
1760 << ", need to be reclaimed" << dendl;
1761 client_reclaim_gather.insert(session->get_client());
1762 continue;
1763 }
1764
f64942e4 1765 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
31f18b77 1766
f64942e4
AA
1767 mds->clog->warn() << "evicting unresponsive client " << *session
1768 << ", after waiting " << elapse1
1769 << " seconds during MDS startup";
1770
f67539c2
TL
1771 // make _session_logged() purge orphan objects of lost async/unsafe requests
1772 session->delegated_inos.swap(session->free_prealloc_inos);
1773
1774 if (g_conf()->mds_session_blocklist_on_timeout) {
1775 CachedStackStringStream css;
1776 mds->evict_client(session->get_client().v, false, true, *css,
f64942e4 1777 gather.new_sub());
31f18b77 1778 } else {
f67539c2 1779 kill_session(session, NULL);
31f18b77 1780 }
f64942e4
AA
1781
1782 failed_reconnects++;
1783 }
1784 client_reconnect_gather.clear();
f67539c2 1785 client_reconnect_denied.clear();
f64942e4
AA
1786
1787 if (gather.has_subs()) {
1788 dout(1) << "reconnect will complete once clients are evicted" << dendl;
9f95a23c 1789 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
f64942e4
AA
1790 [this](int r){reconnect_gather_finish();})));
1791 gather.activate();
1792 reconnect_evicting = true;
1793 } else {
1794 reconnect_gather_finish();
7c673cae
FG
1795 }
1796}
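// Editorial sketch of the two timers above, assuming the commonly shipped
// default mds_reconnect_timeout = 45:
//
//   elapse1 = now - reconnect_start;      // tick returns while < 45s
//   elapse2 = now - reconnect_last_seen;  // and again while < 22.5s
//
// unless every still-missing client has already been denied
// (reject_all_reconnect), in which case eviction proceeds immediately.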
1797
1798void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1799{
1800 if (!locks.length()) return;
1801 int numlocks;
1802 ceph_filelock lock;
11fdf7f2
TL
1803 auto p = locks.cbegin();
1804 decode(numlocks, p);
7c673cae 1805 for (int i = 0; i < numlocks; ++i) {
11fdf7f2 1806 decode(lock, p);
7c673cae
FG
1807 lock.client = client;
1808 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1809 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1810 }
11fdf7f2 1811 decode(numlocks, p);
7c673cae 1812 for (int i = 0; i < numlocks; ++i) {
11fdf7f2 1813 decode(lock, p);
7c673cae
FG
1814 lock.client = client;
1815 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1816 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1817 }
1818}
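// Editorial note: the bufferlist decoded above carries two counted lists,
// fcntl (POSIX byte-range) locks first, then flock locks:
//
//   u32 numlocks; ceph_filelock[numlocks]  // fcntl locks
//   u32 numlocks; ceph_filelock[numlocks]  // flock locks
//
// with each lock's client field overwritten by the reconnecting client id.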
1819
7c673cae
FG
1820/**
1821 * Call this when the MDCache is oversized, to send requests to the clients
1822 * to trim some caps, and consequently unpin some inodes in the MDCache so
1823 * that it can trim too.
1824 */
a8e16298
TL
1825std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1826{
1827 const auto now = clock::now();
92f5a8d4
TL
1828 const bool steady = !!(flags&RecallFlags::STEADY);
1829 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1830 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1831 const bool trim = !!(flags&RecallFlags::TRIM);
a8e16298 1832
11fdf7f2
TL
1833 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1834 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1835 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1836 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1837 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
92f5a8d4 1838 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
a8e16298
TL
1839
1840 dout(7) << __func__ << ":"
1841 << " min=" << min_caps_per_client
1842 << " max=" << max_caps_per_client
1843 << " total=" << Capability::count()
92f5a8d4 1844 << " flags=" << flags
a8e16298 1845 << dendl;
f64942e4 1846
a8e16298
TL
1847 /* trim caps of sessions with the most caps first */
1848 std::multimap<uint64_t, Session*> caps_session;
92f5a8d4 1849 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
a8e16298 1850 auto num_caps = s->caps.size();
92f5a8d4
TL
1851 auto cache_liveness = s->get_session_cache_liveness();
1852 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
a8e16298
TL
1853 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1854 }
1855 };
1856 mds->sessionmap.get_client_sessions(std::move(f));
1857
1858 std::pair<bool, uint64_t> result = {false, 0};
11fdf7f2 1859 auto& [throttled, caps_recalled] = result;
a8e16298 1860 last_recall_state = now;
11fdf7f2 1861 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
7c673cae 1862 if (!session->is_open() ||
11fdf7f2 1863 !session->get_connection() ||
7c673cae
FG
1864 !session->info.inst.name.is_client())
1865 continue;
1866
a8e16298
TL
1867 dout(10) << __func__ << ":"
1868 << " session " << session->info.inst
1869 << " caps " << num_caps
7c673cae
FG
1870 << ", leases " << session->leases.size()
1871 << dendl;
1872
a8e16298
TL
1873 uint64_t newlim;
1874 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1875 newlim = min_caps_per_client;
1876 } else {
1877 newlim = num_caps-recall_max_caps;
1878 }
1879 if (num_caps > newlim) {
1880 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1881 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1882 newlim = num_caps-recall;
1883 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
11fdf7f2
TL
1884 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1885 const uint64_t global_recall_throttle = recall_throttle.get();
a8e16298
TL
1886 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1887 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1888 throttled = true;
1889 continue;
11fdf7f2
TL
1890 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1891 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1892 throttled = true;
1893 continue;
a8e16298
TL
1894 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1895 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1896 throttled = true;
1897 break;
1898 }
1899
1900 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1901 if (steady) {
1902 const auto session_recall = session->get_recall_caps();
1903 const auto session_release = session->get_release_caps();
1904 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1905 /* The session has been unable to keep up with the number of caps
1906 * recalled (by half); additionally, to prevent marking sessions
1907 * we've just begun to recall from, the session_recall counter
1908 * (decayed count of caps recently recalled) is **greater** than the
1909 * session threshold for the session's cap recall throttle.
1910 */
1911 dout(15) << " 2*session_release < session_recall"
11fdf7f2
TL
1912 " (2*" << session_release << " < " << session_recall << ") &&"
1913 " 2*session_recall < recall_max_decay_threshold"
1914 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
a8e16298
TL
1915 " Skipping because we are unlikely to get more released." << dendl;
1916 continue;
1917 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1918 /* The number of caps recalled is less than the number we *could*
1919 * recall (so there isn't much left to recall?) and it is also less
1920 * than half the current recall_caps counter (decayed count of caps
1921 * recently recalled).
1922 */
1923 dout(15) << " 2*recall < session_recall "
1924 " (2*" << recall << " < " << session_recall << ") &&"
1925 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1926 " Skipping because we are unlikely to get more released." << dendl;
1927 continue;
1928 }
1929 }
1930
1931 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1932
9f95a23c 1933 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
3efd9988
FG
1934 m->head.max_caps = newlim;
1935 mds->send_message_client(m, session);
a8e16298 1936 if (gather) {
f91f0fd5 1937 flush_session(session, *gather);
f64942e4 1938 }
a8e16298 1939 caps_recalled += session->notify_recall_sent(newlim);
11fdf7f2 1940 recall_throttle.hit(recall);
7c673cae
FG
1941 }
1942 }
a8e16298
TL
1943
1944 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1945
1946 return result;
7c673cae
FG
1947}
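// Editorial sketch of the newlim arithmetic above, with assumed values
// recall_max_caps = 5000 and min_caps_per_client = 100:
//
//   num_caps = 10000: newlim = 10000 - 5000 = 5000, recall = 5000
//   num_caps = 5050:  5050 - 5000 < 100, so newlim starts at 100 and
//                     recall = min(5000, 5050 - 100) = 4950
//
// Even then, the per-session and global decay counters may veto the recall.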
1948
1949void Server::force_clients_readonly()
1950{
1951 dout(10) << "force_clients_readonly" << dendl;
1952 set<Session*> sessions;
1953 mds->sessionmap.get_client_session_set(sessions);
1954 for (set<Session*>::const_iterator p = sessions.begin();
1955 p != sessions.end();
1956 ++p) {
1957 Session *session = *p;
1958 if (!session->info.inst.name.is_client() ||
1959 !(session->is_open() || session->is_stale()))
1960 continue;
9f95a23c 1961 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
7c673cae
FG
1962 }
1963}
1964
1965/*******
1966 * some generic stuff for finishing off requests
1967 */
1968void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1969{
1970 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
11fdf7f2 1971 ceph_assert(!mdr->has_completed);
7c673cae
FG
1972
1973 // note trace items for eventual reply.
1974 mdr->tracei = in;
1975 if (in)
1976 mdr->pin(in);
1977
1978 mdr->tracedn = dn;
1979 if (dn)
1980 mdr->pin(dn);
1981
1982 early_reply(mdr, in, dn);
aee94f69 1983
7c673cae
FG
1984 mdr->committing = true;
1985 submit_mdlog_entry(le, fin, mdr, __func__);
aee94f69 1986
7c673cae
FG
1987 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1988 if (mds->queue_one_replay()) {
1989 dout(10) << " queued next replay op" << dendl;
1990 } else {
11fdf7f2 1991 dout(10) << " journaled last replay op" << dendl;
7c673cae 1992 }
aee94f69 1993 } else if (mdr->did_early_reply)
b32b8144 1994 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
aee94f69 1995 else
7c673cae
FG
1996 mdlog->flush();
1997}
1998
1999void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
11fdf7f2 2000 std::string_view event)
7c673cae
FG
2001{
2002 if (mdr) {
2003 string event_str("submit entry: ");
2004 event_str += event;
11fdf7f2 2005 mdr->mark_event(event_str);
7c673cae
FG
2006 }
2007 mdlog->submit_entry(le, fin);
2008}
2009
2010/*
2011 * send response built from mdr contents and error code; clean up mdr
2012 */
2013void Server::respond_to_request(MDRequestRef& mdr, int r)
2014{
2015 if (mdr->client_request) {
f91f0fd5
TL
2016 if (mdr->is_batch_head()) {
2017 dout(20) << __func__ << " batch head " << *mdr << dendl;
2018 mdr->release_batch_op()->respond(r);
9f95a23c
TL
2019 } else {
2020 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
2021 }
7c673cae
FG
2022 } else if (mdr->internal_op > -1) {
2023 dout(10) << "respond_to_request on internal request " << mdr << dendl;
2024 if (!mdr->internal_op_finish)
11fdf7f2 2025 ceph_abort_msg("trying to respond to internal op without finisher");
7c673cae
FG
2026 mdr->internal_op_finish->complete(r);
2027 mdcache->request_finish(mdr);
2028 }
2029}
2030
91327a77 2031// record per-op statistics: MDS request count and latency
9f95a23c 2032void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
91327a77
AA
2033{
2034 int code = l_mdss_first;
2035 switch(req->get_op()) {
2036 case CEPH_MDS_OP_LOOKUPHASH:
2037 code = l_mdss_req_lookuphash_latency;
2038 break;
2039 case CEPH_MDS_OP_LOOKUPINO:
2040 code = l_mdss_req_lookupino_latency;
2041 break;
2042 case CEPH_MDS_OP_LOOKUPPARENT:
2043 code = l_mdss_req_lookupparent_latency;
2044 break;
2045 case CEPH_MDS_OP_LOOKUPNAME:
2046 code = l_mdss_req_lookupname_latency;
2047 break;
2048 case CEPH_MDS_OP_LOOKUP:
2049 code = l_mdss_req_lookup_latency;
2050 break;
2051 case CEPH_MDS_OP_LOOKUPSNAP:
2052 code = l_mdss_req_lookupsnap_latency;
2053 break;
2054 case CEPH_MDS_OP_GETATTR:
2055 code = l_mdss_req_getattr_latency;
2056 break;
2057 case CEPH_MDS_OP_SETATTR:
2058 code = l_mdss_req_setattr_latency;
2059 break;
2060 case CEPH_MDS_OP_SETLAYOUT:
2061 code = l_mdss_req_setlayout_latency;
2062 break;
2063 case CEPH_MDS_OP_SETDIRLAYOUT:
2064 code = l_mdss_req_setdirlayout_latency;
2065 break;
1d09f67e
TL
2066 case CEPH_MDS_OP_GETVXATTR:
2067 code = l_mdss_req_getvxattr_latency;
2068 break;
91327a77
AA
2069 case CEPH_MDS_OP_SETXATTR:
2070 code = l_mdss_req_setxattr_latency;
2071 break;
2072 case CEPH_MDS_OP_RMXATTR:
2073 code = l_mdss_req_rmxattr_latency;
2074 break;
2075 case CEPH_MDS_OP_READDIR:
2076 code = l_mdss_req_readdir_latency;
2077 break;
2078 case CEPH_MDS_OP_SETFILELOCK:
2079 code = l_mdss_req_setfilelock_latency;
2080 break;
2081 case CEPH_MDS_OP_GETFILELOCK:
2082 code = l_mdss_req_getfilelock_latency;
2083 break;
2084 case CEPH_MDS_OP_CREATE:
2085 code = l_mdss_req_create_latency;
2086 break;
2087 case CEPH_MDS_OP_OPEN:
2088 code = l_mdss_req_open_latency;
2089 break;
2090 case CEPH_MDS_OP_MKNOD:
2091 code = l_mdss_req_mknod_latency;
2092 break;
2093 case CEPH_MDS_OP_LINK:
2094 code = l_mdss_req_link_latency;
2095 break;
2096 case CEPH_MDS_OP_UNLINK:
2097 code = l_mdss_req_unlink_latency;
2098 break;
2099 case CEPH_MDS_OP_RMDIR:
2100 code = l_mdss_req_rmdir_latency;
2101 break;
2102 case CEPH_MDS_OP_RENAME:
2103 code = l_mdss_req_rename_latency;
2104 break;
2105 case CEPH_MDS_OP_MKDIR:
2106 code = l_mdss_req_mkdir_latency;
2107 break;
2108 case CEPH_MDS_OP_SYMLINK:
2109 code = l_mdss_req_symlink_latency;
2110 break;
2111 case CEPH_MDS_OP_LSSNAP:
2112 code = l_mdss_req_lssnap_latency;
2113 break;
2114 case CEPH_MDS_OP_MKSNAP:
2115 code = l_mdss_req_mksnap_latency;
2116 break;
2117 case CEPH_MDS_OP_RMSNAP:
2118 code = l_mdss_req_rmsnap_latency;
2119 break;
2120 case CEPH_MDS_OP_RENAMESNAP:
2121 code = l_mdss_req_renamesnap_latency;
2122 break;
aee94f69
TL
2123 case CEPH_MDS_OP_READDIR_SNAPDIFF:
2124 code = l_mdss_req_snapdiff_latency;
2125 break;
20effc67
TL
2126 default:
2127 dout(1) << ": unknown client op" << dendl;
2128 return;
91327a77
AA
2129 }
2130 logger->tinc(code, lat);
2131}
2132
7c673cae
FG
2133void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
2134{
11fdf7f2 2135 if (!g_conf()->mds_early_reply)
7c673cae
FG
2136 return;
2137
b32b8144
FG
2138 if (mdr->no_early_reply) {
2139 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
2140 return;
2141 }
2142
f67539c2
TL
2143 if (mdr->has_more() && mdr->more()->has_journaled_peers) {
2144 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
7c673cae
FG
2145 return;
2146 }
2147
2148 if (mdr->alloc_ino) {
2149 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2150 return;
2151 }
2152
9f95a23c 2153 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
2154 entity_inst_t client_inst = req->get_source_inst();
2155 if (client_inst.name.is_mds())
2156 return;
2157
2158 if (req->is_replay()) {
2159 dout(10) << " no early reply on replay op" << dendl;
2160 return;
2161 }
2162
2163
9f95a23c 2164 auto reply = make_message<MClientReply>(*req, 0);
7c673cae
FG
2165 reply->set_unsafe();
2166
2167 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2168 //
2169 // _rename_finish() does not send dentry link/unlink messages to replicas,
2170 // so do not mark xlocks on dentries "done"; the xlocks prevent dentries
2171 // that have projected linkages from getting new replicas.
2172 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2173
2174 dout(10) << "early_reply " << reply->get_result()
2175 << " (" << cpp_strerror(reply->get_result())
2176 << ") " << *req << dendl;
2177
2178 if (tracei || tracedn) {
2179 if (tracei)
2180 mdr->cap_releases.erase(tracei->vino());
2181 if (tracedn)
2182 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2183
9f95a23c 2184 set_trace_dist(reply, tracei, tracedn, mdr);
7c673cae
FG
2185 }
2186
2187 reply->set_extra_bl(mdr->reply_extra_bl);
11fdf7f2 2188 mds->send_message_client(reply, mdr->session);
7c673cae
FG
2189
2190 mdr->did_early_reply = true;
2191
2192 mds->logger->inc(l_mds_reply);
2193 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2194 mds->logger->tinc(l_mds_reply_latency, lat);
33c7a0ef
TL
2195 if (lat >= g_conf()->mds_op_complaint_time) {
2196 mds->logger->inc(l_mds_slow_reply);
2197 }
91327a77
AA
2198 if (client_inst.name.is_client()) {
2199 mds->sessionmap.hit_session(mdr->session);
2200 }
2201 perf_gather_op_latency(req, lat);
7c673cae
FG
2202 dout(20) << "lat " << lat << dendl;
2203
2204 mdr->mark_event("early_replied");
2205}
2206
2207/*
2208 * send given reply
2209 * include a trace to tracei
2210 * Clean up mdr
2211 */
9f95a23c 2212void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
7c673cae 2213{
11fdf7f2 2214 ceph_assert(mdr.get());
9f95a23c 2215 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
2216
2217 dout(7) << "reply_client_request " << reply->get_result()
2218 << " (" << cpp_strerror(reply->get_result())
2219 << ") " << *req << dendl;
2220
2221 mdr->mark_event("replying");
2222
2223 Session *session = mdr->session;
2224
2225 // note successful request in session map?
2226 //
2227 // setfilelock requests are special: they only modify state in MDS memory.
2228 // That state is lost when the MDS fails. If a client re-sends a completed
2229 // setfilelock request, it means the client did not receive the corresponding
2230 // setfilelock reply, so the MDS should re-execute the request.
2231 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2232 reply->get_result() == 0 && session) {
2233 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2234 session->add_completed_request(mdr->reqid.tid, created);
2235 if (mdr->ls) {
2236 mdr->ls->touched_sessions.insert(session->info.inst.name);
2237 }
2238 }
2239
2240 // give any preallocated inos to the session
2241 apply_allocated_inos(mdr, session);
2242
2243 // get tracei/tracedn from mdr?
7c673cae
FG
2244 CInode *tracei = mdr->tracei;
2245 CDentry *tracedn = mdr->tracedn;
2246
2247 bool is_replay = mdr->client_request->is_replay();
2248 bool did_early_reply = mdr->did_early_reply;
2249 entity_inst_t client_inst = req->get_source_inst();
7c673cae
FG
2250
2251 if (!did_early_reply && !is_replay) {
2252
2253 mds->logger->inc(l_mds_reply);
2254 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2255 mds->logger->tinc(l_mds_reply_latency, lat);
33c7a0ef
TL
2256 if (lat >= g_conf()->mds_op_complaint_time) {
2257 mds->logger->inc(l_mds_slow_reply);
2258 }
81eedcae 2259 if (session && client_inst.name.is_client()) {
91327a77
AA
2260 mds->sessionmap.hit_session(session);
2261 }
2262 perf_gather_op_latency(req, lat);
7c673cae
FG
2263 dout(20) << "lat " << lat << dendl;
2264
2265 if (tracei)
2266 mdr->cap_releases.erase(tracei->vino());
2267 if (tracedn)
2268 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2269 }
2270
2271 // drop non-rdlocks before replying, so that we can issue leases
2272 mdcache->request_drop_non_rdlocks(mdr);
2273
2274 // reply at all?
81eedcae 2275 if (session && !client_inst.name.is_mds()) {
7c673cae
FG
2276 // send reply.
2277 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2278 (tracei || tracedn)) {
2279 if (is_replay) {
2280 if (tracei)
2281 mdcache->try_reconnect_cap(tracei, session);
2282 } else {
2283 // include metadata in reply
9f95a23c 2284 set_trace_dist(reply, tracei, tracedn, mdr);
7c673cae
FG
2285 }
2286 }
2287
2288 // We can set the extra bl unconditionally: if it's already been sent in the
2289 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2290 reply->set_extra_bl(mdr->reply_extra_bl);
2291
2292 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
11fdf7f2 2293 mds->send_message_client(reply, session);
7c673cae
FG
2294 }
2295
2296 if (req->is_queued_for_replay() &&
2297 (mdr->has_completed || reply->get_result() < 0)) {
2298 if (reply->get_result() < 0) {
2299 int r = reply->get_result();
2300 derr << "reply_client_request: failed to replay " << *req
2301 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2302 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2303 }
2304 mds->queue_one_replay();
2305 }
2306
2307 // clean up request
2308 mdcache->request_finish(mdr);
2309
2310 // take a closer look at tracei, if it happens to be a remote link
2311 if (tracei &&
2312 tracedn &&
2313 tracedn->get_projected_linkage()->is_remote()) {
2314 mdcache->eval_remote(tracedn);
2315 }
2316}
2317
7c673cae
FG
2318/*
2319 * pass inode OR dentry (not both, or we may get confused)
2320 *
2321 * trace is in reverse order (i.e. root inode comes last)
2322 */
9f95a23c 2323void Server::set_trace_dist(const ref_t<MClientReply> &reply,
7c673cae 2324 CInode *in, CDentry *dn,
7c673cae
FG
2325 MDRequestRef& mdr)
2326{
2327 // skip doing this for debugging purposes?
11fdf7f2 2328 if (g_conf()->mds_inject_traceless_reply_probability &&
7c673cae 2329 mdr->ls && !mdr->o_trunc &&
11fdf7f2 2330 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
7c673cae
FG
2331 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2332 return;
2333 }
2334
2335 // inode, dentry, dir, ..., inode
2336 bufferlist bl;
2337 mds_rank_t whoami = mds->get_nodeid();
9f95a23c
TL
2338 Session *session = mdr->session;
2339 snapid_t snapid = mdr->snapid;
7c673cae
FG
2340 utime_t now = ceph_clock_now();
2341
2342 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2343
7c673cae
FG
2344 // realm
2345 if (snapid == CEPH_NOSNAP) {
2346 SnapRealm *realm;
2347 if (in)
2348 realm = in->find_snaprealm();
2349 else
2350 realm = dn->get_dir()->get_inode()->find_snaprealm();
1e59de90 2351 reply->snapbl = get_snap_trace(session, realm);
7c673cae
FG
2352 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2353 }
2354
2355 // dir + dentry?
2356 if (dn) {
2357 reply->head.is_dentry = 1;
2358 CDir *dir = dn->get_dir();
2359 CInode *diri = dir->get_inode();
2360
2361 diri->encode_inodestat(bl, session, NULL, snapid);
2362 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2363
2364#ifdef MDS_VERIFY_FRAGSTAT
2365 if (dir->is_complete())
2366 dir->verify_fragstat();
2367#endif
11fdf7f2
TL
2368 DirStat ds;
2369 ds.frag = dir->get_frag();
2370 ds.auth = dir->get_dir_auth().first;
f91f0fd5 2371 if (dir->is_auth() && !forward_all_requests_to_auth)
11fdf7f2
TL
2372 dir->get_dist_spec(ds.dist, whoami);
2373
2374 dir->encode_dirstat(bl, session->info, ds);
7c673cae
FG
2375 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2376
11fdf7f2 2377 encode(dn->get_name(), bl);
2a845540 2378 mds->locker->issue_client_lease(dn, in, mdr, now, bl);
7c673cae
FG
2379 } else
2380 reply->head.is_dentry = 0;
2381
2382 // inode
2383 if (in) {
2384 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
aee94f69
TL
2385 dout(20) << "set_trace_dist added snap " << snapid << " in " << *in
2386 << dendl;
7c673cae
FG
2387 reply->head.is_target = 1;
2388 } else
2389 reply->head.is_target = 0;
2390
2391 reply->set_trace(bl);
2392}
2393
9f95a23c 2394void Server::handle_client_request(const cref_t<MClientRequest> &req)
7c673cae
FG
2395{
2396 dout(4) << "handle_client_request " << *req << dendl;
2397
2398 if (mds->logger)
2399 mds->logger->inc(l_mds_request);
2400 if (logger)
2401 logger->inc(l_mdss_handle_client_request);
2402
2403 if (!mdcache->is_open()) {
2404 dout(5) << "waiting for root" << dendl;
2405 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2406 return;
2407 }
2408
92f5a8d4 2409 bool sessionclosed_isok = replay_unsafe_with_closed_session;
7c673cae
FG
2410 // active session?
2411 Session *session = 0;
39ae355f 2412 if (req->is_a_client()) {
94b18763 2413 session = mds->get_session(req);
7c673cae
FG
2414 if (!session) {
2415 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
92f5a8d4 2416 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
7c673cae
FG
2417 session->is_closing() ||
2418 session->is_killing()) {
2419 dout(5) << "session closed|closing|killing, dropping" << dendl;
2420 session = NULL;
2421 }
2422 if (!session) {
2423 if (req->is_queued_for_replay())
2424 mds->queue_one_replay();
7c673cae
FG
2425 return;
2426 }
2427 }
2428
2429 // old mdsmap?
2430 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2431 // send it? hrm, this isn't ideal; they may get a lot of copies if
2432 // they have a high request rate.
2433 }
2434
2435 // completed request?
2436 bool has_completed = false;
2437 if (req->is_replay() || req->get_retry_attempt()) {
11fdf7f2 2438 ceph_assert(session);
7c673cae
FG
2439 inodeno_t created;
2440 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2441 has_completed = true;
92f5a8d4
TL
2442 if (!session->is_open())
2443 return;
7c673cae
FG
2444 // Don't send a traceless reply if the completed request has created a
2445 // new inode. Treat the request as a lookup request instead.
2446 if (req->is_replay() ||
2447 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2448 req->get_op() != CEPH_MDS_OP_OPEN &&
2449 req->get_op() != CEPH_MDS_OP_CREATE)) {
2450 dout(5) << "already completed " << req->get_reqid() << dendl;
9f95a23c 2451 auto reply = make_message<MClientReply>(*req, 0);
7c673cae
FG
2452 if (created != inodeno_t()) {
2453 bufferlist extra;
11fdf7f2 2454 encode(created, extra);
7c673cae
FG
2455 reply->set_extra_bl(extra);
2456 }
11fdf7f2 2457 mds->send_message_client(reply, session);
7c673cae
FG
2458
2459 if (req->is_queued_for_replay())
2460 mds->queue_one_replay();
2461
7c673cae
FG
2462 return;
2463 }
2464 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2465 req->get_op() != CEPH_MDS_OP_CREATE) {
2466 dout(10) << " completed request which created new inode " << created
2467 << ", convert it to lookup request" << dendl;
2468 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2469 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2470 }
2471 }
2472 }
2473
2474 // trim completed_request list
2475 if (req->get_oldest_client_tid() > 0) {
2476 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
11fdf7f2 2477 ceph_assert(session);
7c673cae
FG
2478 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2479 // Session's 'completed_requests' was dirtied; mark it to be
2480 // potentially flushed at segment expiry.
2481 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2482
2483 if (session->get_num_trim_requests_warnings() > 0 &&
11fdf7f2 2484 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
7c673cae
FG
2485 session->reset_num_trim_requests_warnings();
2486 } else {
2487 if (session->get_num_completed_requests() >=
11fdf7f2 2488 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
7c673cae 2489 session->inc_num_trim_requests_warnings();
f67539c2
TL
2490 CachedStackStringStream css;
2491 *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
7c673cae
FG
2492 << req->get_oldest_client_tid() << "), "
2493 << session->get_num_completed_requests()
2494 << " completed requests recorded in session\n";
f67539c2
TL
2495 mds->clog->warn() << css->strv();
2496 dout(20) << __func__ << " " << css->strv() << dendl;
7c673cae
FG
2497 }
2498 }
2499 }
2500
2501 // register + dispatch
2502 MDRequestRef mdr = mdcache->request_start(req);
2503 if (!mdr.get())
2504 return;
2505
2506 if (session) {
2507 mdr->session = session;
2508 session->requests.push_back(&mdr->item_session_request);
2509 }
2510
2511 if (has_completed)
2512 mdr->has_completed = true;
2513
2514 // process embedded cap releases?
2515 // (only if NOT replay!)
39ae355f 2516 if (!req->releases.empty() && req->is_a_client() && !req->is_replay()) {
7c673cae 2517 client_t client = req->get_source().num();
11fdf7f2
TL
2518 for (const auto &r : req->releases) {
2519 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2520 }
7c673cae
FG
2521 req->releases.clear();
2522 }
2523
2524 dispatch_client_request(mdr);
2525 return;
2526}
2527
2528void Server::handle_osd_map()
2529{
2530 /* Note that we check the OSDMAP_FULL flag directly rather than
2531 * using osdmap_full_flag(), because we want to know "is the flag set"
2532 * rather than "does the flag apply to us?" */
2533 mds->objecter->with_osdmap([this](const OSDMap& o) {
b3b6e05e 2534 auto pi = o.get_pg_pool(mds->get_metadata_pool());
b32b8144 2535 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
7c673cae
FG
2536 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2537 << o.get_epoch() << dendl;
2538 });
2539}
2540
2541void Server::dispatch_client_request(MDRequestRef& mdr)
2542{
2543 // we shouldn't be waiting on anyone.
f67539c2 2544 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());
7c673cae
FG
2545
2546 if (mdr->killed) {
2547 dout(10) << "request " << *mdr << " was killed" << dendl;
9f95a23c
TL
2548 // if the mdr is a "batch_op" and it has followers, pick a follower as
2549 // the new "head of the batch ops" and go on processing the new one.
f91f0fd5
TL
2550 if (mdr->is_batch_head()) {
2551 int mask = mdr->client_request->head.args.getattr.mask;
2552 auto it = mdr->batch_op_map->find(mask);
2553 auto new_batch_head = it->second->find_new_head();
2554 if (!new_batch_head) {
2555 mdr->batch_op_map->erase(it);
9f95a23c
TL
2556 return;
2557 }
f91f0fd5 2558 mdr = std::move(new_batch_head);
9f95a23c
TL
2559 } else {
2560 return;
2561 }
94b18763
FG
2562 } else if (mdr->aborted) {
2563 mdr->aborted = false;
2564 mdcache->request_kill(mdr);
2565 return;
7c673cae
FG
2566 }
2567
9f95a23c 2568 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
2569
2570 if (logger) logger->inc(l_mdss_dispatch_client_request);
2571
2572 dout(7) << "dispatch_client_request " << *req << dendl;
2573
9f95a23c
TL
2574 if (req->may_write() && mdcache->is_readonly()) {
2575 dout(10) << " read-only FS" << dendl;
f67539c2 2576 respond_to_request(mdr, -CEPHFS_EROFS);
9f95a23c
TL
2577 return;
2578 }
f67539c2
TL
2579 if (mdr->has_more() && mdr->more()->peer_error) {
2580 dout(10) << " got error from peers" << dendl;
2581 respond_to_request(mdr, mdr->more()->peer_error);
9f95a23c 2582 return;
7c673cae
FG
2583 }
2584
2585 if (is_full) {
b3b6e05e
TL
2586 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
2587 if (!cur) {
20effc67 2588 // the request is already responded to
b3b6e05e
TL
2589 return;
2590 }
7c673cae
FG
2591 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2592 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2594 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2595 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2596 req->get_op() == CEPH_MDS_OP_CREATE ||
2597 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2598 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2599 ((req->get_op() == CEPH_MDS_OP_LINK ||
2600 req->get_op() == CEPH_MDS_OP_RENAME) &&
f67539c2 2601 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
7c673cae
FG
2602 ) {
2603
b3b6e05e
TL
2604 if (check_access(mdr, cur, MAY_FULL)) {
2605 dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2606 } else {
2607 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2608 respond_to_request(mdr, -CEPHFS_ENOSPC);
2609 return;
2610 }
7c673cae
FG
2611 } else {
2612 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2613 }
2614 }
2615
2616 switch (req->get_op()) {
2617 case CEPH_MDS_OP_LOOKUPHASH:
2618 case CEPH_MDS_OP_LOOKUPINO:
2619 handle_client_lookup_ino(mdr, false, false);
2620 break;
2621 case CEPH_MDS_OP_LOOKUPPARENT:
2622 handle_client_lookup_ino(mdr, true, false);
2623 break;
2624 case CEPH_MDS_OP_LOOKUPNAME:
2625 handle_client_lookup_ino(mdr, false, true);
2626 break;
2627
2628 // inodes ops.
2629 case CEPH_MDS_OP_LOOKUP:
2630 handle_client_getattr(mdr, true);
2631 break;
2632
2633 case CEPH_MDS_OP_LOOKUPSNAP:
2634 // lookupsnap does not reference a CDentry; treat it as a getattr
2635 case CEPH_MDS_OP_GETATTR:
2636 handle_client_getattr(mdr, false);
2637 break;
1d09f67e
TL
2638 case CEPH_MDS_OP_GETVXATTR:
2639 handle_client_getvxattr(mdr);
2640 break;
7c673cae
FG
2641
2642 case CEPH_MDS_OP_SETATTR:
2643 handle_client_setattr(mdr);
2644 break;
2645 case CEPH_MDS_OP_SETLAYOUT:
2646 handle_client_setlayout(mdr);
2647 break;
2648 case CEPH_MDS_OP_SETDIRLAYOUT:
2649 handle_client_setdirlayout(mdr);
2650 break;
2651 case CEPH_MDS_OP_SETXATTR:
2652 handle_client_setxattr(mdr);
2653 break;
2654 case CEPH_MDS_OP_RMXATTR:
2655 handle_client_removexattr(mdr);
2656 break;
2657
2658 case CEPH_MDS_OP_READDIR:
2659 handle_client_readdir(mdr);
2660 break;
2661
2662 case CEPH_MDS_OP_SETFILELOCK:
2663 handle_client_file_setlock(mdr);
2664 break;
2665
2666 case CEPH_MDS_OP_GETFILELOCK:
2667 handle_client_file_readlock(mdr);
2668 break;
2669
2670 // funky.
2671 case CEPH_MDS_OP_CREATE:
2672 if (mdr->has_completed)
2673 handle_client_open(mdr); // already created.. just open
2674 else
2675 handle_client_openc(mdr);
2676 break;
2677
2678 case CEPH_MDS_OP_OPEN:
2679 handle_client_open(mdr);
2680 break;
2681
2682 // namespace.
2683 // no prior locks.
2684 case CEPH_MDS_OP_MKNOD:
2685 handle_client_mknod(mdr);
2686 break;
2687 case CEPH_MDS_OP_LINK:
2688 handle_client_link(mdr);
2689 break;
2690 case CEPH_MDS_OP_UNLINK:
2691 case CEPH_MDS_OP_RMDIR:
2692 handle_client_unlink(mdr);
2693 break;
2694 case CEPH_MDS_OP_RENAME:
2695 handle_client_rename(mdr);
2696 break;
2697 case CEPH_MDS_OP_MKDIR:
2698 handle_client_mkdir(mdr);
2699 break;
2700 case CEPH_MDS_OP_SYMLINK:
2701 handle_client_symlink(mdr);
2702 break;
2703
2704
2705 // snaps
2706 case CEPH_MDS_OP_LSSNAP:
2707 handle_client_lssnap(mdr);
2708 break;
2709 case CEPH_MDS_OP_MKSNAP:
2710 handle_client_mksnap(mdr);
2711 break;
2712 case CEPH_MDS_OP_RMSNAP:
2713 handle_client_rmsnap(mdr);
2714 break;
2715 case CEPH_MDS_OP_RENAMESNAP:
2716 handle_client_renamesnap(mdr);
2717 break;
aee94f69
TL
2718 case CEPH_MDS_OP_READDIR_SNAPDIFF:
2719 handle_client_readdir_snapdiff(mdr);
2720 break;
7c673cae
FG
2721
2722 default:
2723 dout(1) << " unknown client op " << req->get_op() << dendl;
f67539c2 2724 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
7c673cae
FG
2725 }
2726}
2727
2728
2729// ---------------------------------------
f67539c2 2730// PEER REQUESTS
7c673cae 2731
f67539c2 2732void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
7c673cae 2733{
f67539c2 2734 dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
7c673cae
FG
2735 mds_rank_t from = mds_rank_t(m->get_source().num());
2736
f67539c2 2737 if (logger) logger->inc(l_mdss_handle_peer_request);
7c673cae
FG
2738
2739 // reply?
2740 if (m->is_reply())
f67539c2 2741 return handle_peer_request_reply(m);
7c673cae
FG
2742
2743 // the purpose of rename notify is enforcing causal message ordering, i.e. making sure
2744 // bystanders have received all messages from the rename srcdn's auth MDS.
f67539c2
TL
2745 if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
2746 auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
7c673cae 2747 mds->send_message(reply, m->get_connection());
7c673cae
FG
2748 return;
2749 }
2750
2751 CDentry *straydn = NULL;
11fdf7f2 2752 if (m->straybl.length() > 0) {
33c7a0ef 2753 mdcache->decode_replica_stray(straydn, nullptr, m->straybl, from);
11fdf7f2
TL
2754 ceph_assert(straydn);
2755 m->straybl.clear();
7c673cae
FG
2756 }
2757
9f95a23c
TL
2758 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2759 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2760 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2761 return;
2762 }
2763
f67539c2 2764 // am i a new peer?
7c673cae
FG
2765 MDRequestRef mdr;
2766 if (mdcache->have_request(m->get_reqid())) {
2767 // existing?
2768 mdr = mdcache->request_get(m->get_reqid());
2769
2770 // is my request newer?
2771 if (mdr->attempt > m->get_attempt()) {
2772 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2773 << ", dropping " << *m << dendl;
7c673cae
FG
2774 return;
2775 }
2776
7c673cae
FG
2777 if (mdr->attempt < m->get_attempt()) {
2778 // mine is old, close it out
2779 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2780 << ", closing out" << dendl;
2781 mdcache->request_finish(mdr);
2782 mdr.reset();
f67539c2
TL
2783 } else if (mdr->peer_to_mds != from) {
2784 dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
7c673cae
FG
2785 return;
2786 }
2787
f67539c2
TL
2788 // may get these while mdr->peer_request is non-null
2789 if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
9f95a23c
TL
2790 mds->locker->drop_locks(mdr.get());
2791 return;
2792 }
f67539c2 2793 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
9f95a23c
TL
2794 if (m->is_abort()) {
2795 mdr->aborted = true;
f67539c2 2796 if (mdr->peer_request) {
9f95a23c 2797 // only abort on-going xlock, wrlock and auth pin
f67539c2 2798 ceph_assert(!mdr->peer_did_prepare());
9f95a23c
TL
2799 } else {
2800 mdcache->request_finish(mdr);
2801 }
7c673cae 2802 } else {
9f95a23c
TL
2803 if (m->inode_export.length() > 0)
2804 mdr->more()->inode_import = m->inode_export;
2805 // finish off request.
7c673cae
FG
2806 mdcache->request_finish(mdr);
2807 }
2808 return;
2809 }
2810 }
2811 if (!mdr.get()) {
2812 // new?
f67539c2
TL
2813 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2814 dout(10) << "missing peer request for " << m->get_reqid()
7c673cae 2815 << " OP_FINISH, must have lost race with a forward" << dendl;
7c673cae
FG
2816 return;
2817 }
f67539c2 2818 mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
7c673cae
FG
2819 mdr->set_op_stamp(m->op_stamp);
2820 }
f67539c2 2821 ceph_assert(mdr->peer_request == 0); // only one at a time, please!
7c673cae
FG
2822
2823 if (straydn) {
2824 mdr->pin(straydn);
2825 mdr->straydn = straydn;
2826 }
2827
9f95a23c
TL
2828 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2829 mdr->locks.empty()) {
7c673cae
FG
2830 dout(3) << "not active yet, waiting" << dendl;
2831 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2832 return;
2833 }
2834
f67539c2 2835 mdr->reset_peer_request(m);
7c673cae 2836
f67539c2 2837 dispatch_peer_request(mdr);
7c673cae
FG
2838}

void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
{
  mds_rank_t from = mds_rank_t(m->get_source().num());

  if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
    metareqid_t r = m->get_reqid();
    if (!mdcache->have_uncommitted_leader(r, from)) {
      dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
               << from << " reqid " << r << dendl;
      return;
    }
    dout(3) << "not clientreplay|active yet, waiting" << dendl;
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
    metareqid_t r = m->get_reqid();
    mdcache->committed_leader_peer(r, from);
    return;
  }

  MDRequestRef mdr = mdcache->request_get(m->get_reqid());
  if (m->get_attempt() != mdr->attempt) {
    dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
             << m->get_attempt() << dendl;
    return;
  }

  switch (m->get_op()) {
  case MMDSPeerRequest::OP_XLOCKACK:
    {
      // identify lock, leader request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
                                               m->get_object_info());
      mdr->more()->peers.insert(from);
      lock->decode_locked_state(m->get_lock_data());
      dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
      mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
      mdr->finish_locking(lock);
      lock->get_xlock(mdr, mdr->get_client());

      ceph_assert(mdr->more()->waiting_on_peer.count(from));
      mdr->more()->waiting_on_peer.erase(from);
      ceph_assert(mdr->more()->waiting_on_peer.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSPeerRequest::OP_WRLOCKACK:
    {
      // identify lock, leader request
      SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
                                               m->get_object_info());
      mdr->more()->peers.insert(from);
      dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
      auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
      ceph_assert(it->is_remote_wrlock());
      ceph_assert(it->wrlock_target == from);

      mdr->finish_locking(lock);

      ceph_assert(mdr->more()->waiting_on_peer.count(from));
      mdr->more()->waiting_on_peer.erase(from);
      ceph_assert(mdr->more()->waiting_on_peer.empty());
      mdcache->dispatch_request(mdr);
    }
    break;

  case MMDSPeerRequest::OP_AUTHPINACK:
    handle_peer_auth_pin_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_LINKPREPACK:
    handle_peer_link_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RMDIRPREPACK:
    handle_peer_rmdir_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RENAMEPREPACK:
    handle_peer_rename_prep_ack(mdr, m);
    break;

  case MMDSPeerRequest::OP_RENAMENOTIFYACK:
    handle_peer_rename_notify_ack(mdr, m);
    break;

  default:
    ceph_abort_msg("unknown op " + to_string(m->get_op()) + " requested");
  }
}

void Server::dispatch_peer_request(MDRequestRef& mdr)
{
  dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;

  if (mdr->aborted) {
    dout(7) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
    return;
  }

  if (logger) logger->inc(l_mdss_dispatch_peer_request);

  int op = mdr->peer_request->get_op();
  switch (op) {
  case MMDSPeerRequest::OP_XLOCK:
  case MMDSPeerRequest::OP_WRLOCK:
    {
      // identify object
      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
                                               mdr->peer_request->get_object_info());

      if (!lock) {
        dout(10) << "don't have object, dropping" << dendl;
        ceph_abort_msg("don't have object"); // can this happen if we auth pinned properly?
      }
      if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
        dout(10) << "not auth for remote xlock attempt, dropping on "
                 << *lock << " on " << *lock->get_parent() << dendl;
      } else {
        // use acquire_locks so that we get auth_pinning.
        MutationImpl::LockOpVec lov;
        for (const auto& p : mdr->locks) {
          if (p.is_xlock())
            lov.add_xlock(p.lock);
          else if (p.is_wrlock())
            lov.add_wrlock(p.lock);
        }

        int replycode = 0;
        switch (op) {
        case MMDSPeerRequest::OP_XLOCK:
          lov.add_xlock(lock);
          replycode = MMDSPeerRequest::OP_XLOCKACK;
          break;
        case MMDSPeerRequest::OP_WRLOCK:
          lov.add_wrlock(lock);
          replycode = MMDSPeerRequest::OP_WRLOCKACK;
          break;
        }

        if (!mds->locker->acquire_locks(mdr, lov))
          return;

        // ack
        auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
        r->set_lock_type(lock->get_type());
        lock->get_parent()->set_object_info(r->get_object_info());
        if (replycode == MMDSPeerRequest::OP_XLOCKACK)
          lock->encode_locked_state(r->get_lock_data());
        mds->send_message(r, mdr->peer_request->get_connection());
      }

      // done.
      mdr->reset_peer_request();
    }
    break;

  case MMDSPeerRequest::OP_UNXLOCK:
  case MMDSPeerRequest::OP_UNWRLOCK:
    {
      SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
                                               mdr->peer_request->get_object_info());
      ceph_assert(lock);
      auto it = mdr->locks.find(lock);
      ceph_assert(it != mdr->locks.end());
      bool need_issue = false;
      switch (op) {
      case MMDSPeerRequest::OP_UNXLOCK:
        mds->locker->xlock_finish(it, mdr.get(), &need_issue);
        break;
      case MMDSPeerRequest::OP_UNWRLOCK:
        mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
        break;
      }
      if (need_issue)
        mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));

      // done. no ack necessary.
      mdr->reset_peer_request();
    }
    break;

  case MMDSPeerRequest::OP_AUTHPIN:
    handle_peer_auth_pin(mdr);
    break;

  case MMDSPeerRequest::OP_LINKPREP:
  case MMDSPeerRequest::OP_UNLINKPREP:
    handle_peer_link_prep(mdr);
    break;

  case MMDSPeerRequest::OP_RMDIRPREP:
    handle_peer_rmdir_prep(mdr);
    break;

  case MMDSPeerRequest::OP_RENAMEPREP:
    handle_peer_rename_prep(mdr);
    break;

  default:
    ceph_abort_msg("unknown op " + to_string(op) + " received");
  }
}

void Server::handle_peer_auth_pin(MDRequestRef& mdr)
{
  dout(10) << "handle_peer_auth_pin " << *mdr << dendl;

  // build list of objects
  list<MDSCacheObject*> objects;
  CInode *auth_pin_freeze = NULL;
  bool nonblocking = mdr->peer_request->is_nonblocking();
  bool fail = false, wouldblock = false, readonly = false;
  ref_t<MMDSPeerRequest> reply;

  if (mdcache->is_readonly()) {
    dout(10) << " read-only FS" << dendl;
    readonly = true;
    fail = true;
  }

  if (!fail) {
    for (const auto &oi : mdr->peer_request->get_authpins()) {
      MDSCacheObject *object = mdcache->get_object(oi);
      if (!object) {
        dout(10) << " don't have " << oi << dendl;
        fail = true;
        break;
      }

      objects.push_back(object);
      if (oi == mdr->peer_request->get_authpin_freeze())
        auth_pin_freeze = static_cast<CInode*>(object);
    }
  }

  // can we auth pin them?
  if (!fail) {
    for (const auto& obj : objects) {
      if (!obj->is_auth()) {
        dout(10) << " not auth for " << *obj << dendl;
        fail = true;
        break;
      }
      if (mdr->is_auth_pinned(obj))
        continue;
      if (!mdr->can_auth_pin(obj)) {
        if (nonblocking) {
          dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
          fail = true;
          wouldblock = true;
          break;
        }
        // wait
        dout(10) << " waiting for authpinnable on " << *obj << dendl;
        obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
        mdr->drop_local_auth_pins();

        mds->locker->notify_freeze_waiter(obj);
        goto blocked;
      }
    }
  }

  if (!fail) {
    /* freeze authpin wrong inode */
    if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
        mdr->more()->rename_inode != auth_pin_freeze)
      mdr->unfreeze_auth_pin(true);

    /* handle_peer_rename_prep() calls freeze_inode() to wait for all other
     * operations on the source inode to complete. This happens after all locks
     * for the rename operation are acquired. But to acquire locks, we must
     * first auth pin the locks' parent objects. So there is an ABBA deadlock
     * if someone auth pins the source inode after the locks are acquired and
     * before Server::handle_peer_rename_prep() is called. The solution is to
     * freeze the inode, which prevents other MDRequests from taking new
     * auth pins.
     */
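    /* A minimal sketch of the interleaving this guards against (hypothetical
     * requests A and B; illustrative, not part of the original source):
     *
     *   A: auth_pin(parents) -> acquire_locks(...)       -> freeze_inode(src)
     *   B:                       auth_pin(src)  <-- slips in between A's steps
     *
     * A's freeze_inode() then waits for B's auth pin to go away while B waits
     * for A's locks: ABBA. Freezing src here rejects new auth pins and breaks
     * the cycle.
     */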
    if (auth_pin_freeze) {
      dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
      if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
        auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
        mds->mdlog->flush();
        goto blocked;
      }
    }
  }

  reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);

  if (fail) {
    mdr->drop_local_auth_pins();  // just in case
    if (readonly)
      reply->mark_error_rofs();
    if (wouldblock)
      reply->mark_error_wouldblock();
  } else {
    // auth pin!
    for (const auto& obj : objects) {
      dout(10) << "auth_pinning " << *obj << dendl;
      mdr->auth_pin(obj);
    }
    // return list of my auth_pins (if any)
    for (const auto &p : mdr->object_states) {
      if (!p.second.auth_pinned)
        continue;
      MDSCacheObjectInfo info;
      p.first->set_object_info(info);
      reply->get_authpins().push_back(info);
      if (p.first == (MDSCacheObject*)auth_pin_freeze)
        auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
    }
  }

  mds->send_message_mds(reply, mdr->peer_to_mds);

  // clean up this request
  mdr->reset_peer_request();
  return;

blocked:
  if (mdr->peer_request->should_notify_blocking()) {
    reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
    reply->mark_req_blocked();
    mds->send_message_mds(reply, mdr->peer_to_mds);
    mdr->peer_request->clear_notify_blocking();
  }
  return;
}

void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
{
  dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  if (ack->is_req_blocked()) {
    mdr->disable_lock_cache();
    // peer auth pin is blocked, drop locks to avoid deadlock
    mds->locker->drop_locks(mdr.get(), nullptr);
    return;
  }

  // added auth pins?
  set<MDSCacheObject*> pinned;
  for (const auto &oi : ack->get_authpins()) {
    MDSCacheObject *object = mdcache->get_object(oi);
    ceph_assert(object);  // we pinned it
    dout(10) << " remote has pinned " << *object << dendl;
    mdr->set_remote_auth_pinned(object, from);
    if (oi == ack->get_authpin_freeze())
      mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
    pinned.insert(object);
  }

  // removed frozen auth pin ?
  if (mdr->more()->is_remote_frozen_authpin &&
      ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
    auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
    ceph_assert(stat_p);
    if (stat_p->remote_auth_pinned == from) {
      mdr->more()->is_remote_frozen_authpin = false;
    }
  }

  // removed auth pins?
  for (auto& p : mdr->object_states) {
    if (p.second.remote_auth_pinned == MDS_RANK_NONE)
      continue;
    MDSCacheObject* object = p.first;
    if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
      dout(10) << " remote has unpinned " << *object << dendl;
      mdr->_clear_remote_auth_pinned(p.second);
    }
  }

  // note peer
  mdr->more()->peers.insert(from);

  // clear from waiting list
  auto ret = mdr->more()->waiting_on_peer.erase(from);
  ceph_assert(ret);

  if (ack->is_error_rofs()) {
    mdr->more()->peer_error = -CEPHFS_EROFS;
  } else if (ack->is_error_wouldblock()) {
    mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
  }

  // go again?
  if (mdr->more()->waiting_on_peer.empty())
    mdcache->dispatch_request(mdr);
  else
    dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
}


// ---------------------------------------
// HELPERS


/**
 * check whether we are permitted to complete a request
 *
 * Check whether we have permission to perform the operation specified
 * by mask on the given inode, based on the capability in the mdr's
 * session.
 */
bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
{
  if (mdr->session) {
    int r = mdr->session->check_access(
      in, mask,
      mdr->client_request->get_caller_uid(),
      mdr->client_request->get_caller_gid(),
      &mdr->client_request->get_caller_gid_list(),
      mdr->client_request->head.args.setattr.uid,
      mdr->client_request->head.args.setattr.gid);
    if (r < 0) {
      respond_to_request(mdr, r);
      return false;
    }
  }
  return true;
}
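
// A minimal usage sketch (illustrative, hypothetical caller): on failure
// check_access() has already replied to the client, so handlers simply
// bail out.
//
//   CInode *in = rdlock_path_pin_ref(mdr, true);
//   if (!in)
//     return;
//   if (!check_access(mdr, in, MAY_WRITE))
//     return;  // the client has already been answered
//   // ... proceed with the operation ...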

/**
 * check whether a dirfrag has reached its maximum size
 *
 */
bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
{
  const auto size = dir->get_frag_size();
  const auto max = bal_fragment_size_max;
  if (size >= max) {
    dout(10) << "fragment " << *dir << " size exceeds " << max << " (CEPHFS_ENOSPC)" << dendl;
    respond_to_request(mdr, -CEPHFS_ENOSPC);
    return false;
  } else {
    dout(20) << "fragment " << *dir << " size " << size << " < " << max << dendl;
  }

  return true;
}
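
// Usage sketch (illustrative): create-style paths call this on the target
// dirfrag before linking a new dentry; bal_fragment_size_max caches the
// configured fragment size limit.
//
//   if (!check_fragment_space(mdr, dn->get_dir()))
//     return;  // -CEPHFS_ENOSPC was already sent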

/**
 * check whether the number of entries in a directory has reached the maximum
 *
 */
bool Server::check_dir_max_entries(MDRequestRef &mdr, CDir *in)
{
  const uint64_t size = in->inode->get_projected_inode()->dirstat.nfiles +
                        in->inode->get_projected_inode()->dirstat.nsubdirs;
  if (dir_max_entries && size >= dir_max_entries) {
    dout(10) << "entries per dir " << *in << " size exceeds " << dir_max_entries << " (ENOSPC)" << dendl;
    respond_to_request(mdr, -ENOSPC);
    return false;
  }
  return true;
}
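
// Usage sketch (illustrative): called alongside check_fragment_space() in
// create-style paths; a dir_max_entries value of 0 disables the check.
//
//   if (!check_dir_max_entries(mdr, dir))
//     return;  // -ENOSPC was already sent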


CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
{
  string straydname;
  in->name_stray_dentry(straydname);

  CDentry *straydn = mdr->straydn;
  if (straydn) {
    ceph_assert(straydn->get_name() == straydname);
    return straydn;
  }
  CDir *straydir = mdcache->get_stray_dir(in);

  if (!mdr->client_request->is_replay() &&
      !check_fragment_space(mdr, straydir))
    return nullptr;

  straydn = straydir->lookup(straydname);
  if (!straydn) {
    if (straydir->is_frozen_dir()) {
      dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
      mds->locker->drop_locks(mdr.get());
      mdr->drop_local_auth_pins();
      straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      return nullptr;
    }
    straydn = straydir->add_null_dentry(straydname);
    straydn->mark_new();
  } else {
    ceph_assert(straydn->get_projected_linkage()->is_null());
  }

  straydn->state_set(CDentry::STATE_STRAY);
  mdr->straydn = straydn;
  mdr->pin(straydn);

  return straydn;
}

/** prepare_new_inode
 *
 * create a new inode. set c/m/atime. hit dir pop.
 */
CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
                                  const file_layout_t *layout)
{
  CInode *in = new CInode(mdcache);
  auto _inode = in->_get_inode();

  // Server::prepare_force_open_sessions() can re-open session in closing
  // state. In that corner case, session's prealloc_inos are being freed.
  // To simplify the code, we disallow using/refilling session's prealloc_ino
  // while session is opening.
  bool allow_prealloc_inos = mdr->session->is_open();

  inodeno_t _useino = useino;

  // assign ino
  do {
    if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(_useino))) {
      if (mdcache->test_and_clear_taken_inos(_inode->ino)) {
        _inode->ino = 0;
        dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
                 << " (" << mdr->session->info.prealloc_inos.size() << " left)"
                 << " but has been taken, will try again!" << dendl;
      } else {
        mds->sessionmap.mark_projected(mdr->session);
        dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
                 << " (" << mdr->session->info.prealloc_inos.size() << " left)"
                 << dendl;
      }
    } else {
      mdr->alloc_ino =
        _inode->ino = mds->inotable->project_alloc_id(_useino);
      if (mdcache->test_and_clear_taken_inos(_inode->ino)) {
        mds->inotable->apply_alloc_id(_inode->ino);
        _inode->ino = 0;
        dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino
                 << " but has been taken, will try again!" << dendl;
      } else {
        dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
      }
    }
    _useino = 0;
  } while (!_inode->ino);
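
  // (The loop above prefers a session-preallocated ino and falls back to the
  // InoTable allocator; either candidate is discarded and allocation retried
  // if mdcache reports the ino as already taken.)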

  if (useino && useino != _inode->ino) {
    dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
    mds->clog->error() << mdr->client_request->get_source()
                       << " specified ino " << useino
                       << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
    //ceph_abort(); // just for now.
  }

  if (allow_prealloc_inos &&
      mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
    int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
    mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
    ceph_assert(mdr->prealloc_inos.size());  // or else fix projected increment semantics
    mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
    mds->sessionmap.mark_projected(mdr->session);
    dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
  }

  _inode->version = 1;
  _inode->xattr_version = 1;
  _inode->nlink = 1;  // FIXME

  _inode->mode = mode;

  // FIPS zeroization audit 20191117: this memset is not security related.
  memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
  if (_inode->is_dir()) {
    _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
  } else if (layout) {
    _inode->layout = *layout;
  } else {
    _inode->layout = mdcache->default_file_layout;
  }

  _inode->truncate_size = -1ull;  // not truncated, yet!
  _inode->truncate_seq = 1;  /* starting with 1, 0 is kept for no-truncation logic */

  CInode *diri = dir->get_inode();
  auto pip = diri->get_projected_inode();

  dout(10) << oct << " dir mode 0" << pip->mode << " new mode 0" << mode << dec << dendl;

  if (pip->mode & S_ISGID) {
    dout(10) << " dir is setgid" << dendl;
    _inode->gid = pip->gid;
    if (S_ISDIR(mode)) {
      dout(10) << " new dir also setgid" << dendl;
      _inode->mode |= S_ISGID;
    }
  } else {
    _inode->gid = mdr->client_request->get_owner_gid();
    ceph_assert(_inode->gid != (unsigned)-1);
  }

  _inode->uid = mdr->client_request->get_owner_uid();
  ceph_assert(_inode->uid != (unsigned)-1);

  _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
    mdr->get_op_stamp();

  _inode->change_attr = 0;

  const cref_t<MClientRequest> &req = mdr->client_request;

  dout(10) << "copying fscrypt_auth len " << req->fscrypt_auth.size() << dendl;
  _inode->fscrypt_auth = req->fscrypt_auth;
  _inode->fscrypt_file = req->fscrypt_file;

  if (req->get_data().length()) {
    auto p = req->get_data().cbegin();

    // xattrs on new inode?
    auto _xattrs = CInode::allocate_xattr_map();
    decode_noshare(*_xattrs, p);
    dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
    in->reset_xattrs(std::move(_xattrs));
  }

  if (!mds->mdsmap->get_inline_data_enabled() ||
      !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
    _inode->inline_data.version = CEPH_INLINE_NONE;

  mdcache->add_inode(in);  // add
  dout(10) << "prepare_new_inode " << *in << dendl;
  return in;
}
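
// Usage sketch (illustrative, simplified from a create-style handler; the
// journal event `le` and its metablob are hypothetical here):
//
//   CInode *newi = prepare_new_inode(mdr, dn->get_dir(),
//                                    inodeno_t(req->head.ino), mode, &layout);
//   journal_allocated_inos(mdr, &le->metablob);  // then journal and reply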

void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
{
  dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
           << " inotablev " << mds->inotable->get_projected_version()
           << dendl;
  blob->set_ino_alloc(mdr->alloc_ino,
                      mdr->used_prealloc_ino,
                      mdr->prealloc_inos,
                      mdr->client_request->get_source(),
                      mds->sessionmap.get_projected(),
                      mds->inotable->get_projected_version());
}

void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
{
  dout(10) << "apply_allocated_inos " << mdr->alloc_ino
           << " / " << mdr->prealloc_inos
           << " / " << mdr->used_prealloc_ino << dendl;

  if (mdr->alloc_ino) {
    mds->inotable->apply_alloc_id(mdr->alloc_ino);
  }
  if (mdr->prealloc_inos.size()) {
    ceph_assert(session);
    session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
    session->free_prealloc_inos.insert(mdr->prealloc_inos);
    session->info.prealloc_inos.insert(mdr->prealloc_inos);
    mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
    mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
  }
  if (mdr->used_prealloc_ino) {
    ceph_assert(session);
    session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
    mds->sessionmap.mark_dirty(session);
  }
}

struct C_MDS_TryOpenInode : public ServerContext {
  MDRequestRef mdr;
  inodeno_t ino;
  C_MDS_TryOpenInode(Server *s, MDRequestRef& r, inodeno_t i) :
    ServerContext(s), mdr(r), ino(i) {}
  void finish(int r) override {
    server->_try_open_ino(mdr, r, ino);
  }
};

void Server::_try_open_ino(MDRequestRef& mdr, int r, inodeno_t ino)
{
  dout(10) << "_try_open_ino " << mdr.get() << " ino " << ino << " r=" << r << dendl;

  // `r` is a rank if >=0, else an error code
  if (r >= 0) {
    mds_rank_t dest_rank(r);
    if (dest_rank == mds->get_nodeid())
      dispatch_client_request(mdr);
    else
      mdcache->request_forward(mdr, dest_rank);
    return;
  }

  // give up
  if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
    r = -CEPHFS_ESTALE;
  respond_to_request(mdr, r);
}

class C_MDS_TryFindInode : public ServerContext {
  MDRequestRef mdr;
  MDCache *mdcache;
  inodeno_t ino;
public:
  C_MDS_TryFindInode(Server *s, MDRequestRef& r, MDCache *m, inodeno_t i) :
    ServerContext(s), mdr(r), mdcache(m), ino(i) {}
  void finish(int r) override {
    if (r == -CEPHFS_ESTALE) {  // :( find_ino_peers failed
      /*
       * One known case: if the MDS crashes before the openfiletable journal
       * can be flushed, the replacing MDS may not load some already-opened
       * CInodes into the MDCache. When clients retry requests after
       * reconnecting, the MDS would return -ESTALE after failing to find
       * the ino among all active peers.
       *
       * As a workaround, users can run `ls -R ${mountpoint}` to list all
       * the sub-files and sub-directories from the mountpoint.
       *
       * Here we instead try to open the ino and then retry the request.
       */
      CInode *in = mdcache->get_inode(ino);
      if (in && in->state_test(CInode::STATE_PURGING))
        server->respond_to_request(mdr, r);
      else
        mdcache->open_ino(ino, (int64_t)-1, new C_MDS_TryOpenInode(server, mdr, ino));
    } else {
      server->dispatch_client_request(mdr);
    }
  }
};

/* If this returns null, the request has been handled
 * as appropriate: forwarded on, or the client's been replied to */
CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
                                    bool want_auth,
                                    bool no_want_auth)
{
  const filepath& refpath = mdr->get_filepath();
  dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;

  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
    return mdr->in[0];

  // traverse
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
  int flags = 0;
  if (refpath.is_last_snap()) {
    if (!no_want_auth)
      want_auth = true;
  } else {
    if (!no_want_auth && forward_all_requests_to_auth)
      want_auth = true;
    flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
  }
  if (want_auth)
    flags |= MDS_TRAVERSE_WANT_AUTH;
  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
  if (r > 0)
    return nullptr;  // delayed
  if (r < 0) {  // error
    if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
      if (mdr->client_request &&
          mdr->client_request->get_dentry_wanted())
        mdr->tracedn = mdr->dn[0].back();
      respond_to_request(mdr, r);
    } else if (r == -CEPHFS_ESTALE) {
      dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
      inodeno_t ino = refpath.get_ino();
      mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
    } else {
      dout(10) << "FAIL on error " << r << dendl;
      respond_to_request(mdr, r);
    }
    return nullptr;
  }
  CInode *ref = mdr->in[0];
  dout(10) << "ref is " << *ref << dendl;

  if (want_auth) {
    // auth_pin?
    //   do NOT proceed if freezing, as cap release may defer in that case, and
    //   we could deadlock when we try to lock @ref.
    // if we're already auth_pinned, continue; the release has already been processed.
    if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
        (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
      dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
      ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
      if (mdr->is_any_remote_auth_pin())
        mds->locker->notify_freeze_waiter(ref);
      return nullptr;
    }
    mdr->auth_pin(ref);
  }

  // set and pin ref
  mdr->pin(ref);
  return ref;
}


/** rdlock_path_xlock_dentry
 * traverse the path to the directory that could/would contain the dentry.
 * make sure i am auth for that dentry (or for the target inode, if it exists
 * and authexist is set), forwarding as necessary. create a null dentry in
 * place (or use the existing one if okexist). get rdlocks on the traversed
 * dentries and an xlock on the new dentry.
 *
 * set authexist to true if the caller requires the target inode to be auth
 * when it exists. with authexist the tail dentry is no longer guaranteed to
 * be auth, because it is impossible to ensure that the tail dentry and the
 * target inode are both auth on one mds. likewise, the tail dentry is not
 * xlocked if authexist is set and the target inode exists.
 */
CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
                                          bool create, bool okexist, bool authexist,
                                          bool want_layout)
{
  const filepath& refpath = mdr->get_filepath();
  dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;

  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
    return mdr->dn[0].back();

  // figure parent dir vs dname
  if (refpath.depth() == 0) {
    dout(7) << "invalid path (zero length)" << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return nullptr;
  }

  if (refpath.is_last_snap()) {
    respond_to_request(mdr, -CEPHFS_EROFS);
    return nullptr;
  }

  if (refpath.is_last_dot_or_dotdot()) {
    dout(7) << "invalid path (last dot or dot_dot)" << dendl;
    if (create)
      respond_to_request(mdr, -CEPHFS_EEXIST);
    else
      respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
    return nullptr;
  }

  // traverse to parent dir
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
  int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
              MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
              MDS_TRAVERSE_WANT_AUTH;
  if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
    flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
  if (create)
    flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
  if (authexist)
    flags |= MDS_TRAVERSE_WANT_INODE;
  if (want_layout)
    flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
  if (r > 0)
    return nullptr;  // delayed
  if (r < 0) {
    if (r == -CEPHFS_ESTALE) {
      dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
      inodeno_t ino = refpath.get_ino();
      mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
      return nullptr;
    }
    respond_to_request(mdr, r);
    return nullptr;
  }

  CDentry *dn = mdr->dn[0].back();
  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();

  if (!mdr->reqid.name.is_mds()) {
    if (diri->is_system() && !diri->is_root() &&
        (!diri->is_lost_and_found() ||
         mdr->client_request->get_op() != CEPH_MDS_OP_UNLINK)) {
      respond_to_request(mdr, -CEPHFS_EROFS);
      return nullptr;
    }
  }

  if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return nullptr;
  }

  CDentry::linkage_t *dnl = dn->get_projected_linkage();
  if (dnl->is_null()) {
    if (!create && okexist) {
      respond_to_request(mdr, -CEPHFS_ENOENT);
      return nullptr;
    }

    snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
    dn->first = std::max(dn->first, next_snap);
  } else {
    if (!okexist) {
      respond_to_request(mdr, -CEPHFS_EEXIST);
      return nullptr;
    }
    mdr->in[0] = dnl->get_inode();
  }

  return dn;
}
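
// Usage sketch (illustrative, hypothetical create-style caller):
//
//   CDentry *dn = rdlock_path_xlock_dentry(mdr, true /* create */,
//                                          false /* okexist */);
//   if (!dn)
//     return;  // forwarded, delayed, or already replied to
//   // here dn is xlocked and its containing dirfrag is auth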

/** rdlock_two_paths_xlock_destdn
 * traverse two paths and lock the two paths in proper order.
 * The order of taking locks is:
 * 1. Lock directory inodes or dentries according to which trees they
 *    are under. Lock objects under fs root before objects under mdsdir.
 * 2. Lock directory inodes or dentries according to their depth, in
 *    ascending order.
 * 3. Lock directory inodes or dentries according to inode numbers or
 *    dentries' parent inode numbers, in ascending order.
 * 4. Lock dentries in the same directory in order of their keys.
 * 5. Lock non-directory inodes according to inode numbers, in ascending
 *    order.
 */
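/* Illustrative example of the rules above (not from the original source):
 * when both dentries live in the same directory, rule 4 applies and the
 * dentry with the smaller key is xlocked first (see the get_name()
 * comparisons below); across directories, mdr->compare_paths() and the
 * inode numbers decide which side is locked first.
 */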
std::pair<CDentry*, CDentry*>
Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
{
  const filepath& refpath = mdr->get_filepath();
  const filepath& refpath2 = mdr->get_filepath2();

  dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;

  if (mdr->locking_state & MutationImpl::PATH_LOCKED)
    return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());

  if (refpath.depth() != 1 || refpath2.depth() != 1) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
  }

  if (refpath.is_last_snap() || refpath2.is_last_snap()) {
    respond_to_request(mdr, -CEPHFS_EROFS);
    return std::make_pair(nullptr, nullptr);
  }

  // traverse to parent dir
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
  int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
  int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
  if (r != 0) {
    if (r == -CEPHFS_ESTALE) {
      dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
      inodeno_t ino = refpath.get_ino();
      mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
    } else if (r < 0) {
      respond_to_request(mdr, r);
    }
    return std::make_pair(nullptr, nullptr);
  }

  flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
  r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
  if (r != 0) {
    if (r == -CEPHFS_ESTALE) {
      dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
      inodeno_t ino = refpath2.get_ino();
      mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
    } else if (r < 0) {
      respond_to_request(mdr, r);
    }
    return std::make_pair(nullptr, nullptr);
  }

  CDentry *srcdn = mdr->dn[1].back();
  CDir *srcdir = srcdn->get_dir();
  CDentry *destdn = mdr->dn[0].back();
  CDir *destdir = destdn->get_dir();

  if (!mdr->reqid.name.is_mds()) {
    if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
        (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
      respond_to_request(mdr, -CEPHFS_EROFS);
      return std::make_pair(nullptr, nullptr);
    }
  }

  if (!destdir->get_inode()->is_base() &&
      destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return std::make_pair(nullptr, nullptr);
  }

  MutationImpl::LockOpVec lov;
  if (srcdir->get_inode() == destdir->get_inode()) {
    lov.add_wrlock(&destdir->inode->filelock);
    lov.add_wrlock(&destdir->inode->nestlock);
    if (xlock_srcdn && srcdir != destdir) {
      mds_rank_t srcdir_auth = srcdir->authority().first;
      if (srcdir_auth != mds->get_nodeid()) {
        lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
        lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
      }
    }

    if (srcdn->get_name() > destdn->get_name())
      lov.add_xlock(&destdn->lock);

    if (xlock_srcdn)
      lov.add_xlock(&srcdn->lock);
    else
      lov.add_rdlock(&srcdn->lock);

    if (srcdn->get_name() < destdn->get_name())
      lov.add_xlock(&destdn->lock);
  } else {
    int cmp = mdr->compare_paths();
    bool lock_destdir_first =
      (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));

    if (lock_destdir_first) {
      lov.add_wrlock(&destdir->inode->filelock);
      lov.add_wrlock(&destdir->inode->nestlock);
      lov.add_xlock(&destdn->lock);
    }

    if (xlock_srcdn) {
      mds_rank_t srcdir_auth = srcdir->authority().first;
      if (srcdir_auth == mds->get_nodeid()) {
        lov.add_wrlock(&srcdir->inode->filelock);
        lov.add_wrlock(&srcdir->inode->nestlock);
      } else {
        lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
        lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
      }
      lov.add_xlock(&srcdn->lock);
    } else {
      lov.add_rdlock(&srcdn->lock);
    }

    if (!lock_destdir_first) {
      lov.add_wrlock(&destdir->inode->filelock);
      lov.add_wrlock(&destdir->inode->nestlock);
      lov.add_xlock(&destdn->lock);
    }
  }

  CInode *auth_pin_freeze = nullptr;
  // XXX any better way to do this?
  if (xlock_srcdn && !srcdn->is_auth()) {
    CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
    auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
  }
  if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
    return std::make_pair(nullptr, nullptr);

  if (srcdn->get_projected_linkage()->is_null()) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return std::make_pair(nullptr, nullptr);
  }

  if (destdn->get_projected_linkage()->is_null()) {
    snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
    destdn->first = std::max(destdn->first, next_snap);
  }

  mdr->locking_state |= MutationImpl::PATH_LOCKED;

  return std::make_pair(destdn, srcdn);
}

/**
 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
 *
 * @param diri base inode
 * @param fg the exact frag we want
 * @param mdr request
 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
 */
CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
{
  CDir *dir = diri->get_dirfrag(fg);

  if (dir) {
    // am i auth for the dirfrag?
    if (!dir->is_auth()) {
      mds_rank_t auth = dir->authority().first;
      dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
              << ", fw to mds." << auth << dendl;
      mdcache->request_forward(mdr, auth);
      return nullptr;
    }
  } else {
    // not open and inode not mine?
    if (!diri->is_auth()) {
      mds_rank_t inauth = diri->authority().first;
      dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
      mdcache->request_forward(mdr, inauth);
      return nullptr;
    }

    // not open and inode frozen?
    if (diri->is_frozen()) {
      dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
      ceph_assert(diri->get_parent_dir());
      diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
      return nullptr;
    }

    // invent?
    dir = diri->get_or_open_dirfrag(mdcache, fg);
  }

  return dir;
}


// ===============================================================================
// STAT

void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  if (req->get_filepath().depth() == 0 && is_lookup) {
    // refpath can't be empty for lookup but it can for
    // getattr (we do getattr with empty refpath for mount of '/')
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  bool want_auth = false;
  int mask = req->head.args.getattr.mask;
  if (mask & CEPH_STAT_RSTAT)
    want_auth = true;  // set want_auth for CEPH_STAT_RSTAT mask

  if (!mdr->is_batch_head() && mdr->can_batch()) {
    CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
    int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
                                   (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
                                   &mdr->dn[0], &mdr->in[0]);
    if (r > 0)
      return;  // delayed

    if (r < 0) {
      // fall-thru. let rdlock_path_pin_ref() check again.
    } else if (is_lookup) {
      CDentry* dn = mdr->dn[0].back();
      mdr->pin(dn);
      auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
      if (em.second) {
        em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
      } else {
        dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
        em.first->second->add_request(mdr);
        mdr->mark_event("joining batch lookup");
        return;
      }
    } else {
      CInode *in = mdr->in[0];
      mdr->pin(in);
      auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
      if (em.second) {
        em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
      } else {
        dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
        em.first->second->add_request(mdr);
        mdr->mark_event("joining batch getattr");
        return;
      }
    }
  }

  CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
  if (!ref)
    return;

  /*
   * if client currently holds the EXCL cap on a field, do not rdlock
   * it; client's stat() will result in valid info if _either_ EXCL
   * cap is held or MDS rdlocks and reads the value here.
   *
   * handling this case here is easier than weakening rdlock
   * semantics... that would cause problems elsewhere.
   */
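  /* Sketch of the rule above (illustrative): if the client holds, say,
   * CEPH_CAP_AUTH_EXCL, its cached uid/gid/mode are at least as fresh as
   * ours, so the authlock rdlock is skipped below instead of forcing a cap
   * revoke just to serve stat().
   */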
  client_t client = mdr->get_client();
  int issued = 0;
  Capability *cap = ref->get_client_cap(client);
  if (cap && (mdr->snapid == CEPH_NOSNAP ||
              mdr->snapid <= cap->client_follows))
    issued = cap->issued();

  // FIXME
  MutationImpl::LockOpVec lov;
  if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
    lov.add_rdlock(&ref->linklock);
  if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
    lov.add_rdlock(&ref->authlock);
  if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
    lov.add_rdlock(&ref->xattrlock);
  if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
    // Don't wait on unstable filelock if client is allowed to read file size.
    // This can reduce the response time of getattr in the case that multiple
    // clients do stat(2) and there are writers.
    // The downside of this optimization is that mds may not issue Fs caps along
    // with getattr reply. Client may need to send more getattr requests.
    if (mdr->is_rdlocked(&ref->filelock)) {
      lov.add_rdlock(&ref->filelock);
    } else if (ref->filelock.is_stable() ||
               ref->filelock.get_num_wrlocks() > 0 ||
               !ref->filelock.can_read(mdr->get_client())) {
      /* Since we're taking advantage of an optimization here:
       *
       * We cannot suddenly, due to a changing condition, add this filelock as
       * it can cause lock-order deadlocks. In this case, that condition is the
       * lock state changes between request retries. If that happens, we need
       * to check if we've acquired the other locks in this vector. If we have,
       * then we need to drop those locks and retry.
       */
      if (mdr->is_rdlocked(&ref->linklock) ||
          mdr->is_rdlocked(&ref->authlock) ||
          mdr->is_rdlocked(&ref->xattrlock)) {
        /* start over */
        dout(20) << " dropping locks and restarting request because filelock state change" << dendl;
        mds->locker->drop_locks(mdr.get());
        mdr->drop_local_auth_pins();
        mds->queue_waiter(new C_MDS_RetryRequest(mdcache, mdr));
        return;
      }
      lov.add_rdlock(&ref->filelock);
      mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
    }
  }

  if (!mds->locker->acquire_locks(mdr, lov))
    return;

  if (!check_access(mdr, ref, MAY_READ))
    return;

  utime_t now = ceph_clock_now();
  mdr->set_mds_stamp(now);

  // note which caps are requested, so we return at least a snapshot
  // value for them. (currently this matters for xattrs and inline data)
  mdr->getattr_caps = mask;

  mds->balancer->hit_inode(ref, META_POP_IRD);

  // reply
  dout(10) << "reply to stat on " << *req << dendl;
  mdr->tracei = ref;
  if (is_lookup)
    mdr->tracedn = mdr->dn[0].back();
  respond_to_request(mdr, 0);
}

struct C_MDS_LookupIno2 : public ServerContext {
  MDRequestRef mdr;
  C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
  void finish(int r) override {
    server->_lookup_ino_2(mdr, r);
  }
};

/*
 * filepath: ino
 */
void Server::handle_client_lookup_ino(MDRequestRef& mdr,
                                      bool want_parent, bool want_dentry)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  if ((uint64_t)req->head.args.lookupino.snapid > 0)
    return _lookup_snap_ino(mdr);

  inodeno_t ino = req->get_filepath().get_ino();
  auto _ino = ino.val;

  /* It's been observed [1] that a client may lookup a private ~mdsdir inode.
   * I do not have an explanation for how that happened organically but this
   * check will ensure that the client can no longer do that.
   *
   * [1] https://tracker.ceph.com/issues/49922
   */
  if (MDS_IS_PRIVATE_INO(_ino)) {
    respond_to_request(mdr, -CEPHFS_ESTALE);
    return;
  }

  CInode *in = mdcache->get_inode(ino);
  if (in && in->state_test(CInode::STATE_PURGING)) {
    respond_to_request(mdr, -CEPHFS_ESTALE);
    return;
  }
  if (!in) {
    mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
    return;
  }

  // check for nothing (not read or write); this still applies the
  // path check.
  if (!check_access(mdr, in, 0))
    return;

  CDentry *dn = in->get_projected_parent_dn();
  CInode *diri = dn ? dn->get_dir()->inode : NULL;

  MutationImpl::LockOpVec lov;
  if (dn && (want_parent || want_dentry)) {
    mdr->pin(dn);
    lov.add_rdlock(&dn->lock);
  }

  unsigned mask = req->head.args.lookupino.mask;
  if (mask) {
    Capability *cap = in->get_client_cap(mdr->get_client());
    int issued = 0;
    if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
      issued = cap->issued();
    // FIXME
    // permission bits, ACL/security xattrs
    if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
      lov.add_rdlock(&in->authlock);
    if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
      lov.add_rdlock(&in->xattrlock);

    mdr->getattr_caps = mask;
  }

  if (!lov.empty()) {
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    if (diri != NULL) {
      // need read access to directory inode
      if (!check_access(mdr, diri, MAY_READ))
        return;
    }
  }

  if (want_parent) {
    if (in->is_base()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
      return;
    }
    if (!diri || diri->is_stray()) {
      respond_to_request(mdr, -CEPHFS_ESTALE);
      return;
    }
    dout(10) << "reply to lookup_parent " << *in << dendl;
    mdr->tracei = diri;
    respond_to_request(mdr, 0);
  } else {
    if (want_dentry) {
      inodeno_t dirino = req->get_filepath2().get_ino();
      if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
        respond_to_request(mdr, -CEPHFS_ENOENT);
        return;
      }
      dout(10) << "reply to lookup_name " << *in << dendl;
    } else
      dout(10) << "reply to lookup_ino " << *in << dendl;

    mdr->tracei = in;
    if (want_dentry)
      mdr->tracedn = dn;
    respond_to_request(mdr, 0);
  }
}

void Server::_lookup_snap_ino(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  vinodeno_t vino;
  vino.ino = req->get_filepath().get_ino();
  vino.snapid = (__u64)req->head.args.lookupino.snapid;
  inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
  __u32 hash = req->head.args.lookupino.hash;

  dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;

  CInode *in = mdcache->lookup_snap_inode(vino);
  if (!in) {
    in = mdcache->get_inode(vino.ino);
    if (in) {
      if (in->state_test(CInode::STATE_PURGING) ||
          !in->has_snap_data(vino.snapid)) {
        if (in->is_dir() || !parent_ino) {
          respond_to_request(mdr, -CEPHFS_ESTALE);
          return;
        }
        in = NULL;
      }
    }
  }

  if (in) {
    dout(10) << "reply to lookup_snap_ino " << *in << dendl;
    mdr->snapid = vino.snapid;
    mdr->tracei = in;
    respond_to_request(mdr, 0);
    return;
  }

  CInode *diri = NULL;
  if (parent_ino) {
    diri = mdcache->get_inode(parent_ino);
    if (!diri) {
      mdcache->open_ino(parent_ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
      return;
    }

    if (!diri->is_dir()) {
      respond_to_request(mdr, -CEPHFS_EINVAL);
      return;
    }

    MutationImpl::LockOpVec lov;
    lov.add_rdlock(&diri->dirfragtreelock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    frag_t frag = diri->dirfragtree[hash];
    CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
    if (!dir)
      return;

    if (!dir->is_complete()) {
      if (dir->is_frozen()) {
        mds->locker->drop_locks(mdr.get());
        mdr->drop_local_auth_pins();
        dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
        return;
      }
      dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
      return;
    }

    respond_to_request(mdr, -CEPHFS_ESTALE);
  } else {
    mdcache->open_ino(vino.ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
  }
}
4289
7c673cae
FG
4290void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4291{
4292 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4293 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4294
4295 // `r` is a rank if >=0, else an error code
4296 if (r >= 0) {
4297 mds_rank_t dest_rank(r);
4298 if (dest_rank == mds->get_nodeid())
4299 dispatch_client_request(mdr);
4300 else
4301 mdcache->request_forward(mdr, dest_rank);
4302 return;
4303 }
4304
4305 // give up
f67539c2
TL
4306 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
4307 r = -CEPHFS_ESTALE;
7c673cae
FG
4308 respond_to_request(mdr, r);
4309}
4310
4311
4312/* This function takes responsibility for the passed mdr*/
4313void Server::handle_client_open(MDRequestRef& mdr)
4314{
9f95a23c 4315 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
4316 dout(7) << "open on " << req->get_filepath() << dendl;
4317
4318 int flags = req->head.args.open.flags;
4319 int cmode = ceph_flags_to_mode(flags);
4320 if (cmode < 0) {
f67539c2 4321 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4322 return;
4323 }
4324
181888fb
FG
4325 bool need_auth = !file_mode_is_readonly(cmode) ||
4326 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
7c673cae
FG
4327
4328 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4329 dout(7) << "read-only FS" << dendl;
f67539c2 4330 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
4331 return;
4332 }
4333
9f95a23c 4334 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
7c673cae
FG
4335 if (!cur)
4336 return;
4337
4338 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
11fdf7f2 4339 ceph_assert(!need_auth);
9f95a23c
TL
4340 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4341 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4342 if (!cur)
4343 return;
4344 }
4345
f67539c2 4346 if (!cur->is_file()) {
7c673cae
FG
4347 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4348 cmode = CEPH_FILE_MODE_PIN;
4349 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
f67539c2 4350 if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
7c673cae
FG
4351 flags &= ~CEPH_O_TRUNC;
4352 }
4353
4354 dout(10) << "open flags = " << flags
4355 << ", filemode = " << cmode
4356 << ", need_auth = " << need_auth
4357 << dendl;
4358
4359 // regular file?
4360 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4361 dout(7) << "not a file or dir " << *cur << dendl;
f67539c2 4362 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
7c673cae
FG
4363 return;
4364 }*/
f67539c2 4365 if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
7c673cae 4366 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
f67539c2 4367 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4368 return;
4369 }
4370
f67539c2 4371 if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
7c673cae 4372 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
f67539c2
TL
4373 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4374 respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
7c673cae
FG
4375 return;
4376 }
4377
f67539c2 4378 if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
11fdf7f2 4379 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
7c673cae 4380 dout(7) << "old client cannot open inline data file " << *cur << dendl;
f67539c2 4381 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
4382 return;
4383 }
4384
4385 // snapped data is read only
4386 if (mdr->snapid != CEPH_NOSNAP &&
4387 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4388 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
4389 respond_to_request(mdr, -CEPHFS_EROFS);
4390 return;
4391 }
4392
4393 MutationImpl::LockOpVec lov;
4394 lov.add_rdlock(&cur->snaplock);
4395
4396 unsigned mask = req->head.args.open.mask;
4397 if (mask) {
4398 Capability *cap = cur->get_client_cap(mdr->get_client());
4399 int issued = 0;
4400 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4401 issued = cap->issued();
4402 // permission bits, ACL/security xattrs
4403 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4404 lov.add_rdlock(&cur->authlock);
4405 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4406 lov.add_rdlock(&cur->xattrlock);
4407
4408 mdr->getattr_caps = mask;
4409 }
4410
4411 // O_TRUNC
4412 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
4413 ceph_assert(cur->is_auth());
4414
4415 lov.add_xlock(&cur->filelock);
4416 if (!mds->locker->acquire_locks(mdr, lov))
4417 return;
4418
4419 if (!check_access(mdr, cur, MAY_WRITE))
4420 return;
4421
4422 // wait for pending truncate?
4423 const auto& pi = cur->get_projected_inode();
4424 if (pi->is_truncating()) {
4425 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4426 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4427 mds->locker->drop_locks(mdr.get());
4428 mdr->drop_local_auth_pins();
4429 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4430 return;
4431 }
4432
4433 do_open_truncate(mdr, cmode);
4434 return;
4435 }
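  // Note on the wait-and-retry idiom used above: MDS request handlers cannot
  // block, so when a handler must wait (here, for an in-progress truncate) it
  // drops its locks and auth pins, parks the request on the inode's waiter
  // list, and returns; C_MDS_RetryRequest later re-dispatches the request
  // from scratch. A minimal sketch of the pattern (taken from the code above):
  //
  //   mds->locker->drop_locks(mdr.get());
  //   mdr->drop_local_auth_pins();
  //   cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
  //   return;  // handler runs again once the waiter fires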
4436
4437 // sync filelock if snapped.
4438 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4439 // and that data itself is flushed so that we can read the snapped data off disk.
4440 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
4441 lov.add_rdlock(&cur->filelock);
4442 }
4443
4444 if (!mds->locker->acquire_locks(mdr, lov))
4445 return;
4446
4447 mask = MAY_READ;
4448 if (cmode & CEPH_FILE_MODE_WR)
4449 mask |= MAY_WRITE;
4450 if (!check_access(mdr, cur, mask))
4451 return;
4452
4453 utime_t now = ceph_clock_now();
4454 mdr->set_mds_stamp(now);
4455
4456 if (cur->is_file() || cur->is_dir()) {
4457 if (mdr->snapid == CEPH_NOSNAP) {
4458 // register new cap
4459 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
4460 if (cap)
4461 dout(12) << "open issued caps " << ccap_string(cap->pending())
4462 << " for " << req->get_source()
4463 << " on " << *cur << dendl;
4464 } else {
4465 int caps = ceph_caps_for_mode(cmode);
4466 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4467 << " for " << req->get_source()
4468 << " snapid " << mdr->snapid
4469 << " on " << *cur << dendl;
4470 mdr->snap_caps = caps;
4471 }
4472 }
4473
4474 // increase max_size?
4475 if (cmode & CEPH_FILE_MODE_WR)
4476 mds->locker->check_inode_max_size(cur);
4477
4478 // make sure this inode gets into the journal
4479 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
4480 mdcache->open_file_table.should_log_open(cur)) {
4481 EOpen *le = new EOpen(mds->mdlog);
4482 mdlog->start_entry(le);
4483 le->add_clean_inode(cur);
4484 mdlog->submit_entry(le);
4485 }
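  // Why journal an open of a clean inode at all: the EOpen event lets a
  // replacement MDS rebuild the set of inodes with outstanding client caps
  // after failover. should_log_open() is assumed to return false when the
  // open file table already covers this inode, avoiding redundant entries.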
4486
4487 // hit pop
4488 if (cmode & CEPH_FILE_MODE_WR)
4489 mds->balancer->hit_inode(cur, META_POP_IWR);
4490 else
4491 mds->balancer->hit_inode(cur, META_POP_IRD);
4492
4493 CDentry *dn = 0;
4494 if (req->get_dentry_wanted()) {
4495 ceph_assert(mdr->dn[0].size());
4496 dn = mdr->dn[0].back();
4497 }
4498
4499 mdr->tracei = cur;
4500 mdr->tracedn = dn;
4501 respond_to_request(mdr, 0);
4502}
4503
4504class C_MDS_openc_finish : public ServerLogContext {
4505 CDentry *dn;
4506 CInode *newi;
4507public:
4508 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4509 ServerLogContext(s, r), dn(d), newi(ni) {}
4510 void finish(int r) override {
4511 ceph_assert(r == 0);
4512
4513 // crash the current MDS here; the replacing MDS will then exercise journal replay
4514 ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable);
4515
4516 dn->pop_projected_linkage();
4517
4518 // dirty inode, dn, dir
4519 newi->mark_dirty(mdr->ls);
4520 newi->mark_dirty_parent(mdr->ls, true);
4521
4522 mdr->apply();
4523
4524 get_mds()->locker->share_inode_max_size(newi);
4525
4526 MDRequestRef null_ref;
4527 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4528
4529 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4530
4531 server->respond_to_request(mdr, 0);
4532
4533 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4534 }
4535};
4536
4537/* This function takes responsibility for the passed mdr*/
4538void Server::handle_client_openc(MDRequestRef& mdr)
4539{
4540 const cref_t<MClientRequest> &req = mdr->client_request;
4541 client_t client = mdr->get_client();
4542
4543 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4544
4545 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4546 if (cmode < 0) {
4547 respond_to_request(mdr, -CEPHFS_EINVAL);
4548 return;
4549 }
4550
4551 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4552 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true, true);
4553 if (!dn)
4554 return;
4555
4556 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4557 if (!excl && !dnl->is_null()) {
4558 // it existed.
4559 ceph_assert(mdr.get()->is_rdlocked(&dn->lock));
4560
4561 handle_client_open(mdr);
4562 return;
4563 }
4564
4565 ceph_assert(dnl->is_null());
4566
4567 if (req->get_alternate_name().size() > alternate_name_max) {
4568 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
4569 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
4570 return;
4571 }
4572 dn->set_alternate_name(req->get_alternate_name());
4573
4574 // set layout
4575 file_layout_t layout;
4576 if (mdr->dir_layout != file_layout_t())
4577 layout = mdr->dir_layout;
4578 else
4579 layout = mdcache->default_file_layout;
4580
4581 // What kind of client caps are required to complete this operation
4582 uint64_t access = MAY_WRITE;
4583
4584 const auto default_layout = layout;
4585
4586 // fill in any special params from client
4587 if (req->head.args.open.stripe_unit)
4588 layout.stripe_unit = req->head.args.open.stripe_unit;
4589 if (req->head.args.open.stripe_count)
4590 layout.stripe_count = req->head.args.open.stripe_count;
4591 if (req->head.args.open.object_size)
4592 layout.object_size = req->head.args.open.object_size;
4593 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4594 (__s32)req->head.args.open.pool >= 0) {
4595 layout.pool_id = req->head.args.open.pool;
4596
4597 // make sure we have as new a map as the client
4598 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4599 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4600 return;
4601 }
4602 }
4603
4604 // If client doesn't have capability to modify layout pools, then
4605 // only permit this request if the requested pool matches what the
4606 // file would have inherited anyway from its parent.
4607 if (default_layout != layout) {
4608 access |= MAY_SET_VXATTR;
4609 }
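  // In other words, a plain create inherits the parent layout under MAY_WRITE,
  // while any explicit override escalates to MAY_SET_VXATTR (the permission
  // gated by the 'p' MDS auth cap). An illustrative client-side trigger,
  // assuming the usual libcephfs ceph_open_layout() signature:
  //
  //   ceph_open_layout(cmount, "file", O_CREAT|O_WRONLY, 0644,
  //                    4194304 /* stripe_unit */, 1 /* stripe_count */,
  //                    4194304 /* object_size */, "cephfs_data" /* pool */);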
4610
4611 if (!layout.is_valid()) {
4612 dout(10) << " invalid initial file layout" << dendl;
4613 respond_to_request(mdr, -CEPHFS_EINVAL);
4614 return;
4615 }
4616 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4617 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4618 respond_to_request(mdr, -CEPHFS_EINVAL);
4619 return;
4620 }
4621
4622 // created null dn.
4623 CDir *dir = dn->get_dir();
4624 CInode *diri = dir->get_inode();
4625 if (!check_access(mdr, diri, access))
4626 return;
4627 if (!check_fragment_space(mdr, dir))
4628 return;
4629 if (!check_dir_max_entries(mdr, dir))
4630 return;
4631
4632 if (mdr->dn[0].size() == 1)
4633 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
4634
4635 // create inode.
4636 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4637 req->head.args.open.mode | S_IFREG, &layout);
4638 ceph_assert(newi);
4639
4640 // it's a file.
4641 dn->push_projected_linkage(newi);
4642
4643 auto _inode = newi->_get_inode();
4644 _inode->version = dn->pre_dirty();
4645 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4646 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
4647 _inode->update_backtrace();
4648 _inode->rstat.rfiles = 1;
4649 _inode->accounted_rstat = _inode->rstat;
4650
4651 SnapRealm *realm = diri->find_snaprealm();
4652 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4653 ceph_assert(follows >= realm->get_newest_seq());
4654
4655 ceph_assert(dn->first == follows+1);
4656 newi->first = dn->first;
4657
4658 // do the open
4659 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
4660 newi->authlock.set_state(LOCK_EXCL);
4661 newi->xattrlock.set_state(LOCK_EXCL);
4662
4663 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4664 _inode->client_ranges[client].range.first = 0;
4665 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
4666 _inode->client_ranges[client].follows = follows;
4667 newi->mark_clientwriteable();
4668 cap->mark_clientwriteable();
4669 }
4670
4671 // prepare finisher
4672 mdr->ls = mdlog->get_current_segment();
4673 EUpdate *le = new EUpdate(mdlog, "openc");
4674 mdlog->start_entry(le);
4675 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4676 journal_allocated_inos(mdr, &le->metablob);
4677 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4678 le->metablob.add_primary_dentry(dn, newi, true, true, true);
4679
4680 // make sure this inode gets into the journal
4681 le->metablob.add_opened_ino(newi->ino());
4682
4683 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
4684
4685 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4686 openc_response_t ocresp;
4687
4688 dout(10) << "adding created_ino and delegated_inos" << dendl;
4689 ocresp.created_ino = _inode->ino;
4690
4691 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4692 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4693 unsigned frac = 100 / delegate_inos_pct;
4694 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4695 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4696 }
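  // Worked example of the delegation math above (illustrative values): with
  // delegate_inos_pct = 50 and mds_client_prealloc_inos = 1000, frac = 2, so
  // once the session holds fewer than 1000 / 2 / 2 = 250 delegated inos we
  // hand out another 1000 / 2 = 500, letting the client assign inode numbers
  // for new files locally instead of round-tripping per create.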
4697
4698 encode(ocresp, mdr->reply_extra_bl);
4699 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4700 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4701 // add the created ino to the reply when the client supports the create-reply feature
4702 encode(newi->ino(), mdr->reply_extra_bl);
4703 }
4704
4705 journal_and_reply(mdr, newi, dn, le, fin);
4706
4707 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4708 // have overshot the split size (multiple opencs in flight), so here is
4709 // an early chance to split the dir if this openc makes it oversized.
4710 mds->balancer->maybe_fragment(dir, false);
4711}
4712
4713
4714void Server::_finalize_readdir(MDRequestRef& mdr,
4715 CInode *diri,
4716 CDir* dir,
4717 bool start,
4718 bool end,
4719 __u16 flags,
4720 __u32 numfiles,
4721 bufferlist& dirbl,
4722 bufferlist& dnbl)
4723{
4724 const cref_t<MClientRequest> &req = mdr->client_request;
4725 Session *session = mds->get_session(req);
4726
4727 session->touch_readdir_cap(numfiles);
4728
4729 if (end) {
4730 flags |= CEPH_READDIR_FRAG_END;
4731 if (start)
4732 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4733 }
4734
4735 // finish final blob
4736 encode(numfiles, dirbl);
4737 encode(flags, dirbl);
4738 dirbl.claim_append(dnbl);
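  // At this point dirbl holds the complete readdir payload; conceptually:
  //   [DirStat][__u32 numfiles][__u16 flags][dentry name + lease + inodestat] * numfiles
  // (dnbl was filled entry by entry in handle_client_readdir and spliced in here).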
4739
4740 // yay, reply
4741 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4742 << " bytes=" << dirbl.length()
4743 << " start=" << (int)start
4744 << " end=" << (int)end
4745 << dendl;
4746 mdr->reply_extra_bl = dirbl;
4747
4748 // bump popularity. NOTE: this doesn't quite capture it.
4749 mds->balancer->hit_dir(dir, META_POP_READDIR, numfiles);
4750
4751 // reply
4752 mdr->tracei = diri;
4753 respond_to_request(mdr, 0);
4754}
4755
4756void Server::handle_client_readdir(MDRequestRef& mdr)
4757{
4758 const cref_t<MClientRequest> &req = mdr->client_request;
4759 Session *session = mds->get_session(req);
4760 client_t client = req->get_source().num();
4761 MutationImpl::LockOpVec lov;
4762 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
4763 if (!diri) return;
4764
4765 // it's a directory, right?
4766 if (!diri->is_dir()) {
4767 // not a dir
4768 dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
4769 respond_to_request(mdr, -CEPHFS_ENOTDIR);
4770 return;
4771 }
4772
4773 auto num_caps = session->get_num_caps();
4774 auto session_cap_acquisition = session->get_cap_acquisition();
4775
4776 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
4777 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
4778 << " session_cap_acquisition: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
4779 if (logger)
4780 logger->inc(l_mdss_cap_acquisition_throttle);
4781
4782 mdr->mark_event("cap_acquisition_throttle");
4783 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
4784 return;
4785 }
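  // Illustrative trip condition for the throttle above (example values only):
  // with max_caps_per_client = 1000000, max_caps_throttle_ratio = 0.9 and
  // cap_acquisition_throttle = 500000, a session holding more than 900000
  // caps whose recent readdirs acquired at least 500000 more is parked on a
  // timer for caps_throttle_retry_request_timeout seconds, giving cap recall
  // a chance to catch up before the readdir is retried.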
4786
4787 lov.add_rdlock(&diri->filelock);
4788 lov.add_rdlock(&diri->dirfragtreelock);
7c673cae 4789
4790 if (!mds->locker->acquire_locks(mdr, lov))
4791 return;
4792
4793 if (!check_access(mdr, diri, MAY_READ))
4794 return;
4795
4796 // which frag?
4797 frag_t fg = (__u32)req->head.args.readdir.frag;
4798 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4799 string offset_str = req->get_path2();
4800
4801 __u32 offset_hash = 0;
4802 if (!offset_str.empty())
4803 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4804 else
4805 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4806
4807 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4808 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4809
4810 // does the frag exist?
4811 if (diri->dirfragtree[fg.value()] != fg) {
4812 frag_t newfg;
4813 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4814 if (fg.contains((unsigned)offset_hash)) {
4815 newfg = diri->dirfragtree[offset_hash];
4816 } else {
4817 // client actually wants next frag
4818 newfg = diri->dirfragtree[fg.value()];
4819 }
4820 } else {
4821 offset_str.clear();
4822 newfg = diri->dirfragtree[fg.value()];
4823 }
4824 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4825 fg = newfg;
4826 }
4827
4828 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4829 if (!dir) return;
4830
4831 // ok!
4832 dout(10) << "handle_client_readdir on " << *dir << dendl;
4833 ceph_assert(dir->is_auth());
4834
4835 if (!dir->is_complete()) {
4836 if (dir->is_frozen()) {
4837 dout(7) << "dir is frozen " << *dir << dendl;
4838 mds->locker->drop_locks(mdr.get());
4839 mdr->drop_local_auth_pins();
4840 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4841 return;
4842 }
4843 // fetch
4844 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4845 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4846 return;
4847 }
4848
4849#ifdef MDS_VERIFY_FRAGSTAT
4850 dir->verify_fragstat();
4851#endif
4852
4853 utime_t now = ceph_clock_now();
4854 mdr->set_mds_stamp(now);
4855
4856 snapid_t snapid = mdr->snapid;
4857 dout(10) << "snapid " << snapid << dendl;
4858
4859 SnapRealm *realm = diri->find_snaprealm();
4860
4861 unsigned max = req->head.args.readdir.max_entries;
4862 if (!max)
4863 max = dir->get_num_any(); // whatever, something big.
4864 unsigned max_bytes = req->head.args.readdir.max_bytes;
4865 if (!max_bytes)
4866 // make sure at least one item can be encoded
4867 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4868
4869 // start final blob
4870 bufferlist dirbl;
4871 DirStat ds;
4872 ds.frag = dir->get_frag();
4873 ds.auth = dir->get_dir_auth().first;
4874 if (dir->is_auth() && !forward_all_requests_to_auth)
4875 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4876
4877 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4878
4879 // count bytes available.
4880 // this isn't perfect, but we should capture the main variable/unbounded size items!
4881 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4882 int bytes_left = max_bytes - front_bytes;
4883 bytes_left -= get_snap_trace(session, realm).length();
4884
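  // Byte-budget sketch: front_bytes reserves room for the DirStat blob plus
  // the trailing __u32 numfiles count and the 2-byte flags word appended by
  // _finalize_readdir, and the snap trace is pre-charged so a long snapshot
  // history cannot push the reply past max_bytes (at least (512 << 10) bytes
  // when the client passes 0, per the fallback above).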
4885 // build dir contents
4886 bufferlist dnbl;
4887 __u32 numfiles = 0;
4888 bool start = !offset_hash && offset_str.empty();
4889 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4890 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4891 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4892 bool end = (it == dir->end());
4893 for (; !end && numfiles < max; end = (it == dir->end())) {
4894 CDentry *dn = it->second;
4895 ++it;
4896
4897 if (dn->state_test(CDentry::STATE_PURGING))
4898 continue;
4899
4900 bool dnp = dn->use_projected(client, mdr);
4901 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4902
4903 if (dnl->is_null()) {
4904 if (dn->get_num_ref() == 0 && !dn->is_projected())
4905 dir->remove_dentry(dn);
4906 continue;
4907 }
4908
4909 if (dn->last < snapid || dn->first > snapid) {
4910 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4911 continue;
4912 }
4913
4914 if (!start) {
4915 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4916 if (!(offset_key < dn->key()))
4917 continue;
4918 }
4919
4920 CInode *in = dnl->get_inode();
4921
4922 if (in && in->ino() == CEPH_INO_CEPH)
4923 continue;
4924
4925 // remote link?
4926 // better for the MDS to do the work, if we think the client will stat any of these files.
4927 if (dnl->is_remote() && !in) {
4928 in = mdcache->get_inode(dnl->get_remote_ino());
4929 if (in) {
4930 dn->link_remote(dnl, in);
4931 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4932 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4933 continue;
4934 } else {
4935 // touch everything i _do_ have
4936 for (auto &p : *dir) {
4937 if (!p.second->get_linkage()->is_null())
4938 mdcache->lru.lru_touch(p.second);
4939 }
4940
4941 // already issued caps and leases, reply immediately.
4942 if (dnbl.length() > 0) {
4943 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4944 dout(10) << " open remote dentry after caps were issued, stopping at "
4945 << dnbl.length() << " < " << bytes_left << dendl;
4946 break;
4947 }
4948
4949 mds->locker->drop_locks(mdr.get());
4950 mdr->drop_local_auth_pins();
4951 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4952 return;
4953 }
4954 }
4955 ceph_assert(in);
4956
4957 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4958 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4959 break;
4960 }
4961
4962 unsigned start_len = dnbl.length();
4963
4964 // dentry
4965 dout(12) << "including dn " << *dn << dendl;
4966 encode(dn->get_name(), dnbl);
4967 mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
4968
4969 // inode
4970 dout(12) << "including inode in " << *in << " snap " << snapid << dendl;
4971 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4972 if (r < 0) {
4973 // chop off dn->name, lease
4974 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4975 bufferlist keep;
4976 keep.substr_of(dnbl, 0, start_len);
4977 dnbl.swap(keep);
4978 break;
4979 }
4980 ceph_assert(r >= 0);
4981 numfiles++;
4982
4983 // touch dn
4984 mdcache->lru.lru_touch(dn);
4985 }
4986 __u16 flags = 0;
4987 // older clients only understand the END and COMPLETE flags
4988 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4989 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4990 }
4991 _finalize_readdir(mdr, diri, dir, start, end, flags, numfiles, dirbl, dnbl);
4992}
4993
4994
4995
4996// ===============================================================================
4997// INODE UPDATES
4998
4999
5000/*
5001 * finisher for basic inode updates
5002 */
5003class C_MDS_inode_update_finish : public ServerLogContext {
5004 CInode *in;
5005 bool truncating_smaller, changed_ranges, adjust_realm;
5006public:
5007 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
5008 bool sm=false, cr=false, ar=false) :
5009 ServerLogContext(s, r), in(i),
5010 truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
5011 void finish(int r) override {
5012 ceph_assert(r == 0);
5013
5014 int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);
5015
7c673cae 5016 // apply
5017 mdr->apply();
5018
5019 MDSRank *mds = get_mds();
5020
7c673cae 5021 // notify any clients
5022 if (truncating_smaller && in->get_inode()->is_truncating()) {
5023 mds->locker->issue_truncate(in);
5024 mds->mdcache->truncate_inode(in, mdr->ls);
5025 }
5026
5027 if (adjust_realm) {
5028 mds->mdcache->send_snap_update(in, 0, snap_op);
5029 mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
5030 }
5031
5032 get_mds()->balancer->hit_inode(in, META_POP_IWR);
5033
5034 server->respond_to_request(mdr, 0);
5035
5036 if (changed_ranges)
5037 get_mds()->locker->share_inode_max_size(in);
5038 }
5039};
5040
5041void Server::handle_client_file_setlock(MDRequestRef& mdr)
5042{
5043 const cref_t<MClientRequest> &req = mdr->client_request;
5044 MutationImpl::LockOpVec lov;
5045
5046 // get the inode to operate on, and set up any locks needed for that
5047 CInode *cur = rdlock_path_pin_ref(mdr, true);
5048 if (!cur)
5049 return;
5050
5051 lov.add_xlock(&cur->flocklock);
5052 /* acquire_locks will return true if it gets the locks. If it fails,
5053 it will redeliver this request at a later date, so drop the request.
5054 */
5055 if (!mds->locker->acquire_locks(mdr, lov)) {
5056 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
5057 return;
5058 }
5059
5060 // copy the lock change into a ceph_filelock so we can store/apply it
5061 ceph_filelock set_lock;
5062 set_lock.start = req->head.args.filelock_change.start;
5063 set_lock.length = req->head.args.filelock_change.length;
5064 set_lock.client = req->get_orig_source().num();
5065 set_lock.owner = req->head.args.filelock_change.owner;
5066 set_lock.pid = req->head.args.filelock_change.pid;
5067 set_lock.type = req->head.args.filelock_change.type;
5068 bool will_wait = req->head.args.filelock_change.wait;
5069
5070 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
5071
5072 ceph_lock_state_t *lock_state = NULL;
5073 bool interrupt = false;
5074
5075 // get the appropriate lock state
5076 switch (req->head.args.filelock_change.rule) {
5077 case CEPH_LOCK_FLOCK_INTR:
5078 interrupt = true;
5079 // fall-thru
5080 case CEPH_LOCK_FLOCK:
5081 lock_state = cur->get_flock_lock_state();
5082 break;
5083
5084 case CEPH_LOCK_FCNTL_INTR:
5085 interrupt = true;
5086 // fall-thru
5087 case CEPH_LOCK_FCNTL:
5088 lock_state = cur->get_fcntl_lock_state();
5089 break;
5090
5091 default:
5092 dout(10) << "got unknown lock type " << set_lock.type
5093 << ", dropping request!" << dendl;
5094 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
5095 return;
5096 }
5097
5098 dout(10) << " state prior to lock change: " << *lock_state << dendl;
5099 if (CEPH_LOCK_UNLOCK == set_lock.type) {
5100 list<ceph_filelock> activated_locks;
5101 MDSContext::vec waiters;
5102 if (lock_state->is_waiting(set_lock)) {
5103 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
5104 lock_state->remove_waiting(set_lock);
5105 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
5106 } else if (!interrupt) {
5107 dout(10) << " unlock attempt on " << set_lock << dendl;
5108 lock_state->remove_lock(set_lock, activated_locks);
5109 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
5110 }
5111 mds->queue_waiters(waiters);
5112
5113 respond_to_request(mdr, 0);
5114 } else {
5115 dout(10) << " lock attempt on " << set_lock << dendl;
5116 bool deadlock = false;
5117 if (mdr->more()->flock_was_waiting &&
5118 !lock_state->is_waiting(set_lock)) {
5119 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
5120 respond_to_request(mdr, -CEPHFS_EINTR);
5121 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
5122 dout(10) << " it failed on this attempt" << dendl;
5123 // couldn't set lock right now
5124 if (deadlock) {
5125 respond_to_request(mdr, -CEPHFS_EDEADLK);
5126 } else if (!will_wait) {
5127 respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
5128 } else {
5129 dout(10) << " added to waiting list" << dendl;
5130 ceph_assert(lock_state->is_waiting(set_lock));
5131 mdr->more()->flock_was_waiting = true;
5132 mds->locker->drop_locks(mdr.get());
5133 mdr->drop_local_auth_pins();
5134 mdr->mark_event("failed to add lock, waiting");
5135 mdr->mark_nowarn();
5136 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
5137 }
5138 } else
5139 respond_to_request(mdr, 0);
5140 }
5141 dout(10) << " state after lock change: " << *lock_state << dendl;
5142}
5143
5144void Server::handle_client_file_readlock(MDRequestRef& mdr)
5145{
5146 const cref_t<MClientRequest> &req = mdr->client_request;
5147 MutationImpl::LockOpVec lov;
5148
5149 // get the inode to operate on, and set up any locks needed for that
5150 CInode *cur = rdlock_path_pin_ref(mdr, true);
5151 if (!cur)
5152 return;
5153
5154 /* acquire_locks will return true if it gets the locks. If it fails,
5155 it will redeliver this request at a later date, so drop the request.
5156 */
5157 lov.add_rdlock(&cur->flocklock);
5158 if (!mds->locker->acquire_locks(mdr, lov)) {
5159 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
5160 return;
5161 }
5162
5163 // copy the lock change into a ceph_filelock so we can store/apply it
5164 ceph_filelock checking_lock;
5165 checking_lock.start = req->head.args.filelock_change.start;
5166 checking_lock.length = req->head.args.filelock_change.length;
5167 checking_lock.client = req->get_orig_source().num();
5168 checking_lock.owner = req->head.args.filelock_change.owner;
5169 checking_lock.pid = req->head.args.filelock_change.pid;
5170 checking_lock.type = req->head.args.filelock_change.type;
5171
5172 // get the appropriate lock state
5173 ceph_lock_state_t *lock_state = NULL;
5174 switch (req->head.args.filelock_change.rule) {
5175 case CEPH_LOCK_FLOCK:
5176 lock_state = cur->get_flock_lock_state();
5177 break;
5178
5179 case CEPH_LOCK_FCNTL:
5180 lock_state = cur->get_fcntl_lock_state();
5181 break;
5182
5183 default:
5184 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
5185 respond_to_request(mdr, -CEPHFS_EINVAL);
5186 return;
5187 }
5188 lock_state->look_for_lock(checking_lock);
5189
5190 bufferlist lock_bl;
5191 encode(checking_lock, lock_bl);
5192
5193 mdr->reply_extra_bl = lock_bl;
5194 respond_to_request(mdr, 0);
5195}
5196
5197void Server::handle_client_setattr(MDRequestRef& mdr)
5198{
5199 const cref_t<MClientRequest> &req = mdr->client_request;
5200 MutationImpl::LockOpVec lov;
5201 CInode *cur = rdlock_path_pin_ref(mdr, true);
5202 if (!cur) return;
5203
5204 if (mdr->snapid != CEPH_NOSNAP) {
5205 respond_to_request(mdr, -CEPHFS_EROFS);
5206 return;
5207 }
5208 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
5209 respond_to_request(mdr, -CEPHFS_EPERM);
5210 return;
5211 }
5212
5213 __u32 mask = req->head.args.setattr.mask;
5214 __u32 access_mask = MAY_WRITE;
5215
5216 if (req->get_header().version < 6) {
5217 // No changes to fscrypted inodes by downrevved clients
5218 if (!cur->get_inode()->fscrypt_auth.empty()) {
5219 respond_to_request(mdr, -CEPHFS_EPERM);
5220 return;
5221 }
5222
5223 // Only allow fscrypt field changes by capable clients
5224 if (mask & (CEPH_SETATTR_FSCRYPT_FILE|CEPH_SETATTR_FSCRYPT_AUTH)) {
5225 respond_to_request(mdr, -CEPHFS_EINVAL);
5226 return;
5227 }
5228 }
5229
7c673cae 5230 // xlock inode
5231 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID|CEPH_SETATTR_FSCRYPT_AUTH|CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID))
5232 lov.add_xlock(&cur->authlock);
5233 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE|CEPH_SETATTR_FSCRYPT_FILE))
5234 lov.add_xlock(&cur->filelock);
5235 if (mask & CEPH_SETATTR_CTIME)
5236 lov.add_wrlock(&cur->versionlock);
5237
5238 if (!mds->locker->acquire_locks(mdr, lov))
5239 return;
5240
5241 if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
5242 access_mask |= MAY_CHOWN;
5243
5244 if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
5245 access_mask |= MAY_CHGRP;
5246
5247 if (!check_access(mdr, cur, access_mask))
5248 return;
5249
5250 // trunc from bigger -> smaller?
5251 const auto& pip = cur->get_projected_inode();
5252
5253 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
5254
5255 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
5256 if (is_full && req->head.args.setattr.size > old_size) {
5257 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
5258 respond_to_request(mdr, -CEPHFS_ENOSPC);
5259 return;
5260 }
5261
5262 bool truncating_smaller = false;
5263 if (mask & CEPH_SETATTR_SIZE) {
5264 if (req->get_data().length() >
5265 sizeof(struct ceph_fscrypt_last_block_header) + fscrypt_last_block_max_size) {
5266 dout(10) << __func__ << ": the last block size is too large" << dendl;
5267 respond_to_request(mdr, -CEPHFS_EINVAL);
5268 return;
5269 }
5270
5271 truncating_smaller = req->head.args.setattr.size < old_size ||
5272 (req->head.args.setattr.size == old_size && req->get_data().length());
5273 if (truncating_smaller && pip->is_truncating()) {
5274 dout(10) << " waiting for pending truncate from " << pip->truncate_from
5275 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
5276 mds->locker->drop_locks(mdr.get());
5277 mdr->drop_local_auth_pins();
5278 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
5279 return;
5280 }
5281
5282 if (truncating_smaller && req->get_data().length()) {
5283 struct ceph_fscrypt_last_block_header header;
5284 memset(&header, 0, sizeof(header));
5285 auto bl = req->get_data().cbegin();
5286 DECODE_START(1, bl);
5287 decode(header.change_attr, bl);
5288 DECODE_FINISH(bl);
5289
5290 dout(20) << __func__ << " mdr->retry:" << mdr->retry
5291 << " header.change_attr: " << header.change_attr
5292 << " header.file_offset: " << header.file_offset
5293 << " header.block_size: " << header.block_size
5294 << dendl;
5295
5296 if (header.change_attr != pip->change_attr) {
5297 dout(5) << __func__ << ": header.change_attr:" << header.change_attr
5298 << " != current change_attr:" << pip->change_attr
5299 << ", let client retry it!" << dendl;
5300 // flush the journal so the client sees the latest possible
5301 // change_attr on the next retry
5302 mds->mdlog->flush();
5303 respond_to_request(mdr, -CEPHFS_EAGAIN);
5304 return;
5305 }
5306 }
5307 }
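  // Sketch of the fscrypt truncate contract implied above: a client truncating
  // an encrypted file to a size that is not block-aligned ships the
  // re-encrypted last block in req->get_data(), led by a
  // ceph_fscrypt_last_block_header recording the change_attr it was based on.
  // If the inode changed in the meantime, the MDS replies CEPHFS_EAGAIN rather
  // than apply a stale block, and the client redoes the read-modify-write.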
5308
5309 bool changed_ranges = false;
5310
5311 // project update
5312 mdr->ls = mdlog->get_current_segment();
5313 EUpdate *le = new EUpdate(mdlog, "setattr");
5314 mdlog->start_entry(le);
5315
5316 auto pi = cur->project_inode(mdr);
5317
5318 if (mask & CEPH_SETATTR_UID)
5319 pi.inode->uid = req->head.args.setattr.uid;
5320 if (mask & CEPH_SETATTR_GID)
5321 pi.inode->gid = req->head.args.setattr.gid;
5322
5323 if (mask & CEPH_SETATTR_MODE)
5324 pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
5325 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID|
5326 CEPH_SETATTR_KILL_SUID|CEPH_SETATTR_KILL_SGID)) &&
5327 S_ISREG(pi.inode->mode)) {
5328 if (mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID) &&
5329 (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
5330 pi.inode->mode &= ~(S_ISUID|S_ISGID);
5331 } else {
5332 if (mask & CEPH_SETATTR_KILL_SUID) {
5333 pi.inode->mode &= ~S_ISUID;
5334 }
5335 if (mask & CEPH_SETATTR_KILL_SGID) {
5336 pi.inode->mode &= ~S_ISGID;
5337 }
5338 }
5339 }
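  // The clearing above mirrors POSIX semantics: a chown (or an explicit
  // KILL_SGUID) on an executable file drops S_ISUID/S_ISGID so ownership
  // changes cannot yield an unexpected privileged binary, while the separate
  // KILL_SUID/KILL_SGID bits let clients request the same clearing on
  // non-executable files, e.g. after a write.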
5340
5341 if (mask & CEPH_SETATTR_MTIME)
5342 pi.inode->mtime = req->head.args.setattr.mtime;
5343 if (mask & CEPH_SETATTR_ATIME)
5344 pi.inode->atime = req->head.args.setattr.atime;
5345 if (mask & CEPH_SETATTR_BTIME)
5346 pi.inode->btime = req->head.args.setattr.btime;
5347 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
5348 pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
5349 if (mask & CEPH_SETATTR_SIZE) {
5350 if (truncating_smaller) {
5351 pi.inode->truncate(old_size, req->head.args.setattr.size, req->get_data());
5352 le->metablob.add_truncate_start(cur->ino());
5353 } else {
5354 pi.inode->size = req->head.args.setattr.size;
5355 pi.inode->rstat.rbytes = pi.inode->size;
7c673cae 5356 }
5357 pi.inode->mtime = mdr->get_op_stamp();
5358
5359 // adjust client's max_size?
5360 if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
5361 dout(10) << " client_ranges " << cur->get_previous_projected_inode()->client_ranges
5362 << " -> " << pi.inode->client_ranges << dendl;
5363 changed_ranges = true;
5364 }
5365 }
5366
5367 if (mask & CEPH_SETATTR_FSCRYPT_AUTH)
5368 pi.inode->fscrypt_auth = req->fscrypt_auth;
5369 if (mask & CEPH_SETATTR_FSCRYPT_FILE)
5370 pi.inode->fscrypt_file = req->fscrypt_file;
5371
5372 pi.inode->version = cur->pre_dirty();
5373 pi.inode->ctime = mdr->get_op_stamp();
5374 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5375 pi.inode->rstat.rctime = mdr->get_op_stamp();
5376 pi.inode->change_attr++;
5377
5378 // log + wait
5379 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5380 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5381 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5382
5383 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5384 truncating_smaller, changed_ranges));
5385
5386 // flush immediately if there are readers/writers waiting
5387 if (mdr->is_xlocked(&cur->filelock) &&
5388 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5389 mds->mdlog->flush();
5390}
5391
5392/* Takes responsibility for mdr */
5393void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5394{
5395 CInode *in = mdr->in[0];
5396 client_t client = mdr->get_client();
5397 ceph_assert(in);
5398
5399 dout(10) << "do_open_truncate " << *in << dendl;
5400
5401 SnapRealm *realm = in->find_snaprealm();
5402 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
5403
5404 mdr->ls = mdlog->get_current_segment();
5405 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5406 mdlog->start_entry(le);
5407
5408 // prepare
5409 auto pi = in->project_inode(mdr);
5410 pi.inode->version = in->pre_dirty();
5411 pi.inode->mtime = pi.inode->ctime = mdr->get_op_stamp();
5412 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5413 pi.inode->rstat.rctime = mdr->get_op_stamp();
5414 pi.inode->change_attr++;
5415
5416 uint64_t old_size = std::max<uint64_t>(pi.inode->size, mdr->client_request->head.args.open.old_size);
5417 if (old_size > 0) {
5418 pi.inode->truncate(old_size, 0);
5419 le->metablob.add_truncate_start(in->ino());
5420 }
5421
5422 bool changed_ranges = false;
5423 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
5424 pi.inode->client_ranges[client].range.first = 0;
5425 pi.inode->client_ranges[client].range.last = pi.inode->get_layout_size_increment();
5426 pi.inode->client_ranges[client].follows = realm->get_newest_seq();
5427 changed_ranges = true;
5428 in->mark_clientwriteable();
5429 cap->mark_clientwriteable();
5430 }
5431
5432 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5433
5434 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5435 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5436
5437 // make sure ino gets into the journal
5438 le->metablob.add_opened_ino(in->ino());
5439
5440 mdr->o_trunc = true;
5441
5442 CDentry *dn = 0;
5443 if (mdr->client_request->get_dentry_wanted()) {
5444 ceph_assert(mdr->dn[0].size());
5445 dn = mdr->dn[0].back();
5446 }
5447
5448 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5449 changed_ranges));
5450 // Although the `open` part can give an early reply, the truncation won't
5451 // happen until our EUpdate is persistent; to give the client a prompt
5452 // response we must also flush that event.
5453 mdlog->flush();
5454}
5455
5456
5457/* This function cleans up the passed mdr */
5458void Server::handle_client_setlayout(MDRequestRef& mdr)
5459{
5460 const cref_t<MClientRequest> &req = mdr->client_request;
5461 CInode *cur = rdlock_path_pin_ref(mdr, true);
5462 if (!cur) return;
5463
5464 if (mdr->snapid != CEPH_NOSNAP) {
5465 respond_to_request(mdr, -CEPHFS_EROFS);
5466 return;
5467 }
5468 if (!cur->is_file()) {
5469 respond_to_request(mdr, -CEPHFS_EINVAL);
5470 return;
5471 }
5472 if (cur->get_projected_inode()->size ||
5473 cur->get_projected_inode()->truncate_seq > 1) {
5474 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5475 return;
5476 }
5477
5478 // validate layout
5479 file_layout_t layout = cur->get_projected_inode()->layout;
5480 // save existing layout for later
5481 const auto old_layout = layout;
5482
5483 int access = MAY_WRITE;
5484
5485 if (req->head.args.setlayout.layout.fl_object_size > 0)
5486 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5487 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5488 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5489 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5490 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5491 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5492 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5493
5494 // make sure we have as new a map as the client
5495 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5496 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5497 return;
5498 }
5499 }
5500
5501 // Don't permit layout modifications without 'p' caps
5502 if (layout != old_layout) {
5503 access |= MAY_SET_VXATTR;
5504 }
5505
5506 if (!layout.is_valid()) {
5507 dout(10) << "bad layout" << dendl;
5508 respond_to_request(mdr, -CEPHFS_EINVAL);
5509 return;
5510 }
5511 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5512 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5513 respond_to_request(mdr, -CEPHFS_EINVAL);
5514 return;
5515 }
5516
5517 MutationImpl::LockOpVec lov;
5518 lov.add_xlock(&cur->filelock);
5519 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5520 return;
5521
5522 if (!check_access(mdr, cur, access))
5523 return;
5524
5525 // project update
5526 auto pi = cur->project_inode(mdr);
5527 pi.inode->layout = layout;
5528 // add the old pool to the inode
5529 pi.inode->add_old_pool(old_layout.pool_id);
5530 pi.inode->version = cur->pre_dirty();
5531 pi.inode->ctime = mdr->get_op_stamp();
5532 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5533 pi.inode->rstat.rctime = mdr->get_op_stamp();
5534 pi.inode->change_attr++;
5535
5536 // log + wait
5537 mdr->ls = mdlog->get_current_segment();
5538 EUpdate *le = new EUpdate(mdlog, "setlayout");
5539 mdlog->start_entry(le);
5540 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5541 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5542 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5543
5544 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5545}
5546
5547bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
5548{
5549 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5550 return true;
5551
5552 MutationImpl::LockOpVec lov;
5553 lov.add_xlock(&in->policylock);
5554 if (xlock_snaplock)
5555 lov.add_xlock(&in->snaplock);
5556 else
5557 lov.add_rdlock(&in->snaplock);
5558 if (!mds->locker->acquire_locks(mdr, lov))
5559 return false;
5560
5561 if (want_layout && in->get_projected_inode()->has_layout()) {
5562 mdr->dir_layout = in->get_projected_inode()->layout;
5563 want_layout = false;
5564 }
5565 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5566 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5567 return false;
5568 }
5569
5570 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5571 return true;
5572}
5573
5574CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5575{
5576 CInode *in = mdcache->get_inode(ino);
5577 if (!in || in->state_test(CInode::STATE_PURGING)) {
5578 respond_to_request(mdr, -CEPHFS_ESTALE);
5579 return nullptr;
5580 }
5581 if (!in->is_auth()) {
5582 mdcache->request_forward(mdr, in->authority().first);
5583 return nullptr;
5584 }
5585
5586 return in;
5587}
5588
5589void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5590{
5591 const cref_t<MClientRequest> &req = mdr->client_request;
5592
5593 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5594 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5595 if (!cur)
5596 return;
5597
5598 if (!cur->is_dir()) {
5599 respond_to_request(mdr, -CEPHFS_ENOTDIR);
5600 return;
5601 }
5602
5603 if (!xlock_policylock(mdr, cur, true))
5604 return;
5605
5606 // validate layout
5607 const auto& old_pi = cur->get_projected_inode();
5608 file_layout_t layout;
5609 if (old_pi->has_layout())
5610 layout = old_pi->layout;
5611 else if (mdr->dir_layout != file_layout_t())
5612 layout = mdr->dir_layout;
5613 else
5614 layout = mdcache->default_file_layout;
5615
5616 // Level of access required to complete
5617 int access = MAY_WRITE;
5618
5619 const auto old_layout = layout;
5620
5621 if (req->head.args.setlayout.layout.fl_object_size > 0)
5622 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5623 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5624 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5625 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5626 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5627 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5628 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5629 // make sure we have as new a map as the client
5630 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5631 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5632 return;
5633 }
5634 }
5635
5636 if (layout != old_layout) {
5637 access |= MAY_SET_VXATTR;
5638 }
5639
5640 if (!layout.is_valid()) {
5641 dout(10) << "bad layout" << dendl;
5642 respond_to_request(mdr, -CEPHFS_EINVAL);
5643 return;
5644 }
5645 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5646 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5647 respond_to_request(mdr, -CEPHFS_EINVAL);
5648 return;
5649 }
5650
5651 if (!check_access(mdr, cur, access))
5652 return;
5653
5654 auto pi = cur->project_inode(mdr);
5655 pi.inode->layout = layout;
5656 pi.inode->version = cur->pre_dirty();
5657
5658 // log + wait
5659 mdr->ls = mdlog->get_current_segment();
5660 EUpdate *le = new EUpdate(mdlog, "setlayout");
5661 mdlog->start_entry(le);
5662 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5663 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5664 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5665
5666 mdr->no_early_reply = true;
5667 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5668}
5669
5670// XATTRS
5671int Server::parse_layout_vxattr_json(
5672 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
5673{
5674 auto parse_pool = [&](std::string pool_name, int64_t pool_id) -> int64_t {
5675 if (pool_name != "") {
5676 int64_t _pool_id = osdmap.lookup_pg_pool_name(pool_name);
5677 if (_pool_id < 0) {
5678 dout(10) << __func__ << ": unknown pool name:" << pool_name << dendl;
5679 return -CEPHFS_EINVAL;
5680 }
5681 return _pool_id;
5682 } else if (pool_id >= 0) {
5683 const auto pools = osdmap.get_pools();
5684 if (pools.find(pool_id) == pools.end()) {
5685 dout(10) << __func__ << ": unknown pool id:" << pool_id << dendl;
5686 return -CEPHFS_EINVAL;
5687 }
5688 return pool_id;
5689 } else {
5690 return -CEPHFS_EINVAL;
5691 }
5692 };
5693
5694 try {
5695 if (name == "layout.json") {
5696 JSONParser json_parser;
5697 if (json_parser.parse(value.c_str(), value.length()) and json_parser.is_object()) {
5698 std::string field;
5699 try {
5700 field = "object_size";
5701 JSONDecoder::decode_json("object_size", layout->object_size, &json_parser, true);
5702
5703 field = "stripe_unit";
5704 JSONDecoder::decode_json("stripe_unit", layout->stripe_unit, &json_parser, true);
5705
5706 field = "stripe_count";
5707 JSONDecoder::decode_json("stripe_count", layout->stripe_count, &json_parser, true);
5708
5709 field = "pool_namespace";
5710 JSONDecoder::decode_json("pool_namespace", layout->pool_ns, &json_parser, false);
5711
5712 field = "pool_id";
5713 int64_t pool_id = 0;
5714 JSONDecoder::decode_json("pool_id", pool_id, &json_parser, false);
5715
5716 field = "pool_name";
5717 std::string pool_name;
5718 JSONDecoder::decode_json("pool_name", pool_name, &json_parser, false);
5719
5720 pool_id = parse_pool(pool_name, pool_id);
5721 if (pool_id < 0) {
5722 return (int)pool_id;
5723 }
5724 layout->pool_id = pool_id;
5725 } catch (JSONDecoder::err&) {
5726 dout(10) << __func__ << ": json is missing a mandatory field named "
5727 << field << dendl;
5728 return -CEPHFS_EINVAL;
5729 }
5730 } else {
5731 dout(10) << __func__ << ": bad json" << dendl;
5732 return -CEPHFS_EINVAL;
5733 }
5734 } else {
5735 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5736 return -CEPHFS_ENODATA; // no such attribute
5737 }
5738 } catch (boost::bad_lexical_cast const&) {
5739 dout(10) << __func__ << ": bad vxattr value:" << value
5740 << ", unable to parse for xattr:" << name << dendl;
5741 return -CEPHFS_EINVAL;
5742 }
5743 return 0;
5744}
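/*
 * Example of a value this parser accepts (illustrative; the pool name is an
 * assumption), e.g. set via something like:
 *   setfattr -n ceph.dir.layout.json -v '{"object_size": 4194304,
 *     "stripe_unit": 4194304, "stripe_count": 1, "pool_name": "cephfs_data"}'
 * object_size, stripe_unit and stripe_count are mandatory; pool_name or
 * pool_id must resolve to an existing pool (pool_name wins when both appear).
 */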
5745
5746// parse old style layout string
5747int Server::parse_layout_vxattr_string(
5748 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
5749{
5750 try {
5751 if (name == "layout") {
5752 string::iterator begin = value.begin();
5753 string::iterator end = value.end();
5754 keys_and_values<string::iterator> p; // create instance of parser
5755 std::map<string, string> m; // map to receive results
5756 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5757 return -CEPHFS_EINVAL;
5758 }
5759 string left(begin, end);
5760 dout(10) << __func__ << ": parsed " << m << " left '" << left << "'" << dendl;
5761 if (begin != end)
5762 return -CEPHFS_EINVAL;
5763 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5764 // Skip validation on each attr, we do it once at the end (avoid
5765 // rejecting intermediate states if the overall result is ok)
5766 int r = parse_layout_vxattr_string(string("layout.") + q->first, q->second,
5767 osdmap, layout);
5768 if (r < 0)
5769 return r;
5770 }
5771 } else if (name == "layout.object_size") {
5772 layout->object_size = boost::lexical_cast<unsigned>(value);
5773 } else if (name == "layout.stripe_unit") {
5774 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5775 } else if (name == "layout.stripe_count") {
5776 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5777 } else if (name == "layout.pool") {
5778 try {
5779 layout->pool_id = boost::lexical_cast<unsigned>(value);
5780 } catch (boost::bad_lexical_cast const&) {
5781 int64_t pool = osdmap.lookup_pg_pool_name(value);
5782 if (pool < 0) {
5783 dout(10) << __func__ << ": unknown pool " << value << dendl;
5784 return -CEPHFS_ENOENT;
5785 }
5786 layout->pool_id = pool;
5787 }
5788 } else if (name == "layout.pool_id") {
5789 layout->pool_id = boost::lexical_cast<int64_t>(value);
5790 } else if (name == "layout.pool_name") {
5791 layout->pool_id = osdmap.lookup_pg_pool_name(value);
5792 if (layout->pool_id < 0) {
5793 dout(10) << __func__ << ": unknown pool " << value << dendl;
5794 return -CEPHFS_EINVAL;
5795 }
5796 } else if (name == "layout.pool_namespace") {
5797 layout->pool_ns = value;
5798 } else {
5799 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5800 return -CEPHFS_ENODATA; // no such attribute
5801 }
5802 } catch (boost::bad_lexical_cast const&) {
5803 dout(10) << __func__ << ": bad vxattr value, unable to parse int for "
5804 << name << dendl;
5805 return -CEPHFS_EINVAL;
5806 }
5807 return 0;
5808}
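/*
 * Illustrative old-style value (the key=value list is split by the
 * keys_and_values grammar and re-fed here one "layout.<key>" at a time):
 *   "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data"
 * Individual fields can also be set directly, e.g. layout.pool_name.
 */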
5809
5810int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5811 file_layout_t *layout, bool validate)
5812{
5813 dout(20) << __func__ << ": name:" << name << " value:'" << value << "'" << dendl;
5814
5815 int r;
5816 if (name == "layout.json") {
5817 r = parse_layout_vxattr_json(name, value, osdmap, layout);
5818 } else {
5819 r = parse_layout_vxattr_string(name, value, osdmap, layout);
5820 }
5821 if (r < 0) {
5822 return r;
5823 }
5824
5825 if (validate && !layout->is_valid()) {
5826 dout(10) << __func__ << ": bad layout" << dendl;
5827 return -CEPHFS_EINVAL;
5828 }
5829 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5830 dout(10) << __func__ << ": invalid data pool " << layout->pool_id << dendl;
5831 return -CEPHFS_EINVAL;
5832 }
5833 return 0;
5834}
5835
5836int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5837{
5838 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5839 try {
5840 if (name == "quota") {
5841 string::iterator begin = value.begin();
5842 string::iterator end = value.end();
5843 if (begin == end) {
5844 // keep quota unchanged. (for create_quota_realm())
5845 return 0;
5846 }
5847 keys_and_values<string::iterator> p; // create instance of parser
5848 std::map<string, string> m; // map to receive results
5849 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5850 return -CEPHFS_EINVAL;
5851 }
5852 string left(begin, end);
5853 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5854 if (begin != end)
5855 return -CEPHFS_EINVAL;
5856 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5857 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5858 if (r < 0)
5859 return r;
5860 }
5861 } else if (name == "quota.max_bytes") {
5862 int64_t q = boost::lexical_cast<int64_t>(value);
5863 if (q < 0)
5864 return -CEPHFS_EINVAL;
5865 quota->max_bytes = q;
5866 } else if (name == "quota.max_files") {
5867 int64_t q = boost::lexical_cast<int64_t>(value);
5868 if (q < 0)
5869 return -CEPHFS_EINVAL;
5870 quota->max_files = q;
5871 } else {
5872 dout(10) << " unknown quota vxattr " << name << dendl;
5873 return -CEPHFS_EINVAL;
5874 }
5875 } catch (boost::bad_lexical_cast const&) {
5876 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5877 return -CEPHFS_EINVAL;
5878 }
5879
5880 if (!quota->is_valid()) {
5881 dout(10) << "bad quota" << dendl;
5882 return -CEPHFS_EINVAL;
5883 }
5884 return 0;
5885}
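/*
 * Illustrative quota value for the combined form above (individual
 * "quota.max_bytes"/"quota.max_files" keys are parsed the same way):
 *   "max_bytes=10737418240 max_files=10000"   // 10 GiB, 10k files
 * A value of 0 is accepted and conventionally means "no limit", while an
 * empty value leaves the quota untouched (used by create_quota_realm()).
 */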
5886
5887void Server::create_quota_realm(CInode *in)
5888{
5889 dout(10) << __func__ << " " << *in << dendl;
5890
5891 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
5892 req->set_filepath(filepath(in->ino()));
5893 req->set_string2("ceph.quota");
5894 // empty vxattr value
5895 req->set_tid(mds->issue_tid());
5896
5897 mds->send_message_mds(req, in->authority().first);
5898}
5899
5900/*
5901 * Verify that the file layout attribute carried by client
5902 * is well-formatted.
5903 * Return 0 on success, otherwise this function takes
5904 * responsibility for the passed mdr.
5905 */
5906int Server::check_layout_vxattr(MDRequestRef& mdr,
5907 string name,
5908 string value,
5909 file_layout_t *layout)
5910{
9f95a23c 5911 const cref_t<MClientRequest> &req = mdr->client_request;
5912 epoch_t epoch;
5913 int r;
5914
5915 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5916 r = parse_layout_vxattr(name, value, osdmap, layout);
5917 epoch = osdmap.get_epoch();
5918 });
5919
f67539c2 5920 if (r == -CEPHFS_ENOENT) {
5921
5922 // we don't have the specified pool, make sure our map
5923 // is newer than or as new as the client.
5924 epoch_t req_epoch = req->get_osdmap_epoch();
5925
5926 if (req_epoch > epoch) {
5927
5928 // well, our map is older. consult mds.
f67539c2 5929 auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
7c673cae 5930
5931 mds->objecter->wait_for_map(req_epoch, lambdafy(fin));
5932 return r;
5933 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5934
5935 // For compatibility with clients running old code, we still need to get
5936 // the latest map. One day, once COMPACT_VERSION of MClientRequest is >= 3,
5937 // we can remove this code.
5938 mdr->waited_for_osdmap = true;
5939 mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5940 mds, new C_MDS_RetryRequest(mdcache, mdr))));
5941 return r;
5942 }
5943 }
5944
5945 if (r < 0) {
5946
5947 if (r == -CEPHFS_ENOENT)
5948 r = -CEPHFS_EINVAL;
5949
5950 respond_to_request(mdr, r);
5951 return r;
5952 }
5953
5954 // all is well
5955 return 0;
5956}
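// Plain-language sketch (not normative) of the retry rule above: a pool the
// MDS cannot resolve may simply be newer than the MDS's osdmap. If the
// client's osdmap epoch is ahead of ours, wait for that epoch and retry the
// request instead of failing it with CEPHFS_EINVAL:
#if 0
if (r == -CEPHFS_ENOENT && req->get_osdmap_epoch() > epoch) {
  // objecter->wait_for_map(req->get_osdmap_epoch(), <retry this request>);
  // return; the request is re-dispatched once the newer map arrives
}
#endif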
5957
9f95a23c 5958void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
7c673cae 5959{
9f95a23c 5960 const cref_t<MClientRequest> &req = mdr->client_request;
1e59de90 5961 MutationImpl::LockOpVec lov;
5962 string name(req->get_path2());
5963 bufferlist bl = req->get_data();
5964 string value (bl.c_str(), bl.length());
5965 dout(10) << "handle_set_vxattr " << name
5966 << " val " << value.length()
5967 << " bytes on " << *cur
5968 << dendl;
5969
94b18763 5970 CInode::mempool_inode *pip = nullptr;
5971 string rest;
5972
5973 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5974 return;
5975 }
5976
adb31ebb 5977 bool adjust_realm = false;
5978 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5979 if (!cur->is_dir()) {
f67539c2 5980 respond_to_request(mdr, -CEPHFS_EINVAL);
5981 return;
5982 }
5983
5984 if (!xlock_policylock(mdr, cur, true))
5985 return;
5986
5987 /* We need 'As' caps for the fscrypt context */
5988 lov.add_xlock(&cur->authlock);
5989 if (!mds->locker->acquire_locks(mdr, lov)) {
5990 return;
5991 }
5992
5993 /* encrypted directories can't have their layout changed */
5994 if (!cur->get_inode()->fscrypt_auth.empty()) {
5995 respond_to_request(mdr, -CEPHFS_EINVAL);
5996 return;
5997 }
5998
5999 file_layout_t layout;
6000 if (cur->get_projected_inode()->has_layout())
6001 layout = cur->get_projected_inode()->layout;
6002 else if (mdr->dir_layout != file_layout_t())
6003 layout = mdr->dir_layout;
6004 else
6005 layout = mdcache->default_file_layout;
6006
6007 rest = name.substr(name.find("layout"));
6008 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
6009 return;
6010
6011 auto pi = cur->project_inode(mdr);
6012 pi.inode->layout = layout;
b32b8144 6013 mdr->no_early_reply = true;
f67539c2 6014 pip = pi.inode.get();
6015 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
6016 if (!cur->is_file()) {
f67539c2 6017 respond_to_request(mdr, -CEPHFS_EINVAL);
6018 return;
6019 }
6020 if (cur->get_projected_inode()->size ||
6021 cur->get_projected_inode()->truncate_seq > 1) {
f67539c2 6022 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
6023 return;
6024 }
6025 file_layout_t layout = cur->get_projected_inode()->layout;
6026 rest = name.substr(name.find("layout"));
6027 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
6028 return;
6029
6030 lov.add_xlock(&cur->filelock);
6031 if (!mds->locker->acquire_locks(mdr, lov))
6032 return;
6033
6034 /* encrypted files can't have their layout changed */
6035 if (!cur->get_inode()->fscrypt_auth.empty()) {
6036 respond_to_request(mdr, -CEPHFS_EINVAL);
6037 return;
6038 }
6039
6040 auto pi = cur->project_inode(mdr);
6041 int64_t old_pool = pi.inode->layout.pool_id;
6042 pi.inode->add_old_pool(old_pool);
6043 pi.inode->layout = layout;
6044 pip = pi.inode.get();
7c673cae 6045 } else if (name.compare(0, 10, "ceph.quota") == 0) {
6046 if (!cur->is_dir()) {
6047 respond_to_request(mdr, -CEPHFS_EINVAL);
6048 return;
6049 }
6050
6051 quota_info_t quota = cur->get_projected_inode()->quota;
6052
6053 rest = name.substr(name.find("quota"));
6054 int r = parse_quota_vxattr(rest, value, &quota);
6055 if (r < 0) {
6056 respond_to_request(mdr, r);
6057 return;
6058 }
6059
1e59de90 6060 if (quota.is_enabled() && !cur->get_projected_srnode())
6061 adjust_realm = true;
6062
6063 if (!xlock_policylock(mdr, cur, false, adjust_realm))
6064 return;
11fdf7f2 6065
6066 if (cur->get_projected_inode()->quota == quota) {
6067 respond_to_request(mdr, 0);
7c673cae 6068 return;
adb31ebb 6069 }
7c673cae 6070
6071 auto pi = cur->project_inode(mdr, false, adjust_realm);
6072 pi.inode->quota = quota;
94b18763 6073
6074 if (adjust_realm)
6075 pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
6076
b32b8144 6077 mdr->no_early_reply = true;
f67539c2 6078 pip = pi.inode.get();
6079
6080 client_t exclude_ct = mdr->get_client();
a8e16298 6081 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
6082 } else if (name == "ceph.dir.subvolume"sv) {
6083 if (!cur->is_dir()) {
f67539c2 6084 respond_to_request(mdr, -CEPHFS_EINVAL);
6085 return;
6086 }
6087
6088 bool val;
6089 try {
6090 val = boost::lexical_cast<bool>(value);
6091 } catch (boost::bad_lexical_cast const&) {
6092 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
f67539c2 6093 respond_to_request(mdr, -CEPHFS_EINVAL);
6094 return;
6095 }
6096
6097 /* Verify it's not already a subvolume, using the lighter-weight
6098 * rdlock first.
6099 */
6100 if (!mdr->more()->rdonly_checks) {
6101 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6102 lov.add_rdlock(&cur->snaplock);
6103 if (!mds->locker->acquire_locks(mdr, lov))
6104 return;
6105 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6106 }
6107 const auto srnode = cur->get_projected_srnode();
6108 if (val == (srnode && srnode->is_subvolume())) {
6109 dout(20) << "already marked subvolume" << dendl;
6110 respond_to_request(mdr, 0);
6111 return;
6112 }
6113 mdr->more()->rdonly_checks = true;
6114 }
6115
6116 if ((mdr->locking_state & MutationImpl::ALL_LOCKED) && !mdr->is_xlocked(&cur->snaplock)) {
6117 /* drop the rdlock and acquire xlocks */
6118 dout(20) << "dropping rdlocks" << dendl;
6119 mds->locker->drop_locks(mdr.get());
6120 if (!xlock_policylock(mdr, cur, false, true))
6121 return;
6122 }
adb31ebb 6123
b3b6e05e 6124 /* repeat the rdonly checks in case things changed between dropping the rdlock and taking the xlock */
6125 SnapRealm *realm = cur->find_snaprealm();
6126 if (val) {
6127 inodeno_t subvol_ino = realm->get_subvolume_ino();
6128 // can't create subvolume inside another subvolume
6129 if (subvol_ino && subvol_ino != cur->ino()) {
f67539c2 6130 respond_to_request(mdr, -CEPHFS_EINVAL);
6131 return;
6132 }
6133 }
6134
6135 const auto srnode = cur->get_projected_srnode();
6136 if (val == (srnode && srnode->is_subvolume())) {
6137 respond_to_request(mdr, 0);
6138 return;
6139 }
6140
f67539c2 6141 auto pi = cur->project_inode(mdr, false, true);
6142 if (!srnode)
6143 pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
6144 if (val)
6145 pi.snapnode->mark_subvolume();
6146 else
6147 pi.snapnode->clear_subvolume();
6148
6149 mdr->no_early_reply = true;
f67539c2 6150 pip = pi.inode.get();
adb31ebb 6151 adjust_realm = true;
f6b5b4d7 6152 } else if (name == "ceph.dir.pin"sv) {
7c673cae 6153 if (!cur->is_dir() || cur->is_root()) {
f67539c2 6154 respond_to_request(mdr, -CEPHFS_EINVAL);
6155 return;
6156 }
6157
6158 mds_rank_t rank;
6159 try {
6160 rank = boost::lexical_cast<mds_rank_t>(value);
6161 if (rank < 0) rank = MDS_RANK_NONE;
6162 else if (rank >= MAX_MDS) {
6163 respond_to_request(mdr, -CEPHFS_EDOM);
6164 return;
6165 }
6166 } catch (boost::bad_lexical_cast const&) {
6167 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
f67539c2 6168 respond_to_request(mdr, -CEPHFS_EINVAL);
6169 return;
6170 }
6171
9f95a23c 6172 if (!xlock_policylock(mdr, cur))
6173 return;
6174
f67539c2 6175 auto pi = cur->project_inode(mdr);
7c673cae 6176 cur->set_export_pin(rank);
f67539c2 6177 pip = pi.inode.get();
6178 } else if (name == "ceph.dir.pin.random"sv) {
6179 if (!cur->is_dir() || cur->is_root()) {
f67539c2 6180 respond_to_request(mdr, -CEPHFS_EINVAL);
6181 return;
6182 }
6183
6184 double val;
6185 try {
6186 val = boost::lexical_cast<double>(value);
6187 } catch (boost::bad_lexical_cast const&) {
6188 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
f67539c2 6189 respond_to_request(mdr, -CEPHFS_EINVAL);
6190 return;
6191 }
6192
6193 if (val < 0.0 || 1.0 < val) {
f67539c2 6194 respond_to_request(mdr, -CEPHFS_EDOM);
6195 return;
6196 } else if (mdcache->export_ephemeral_random_max < val) {
f67539c2 6197 respond_to_request(mdr, -CEPHFS_EINVAL);
6198 return;
6199 }
6200
6201 if (!xlock_policylock(mdr, cur))
6202 return;
6203
f67539c2 6204 auto pi = cur->project_inode(mdr);
f6b5b4d7 6205 cur->setxattr_ephemeral_rand(val);
f67539c2 6206 pip = pi.inode.get();
6207 } else if (name == "ceph.dir.pin.distributed"sv) {
6208 if (!cur->is_dir() || cur->is_root()) {
f67539c2 6209 respond_to_request(mdr, -CEPHFS_EINVAL);
6210 return;
6211 }
6212
6213 bool val;
6214 try {
6215 val = boost::lexical_cast<bool>(value);
6216 } catch (boost::bad_lexical_cast const&) {
6217 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
f67539c2 6218 respond_to_request(mdr, -CEPHFS_EINVAL);
6219 return;
6220 }
6221
6222 if (!xlock_policylock(mdr, cur))
6223 return;
6224
f67539c2 6225 auto pi = cur->project_inode(mdr);
f6b5b4d7 6226 cur->setxattr_ephemeral_dist(val);
f67539c2 6227 pip = pi.inode.get();
6228 } else {
6229 dout(10) << " unknown vxattr " << name << dendl;
f67539c2 6230 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
6231 return;
6232 }
6233
94b18763 6234 pip->change_attr++;
6235 pip->ctime = mdr->get_op_stamp();
6236 if (mdr->get_op_stamp() > pip->rstat.rctime)
6237 pip->rstat.rctime = mdr->get_op_stamp();
94b18763 6238 pip->version = cur->pre_dirty();
7c673cae 6239 if (cur->is_file())
94b18763 6240 pip->update_backtrace();
6241
6242 // log + wait
6243 mdr->ls = mdlog->get_current_segment();
6244 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
6245 mdlog->start_entry(le);
6246 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6247 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6248 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6249
11fdf7f2 6250 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
adb31ebb 6251 false, false, adjust_realm));
6252 return;
6253}
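// For reference, these vxattrs are normally driven from a client mount via
// the documented setfattr interface (hypothetical paths and values):
//   setfattr -n ceph.dir.layout.pool -v cephfs_data2 /mnt/cephfs/dir
//   setfattr -n ceph.quota.max_bytes -v 1073741824   /mnt/cephfs/dir
//   setfattr -n ceph.dir.pin         -v 1            /mnt/cephfs/dir
//   setfattr -n ceph.dir.subvolume   -v 1            /mnt/cephfs/dir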
6254
9f95a23c 6255void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
7c673cae 6256{
9f95a23c 6257 const cref_t<MClientRequest> &req = mdr->client_request;
6258 string name(req->get_path2());
6259
6260 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
6261
6262 if (name == "ceph.dir.layout") {
6263 if (!cur->is_dir()) {
f67539c2 6264 respond_to_request(mdr, -CEPHFS_ENODATA);
6265 return;
6266 }
6267 if (cur->is_root()) {
6268 dout(10) << "can't remove layout policy on the root directory" << dendl;
f67539c2 6269 respond_to_request(mdr, -CEPHFS_EINVAL);
6270 return;
6271 }
6272
6273 if (!cur->get_projected_inode()->has_layout()) {
f67539c2 6274 respond_to_request(mdr, -CEPHFS_ENODATA);
6275 return;
6276 }
6277
9f95a23c 6278 MutationImpl::LockOpVec lov;
6279 lov.add_xlock(&cur->policylock);
6280 if (!mds->locker->acquire_locks(mdr, lov))
6281 return;
6282
6283 auto pi = cur->project_inode(mdr);
6284 pi.inode->clear_layout();
6285 pi.inode->version = cur->pre_dirty();
6286
6287 // log + wait
6288 mdr->ls = mdlog->get_current_segment();
6289 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
6290 mdlog->start_entry(le);
6291 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6292 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6293 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6294
b32b8144 6295 mdr->no_early_reply = true;
6296 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6297 return;
6298 } else if (name == "ceph.dir.layout.pool_namespace"
6299 || name == "ceph.file.layout.pool_namespace") {
6300 // Namespace is the only layout field that has a meaningful
6301 // null/none value (an empty string means the default layout). Removal
6302 // is equivalent to a setxattr with an empty string: pass through the
6303 // empty payload of the rmxattr request to do this.
9f95a23c 6304 handle_set_vxattr(mdr, cur);
6305 return;
6306 }
6307
f67539c2 6308 respond_to_request(mdr, -CEPHFS_ENODATA);
6309}
6310
6311const Server::XattrHandler Server::xattr_handlers[] = {
6312 {
6313 xattr_name: Server::DEFAULT_HANDLER,
6314 description: "default xattr handler",
6315 validate: &Server::default_xattr_validate,
6316 setxattr: &Server::default_setxattr_handler,
6317 removexattr: &Server::default_removexattr_handler,
6318 },
6319 {
6320 xattr_name: "ceph.mirror.info",
6321 description: "mirror info xattr handler",
6322 validate: &Server::mirror_info_xattr_validate,
6323 setxattr: &Server::mirror_info_setxattr_handler,
6324 removexattr: &Server::mirror_info_removexattr_handler
6325 },
6326};
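// Sketch of how another handler would be registered (hypothetical names, not
// present in the tree): add one designated-initializer entry to the table
// above, and get_xattr_or_default_handler() below finds it by exact name.
#if 0
  {
    xattr_name: "ceph.example",                      // hypothetical vxattr
    description: "example xattr handler",
    validate: &Server::example_xattr_validate,       // hypothetical members
    setxattr: &Server::example_setxattr_handler,
    removexattr: &Server::example_removexattr_handler,
  },
#endif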
7c673cae 6327
6328const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
6329 const XattrHandler *default_xattr_handler = nullptr;
7c673cae 6330
6331 for (auto &handler : xattr_handlers) {
6332 if (handler.xattr_name == Server::DEFAULT_HANDLER) {
6333 ceph_assert(default_xattr_handler == nullptr);
6334 default_xattr_handler = &handler;
6335 }
6336 if (handler.xattr_name == xattr_name) {
6337 dout(20) << "handler=" << handler.description << dendl;
6338 return &handler;
6339 }
6340 }
7c673cae 6341
6342 ceph_assert(default_xattr_handler != nullptr);
6343 dout(20) << "handler=" << default_xattr_handler->description << dendl;
6344 return default_xattr_handler;
6345}
7c673cae 6346
6347int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6348 const std::string &xattr_name, int op, int flags) {
6349 if (op == CEPH_MDS_OP_SETXATTR) {
6350 if (xattrs) {
6351 if ((flags & CEPH_XATTR_CREATE) && xattrs->count(mempool::mds_co::string(xattr_name))) {
6352 dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
6353 return -CEPHFS_EEXIST;
6354 }
6355 }
6356 if ((flags & CEPH_XATTR_REPLACE) && !(xattrs && xattrs->count(mempool::mds_co::string(xattr_name)))) {
6357 dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
6358 return -CEPHFS_ENODATA;
6359 }
6360
6361 return 0;
7c673cae 6362 }
6363
6364 if (op == CEPH_MDS_OP_RMXATTR) {
6365 if (!xattrs || xattrs->count(mempool::mds_co::string(xattr_name)) == 0) {
6366 dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
6367 return -CEPHFS_ENODATA;
6368 }
6369
6370 return 0;
6371 }
6372
6373 derr << ": unhandled validation for: " << xattr_name << dendl;
6374 return -CEPHFS_EINVAL;
6375}
6376
6377void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
6378 const bufferlist &xattr_value) {
6379 size_t len = xattr_value.length();
6380 bufferptr b = buffer::create(len);
6381 if (len) {
6382 xattr_value.begin().copy(len, b.c_str());
6383 }
6384 auto em = xattrs->emplace(std::piecewise_construct,
6385 std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
6386 std::forward_as_tuple(b));
6387 if (!em.second) {
6388 em.first->second = b;
6389 }
6390}
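// The emplace-then-assign above is the usual insert-or-overwrite idiom; a
// minimal stand-alone model (standard library only, for illustration):
#if 0
std::map<std::string, std::string> xs;
auto em = xs.emplace("user.key", "v1");   // inserts; em.second == true
auto em2 = xs.emplace("user.key", "v2");  // key exists; em2.second == false
if (!em2.second)
  em2.first->second = "v2";               // overwrite in place, as above
#endif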
6391
6392void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
6393 xattrs->erase(mempool::mds_co::string(xattr_name));
6394}
6395
6396int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6397 XattrOp *xattr_op) {
6398 return xattr_validate(cur, xattrs, xattr_op->xattr_name, xattr_op->op, xattr_op->flags);
6399}
6400
6401void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6402 const XattrOp &xattr_op) {
6403 xattr_set(xattrs, xattr_op.xattr_name, xattr_op.xattr_value);
6404}
6405
6406void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6407 const XattrOp &xattr_op) {
6408 xattr_rm(xattrs, xattr_op.xattr_name);
6409}
6410
6411// mirror info xattr handlers
6412const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX = "^cluster_id=([a-f0-9]{8}-" \
6413 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6414 "[a-f0-9]{4}-[a-f0-9]{12})" \
6415 " fs_id=(\\d+)$";
6416const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
6417const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
6418int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
6419 std::string &cluster_id, std::string &fs_id) {
6420 dout(20) << "parsing name=" << name << ", value=" << value << dendl;
6421
6422 static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
6423 std::smatch match;
6424
6425 std::regex_search(value, match, regex);
6426 if (match.size() != 3) {
6427 derr << "mirror info parse error" << dendl;
6428 return -CEPHFS_EINVAL;
6429 }
6430
6431 cluster_id = match[1];
6432 fs_id = match[2];
6433 dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
6434 return 0;
6435}
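// A value accepted by MIRROR_INFO_REGEX looks like (hypothetical ids):
//   "cluster_id=3b9e63aa-2e68-4b70-9c9b-6b8e5f6d7a01 fs_id=2"
// after which match[1] is the peer cluster UUID and match[2] the fs id.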
6436
6437int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6438 XattrOp *xattr_op) {
6439 if (!cur->is_root()) {
6440 return -CEPHFS_EINVAL;
6441 }
6442
6443 int v1 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, xattr_op->op, xattr_op->flags);
6444 int v2 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, xattr_op->op, xattr_op->flags);
6445 if (v1 != v2) {
6446 derr << "inconsistent mirror info state (" << v1 << "," << v2 << ")" << dendl;
6447 return -CEPHFS_EINVAL;
6448 }
6449
6450 if (v1 < 0) {
6451 return v1;
6452 }
6453
6454 if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
6455 return 0;
6456 }
6457
6458 std::string cluster_id;
6459 std::string fs_id;
6460 int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
6461 cluster_id, fs_id);
6462 if (r < 0) {
6463 return r;
6464 }
6465
6466 xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
6467 return 0;
6468}
6469
6470void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6471 const XattrOp &xattr_op) {
6472 auto mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
6473
6474 bufferlist bl;
6475 bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
6476 xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
6477
6478 bl.clear();
6479 bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
6480 xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
6481}
6482
6483void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6484 const XattrOp &xattr_op) {
6485 xattr_rm(xattrs, Server::MirrorXattrInfo::CLUSTER_ID);
6486 xattr_rm(xattrs, Server::MirrorXattrInfo::FS_ID);
6487}
6488
6489void Server::handle_client_setxattr(MDRequestRef& mdr)
6490{
9f95a23c 6491 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 6492 string name(req->get_path2());
7c673cae 6493
6494 // is a ceph virtual xattr?
6495 if (is_ceph_vxattr(name)) {
6496 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6497 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6498 if (!cur)
6499 return;
6500
6501 handle_set_vxattr(mdr, cur);
6502 return;
6503 }
6504
6505 if (!is_allowed_ceph_xattr(name)) {
6506 respond_to_request(mdr, -CEPHFS_EINVAL);
6507 return;
6508 }
6509
9f95a23c 6510 CInode *cur = rdlock_path_pin_ref(mdr, true);
6511 if (!cur)
6512 return;
6513
6514 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 6515 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
6516 return;
6517 }
6518
6519 int flags = req->head.args.setxattr.flags;
6520
9f95a23c 6521 MutationImpl::LockOpVec lov;
6522 lov.add_xlock(&cur->xattrlock);
6523 if (!mds->locker->acquire_locks(mdr, lov))
6524 return;
6525
6526 if (!check_access(mdr, cur, MAY_WRITE))
6527 return;
6528
6529 size_t len = req->get_data().length();
6530 size_t inc = len + name.length();
6531
6532 auto handler = Server::get_xattr_or_default_handler(name);
6533 const auto& pxattrs = cur->get_projected_xattrs();
6534 if (pxattrs) {
6535 // check xattrs kv pairs size
6536 size_t cur_xattrs_size = 0;
6537 for (const auto& p : *pxattrs) {
6538 if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
6539 continue;
6540 }
6541 cur_xattrs_size += p.first.length() + p.second.length();
7c673cae 6542 }
7c673cae 6543
6544 if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
6545 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6546 << cur_xattrs_size << ", inc " << inc << dendl;
6547 respond_to_request(mdr, -CEPHFS_ENOSPC);
6548 return;
6549 }
6550 }
6551
6552 XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
6553 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6554 if (r < 0) {
6555 respond_to_request(mdr, r);
6556 return;
6557 }
6558
6559 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
6560
6561 // project update
6562 auto pi = cur->project_inode(mdr, true);
6563 pi.inode->version = cur->pre_dirty();
6564 pi.inode->ctime = mdr->get_op_stamp();
6565 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6566 pi.inode->rstat.rctime = mdr->get_op_stamp();
6567 pi.inode->change_attr++;
6568 pi.inode->xattr_version++;
6569
94b18763 6570 if ((flags & CEPH_XATTR_REMOVE)) {
f67539c2 6571 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
94b18763 6572 } else {
f67539c2 6573 std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
6574 }
6575
6576 // log + wait
6577 mdr->ls = mdlog->get_current_segment();
6578 EUpdate *le = new EUpdate(mdlog, "setxattr");
6579 mdlog->start_entry(le);
6580 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6581 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6582 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6583
6584 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6585}
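// Worked example of the size check above, assuming the default 64 KiB
// mds_max_xattr_pairs_size: with existing pairs totalling 60 KiB, a new
// 8 KiB name+value is rejected with CEPHFS_ENOSPC (68 KiB > 64 KiB), unless
// CEPH_XATTR_REPLACE is set and the pair being replaced is excluded from
// cur_xattrs_size, bringing the total back under the limit.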
6586
6587void Server::handle_client_removexattr(MDRequestRef& mdr)
6588{
9f95a23c 6589 const cref_t<MClientRequest> &req = mdr->client_request;
94b18763 6590 std::string name(req->get_path2());
11fdf7f2 6591
6592 // is a ceph virtual xattr?
6593 if (is_ceph_vxattr(name)) {
6594 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6595 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6596 if (!cur)
6597 return;
6598
6599 handle_remove_vxattr(mdr, cur);
6600 return;
6601 }
6602
6603 if (!is_allowed_ceph_xattr(name)) {
6604 respond_to_request(mdr, -CEPHFS_EINVAL);
6605 return;
6606 }
6607
9f95a23c 6608 CInode* cur = rdlock_path_pin_ref(mdr, true);
6609 if (!cur)
6610 return;
6611
6612 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 6613 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
6614 return;
6615 }
6616
9f95a23c 6617 MutationImpl::LockOpVec lov;
6618 lov.add_xlock(&cur->xattrlock);
6619 if (!mds->locker->acquire_locks(mdr, lov))
6620 return;
6621
6622
6623 auto handler = Server::get_xattr_or_default_handler(name);
6624 bufferlist bl;
6625 XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
6626
6627 const auto& pxattrs = cur->get_projected_xattrs();
6628 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6629 if (r < 0) {
6630 respond_to_request(mdr, r);
6631 return;
6632 }
6633
6634 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
6635
6636 // project update
6637 auto pi = cur->project_inode(mdr, true);
6638 pi.inode->version = cur->pre_dirty();
6639 pi.inode->ctime = mdr->get_op_stamp();
6640 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6641 pi.inode->rstat.rctime = mdr->get_op_stamp();
6642 pi.inode->change_attr++;
6643 pi.inode->xattr_version++;
6644 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6645
6646 // log + wait
6647 mdr->ls = mdlog->get_current_segment();
6648 EUpdate *le = new EUpdate(mdlog, "removexattr");
6649 mdlog->start_entry(le);
6650 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6651 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6652 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6653
6654 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6655}
6656
6657void Server::handle_client_getvxattr(MDRequestRef& mdr)
6658{
6659 const auto& req = mdr->client_request;
6660 string xattr_name{req->get_path2()};
6661
6662 // is a ceph virtual xattr?
6663 if (!is_ceph_vxattr(xattr_name)) {
6664 respond_to_request(mdr, -CEPHFS_ENODATA);
6665 return;
6666 }
6667
6668 CInode *cur = rdlock_path_pin_ref(mdr, true, false);
6669 if (!cur) {
6670 return;
6671 }
6672
6673 if (is_ceph_dir_vxattr(xattr_name)) {
6674 if (!cur->is_dir()) {
6675 respond_to_request(mdr, -CEPHFS_ENODATA);
6676 return;
6677 }
6678 } else if (is_ceph_file_vxattr(xattr_name)) {
6679 if (cur->is_dir()) {
6680 respond_to_request(mdr, -CEPHFS_ENODATA);
6681 return;
6682 }
6683 }
6684
6685 CachedStackStringStream css;
6686 int r = 0;
6687 ceph::bufferlist bl;
6688 // handle these vxattrs
6689 if ((xattr_name.substr(0, 15) == "ceph.dir.layout"sv) ||
6690 (xattr_name.substr(0, 16) == "ceph.file.layout"sv)) {
6691 std::string layout_field;
6692
6693 struct layout_xattr_info_t {
6694 enum class InheritanceStatus : uint32_t {
6695 DEFAULT = 0,
6696 SET = 1,
6697 INHERITED = 2
6698 };
6699
6700 const file_layout_t layout;
6701 const InheritanceStatus status;
6702
6703 layout_xattr_info_t(const file_layout_t& l, InheritanceStatus inh)
6704 : layout(l), status(inh) { }
6705
6706 static std::string status_to_string(InheritanceStatus status) {
6707 switch (status) {
6708 case InheritanceStatus::DEFAULT: return "default"s;
6709 case InheritanceStatus::SET: return "set"s;
6710 case InheritanceStatus::INHERITED: return "inherited"s;
6711 default: return "unknown"s;
6712 }
6713 }
6714 };
6715
6716 auto is_default_layout = [&](const file_layout_t& layout) -> bool {
6717 return (layout == mdcache->default_file_layout);
6718 };
6719 auto get_inherited_layout = [&](CInode *cur) -> layout_xattr_info_t {
6720 auto orig_in = cur;
6721
6722 while (cur) {
6723 if (cur->get_projected_inode()->has_layout()) {
6724 auto& curr_layout = cur->get_projected_inode()->layout;
6725 if (is_default_layout(curr_layout)) {
6726 return {curr_layout, layout_xattr_info_t::InheritanceStatus::DEFAULT};
6727 }
6728 if (cur == orig_in) {
6729 // we've found a new layout at this inode
6730 return {curr_layout, layout_xattr_info_t::InheritanceStatus::SET};
6731 } else {
6732 return {curr_layout, layout_xattr_info_t::InheritanceStatus::INHERITED};
6733 }
6734 }
6735
6736 if (cur->is_root()) {
6737 break;
6738 }
6739
6740 cur = cur->get_projected_parent_dir()->get_inode();
6741 }
6742 mds->clog->error() << "no layout found at root dir!";
6743 ceph_abort("no layout found at root dir! something is really messed up with layouts!");
6744 };
6745
6746 if (xattr_name == "ceph.dir.layout.json"sv ||
6747 xattr_name == "ceph.file.layout.json"sv) {
6748 // fetch layout only for valid xattr_name
6749 const auto lxi = get_inherited_layout(cur);
6750
6751 *css << "{\"stripe_unit\": " << lxi.layout.stripe_unit
6752 << ", \"stripe_count\": " << lxi.layout.stripe_count
6753 << ", \"object_size\": " << lxi.layout.object_size
6754 << ", \"pool_name\": ";
6755 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6756 *css << "\"";
6757 if (o.have_pg_pool(lxi.layout.pool_id)) {
6758 *css << o.get_pool_name(lxi.layout.pool_id);
6759 }
6760 *css << "\"";
6761 });
6762 *css << ", \"pool_id\": " << (uint64_t)lxi.layout.pool_id;
6763 *css << ", \"pool_namespace\": \"" << lxi.layout.pool_ns << "\"";
6764 *css << ", \"inheritance\": \"@"
6765 << layout_xattr_info_t::status_to_string(lxi.status) << "\"}";
6766 } else if ((xattr_name == "ceph.dir.layout.pool_name"sv) ||
6767 (xattr_name == "ceph.file.layout.pool_name"sv)) {
6768 // fetch layout only for valid xattr_name
6769 const auto lxi = get_inherited_layout(cur);
6770 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6771 if (o.have_pg_pool(lxi.layout.pool_id)) {
6772 *css << o.get_pool_name(lxi.layout.pool_id);
6773 }
6774 });
6775 } else if ((xattr_name == "ceph.dir.layout.pool_id"sv) ||
6776 (xattr_name == "ceph.file.layout.pool_id"sv)) {
6777 // fetch layout only for valid xattr_name
6778 const auto lxi = get_inherited_layout(cur);
6779 *css << (uint64_t)lxi.layout.pool_id;
6780 } else {
6781 r = -CEPHFS_ENODATA; // no such attribute
6782 }
6783 } else if (xattr_name.substr(0, 12) == "ceph.dir.pin"sv) {
6784 if (xattr_name == "ceph.dir.pin"sv) {
6785 *css << cur->get_projected_inode()->export_pin;
6786 } else if (xattr_name == "ceph.dir.pin.random"sv) {
6787 *css << cur->get_projected_inode()->export_ephemeral_random_pin;
6788 } else if (xattr_name == "ceph.dir.pin.distributed"sv) {
6789 *css << cur->get_projected_inode()->export_ephemeral_distributed_pin;
6790 } else {
6791 // otherwise respond as invalid request
6792 // since we only handle ceph vxattrs here
6793 r = -CEPHFS_ENODATA; // no such attribute
6794 }
6795 } else {
6796 // otherwise respond as invalid request
6797 // since we only handle ceph vxattrs here
6798 r = -CEPHFS_ENODATA; // no such attribute
6799 }
6800
6801 if (r == 0) {
6802 ENCODE_START(1, 1, bl);
6803 encode(css->strv(), bl);
6804 ENCODE_FINISH(bl);
6805 mdr->reply_extra_bl = bl;
6806 }
6807
6808 respond_to_request(mdr, r);
6809}
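// For reference, a ceph.dir.layout.json reply body built above looks like
// (hypothetical values):
//   {"stripe_unit": 4194304, "stripe_count": 1, "object_size": 4194304,
//    "pool_name": "cephfs_data", "pool_id": 2, "pool_namespace": "",
//    "inheritance": "@inherited"}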
6810
6811// =================================================================
6812// DIRECTORY and NAMESPACE OPS
6813
6814
6815// ------------------------------------------------
6816
6817// MKNOD
6818
6819class C_MDS_mknod_finish : public ServerLogContext {
6820 CDentry *dn;
6821 CInode *newi;
6822public:
6823 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
6824 ServerLogContext(s, r), dn(d), newi(ni) {}
6825 void finish(int r) override {
11fdf7f2 6826 ceph_assert(r == 0);
7c673cae 6827
6828 // crash current MDS and the replacing MDS will test the journal
6829 ceph_assert(!g_conf()->mds_kill_skip_replaying_inotable);
6830
6831 // link the inode
6832 dn->pop_projected_linkage();
6833
6834 // be a bit hacky with the inode version here.. we decrement it
6835 // just to keep mark_dirty() happy. (we didn't bother projecting
6836 // a new version of the inode since it's just been created)
f67539c2 6837 newi->mark_dirty(mdr->ls);
28e407b8 6838 newi->mark_dirty_parent(mdr->ls, true);
6839
6840 // mkdir?
f67539c2 6841 if (newi->is_dir()) {
7c673cae 6842 CDir *dir = newi->get_dirfrag(frag_t());
11fdf7f2 6843 ceph_assert(dir);
f67539c2 6844 dir->mark_dirty(mdr->ls);
6845 dir->mark_new(mdr->ls);
6846 }
6847
6848 mdr->apply();
6849
6850 MDRequestRef null_ref;
6851 get_mds()->mdcache->send_dentry_link(dn, null_ref);
6852
f67539c2 6853 if (newi->is_file()) {
7c673cae 6854 get_mds()->locker->share_inode_max_size(newi);
f67539c2 6855 } else if (newi->is_dir()) {
f6b5b4d7 6856 // We do this now so that the linkages on the new directory are stable.
f67539c2 6857 newi->maybe_ephemeral_rand();
f6b5b4d7 6858 }
6859
6860 // hit pop
11fdf7f2 6861 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
6862
6863 // reply
6864 server->respond_to_request(mdr, 0);
6865 }
6866};
6867
6868
6869void Server::handle_client_mknod(MDRequestRef& mdr)
6870{
9f95a23c 6871 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 6872 client_t client = mdr->get_client();
6873
6874 unsigned mode = req->head.args.mknod.mode;
6875 if ((mode & S_IFMT) == 0)
6876 mode |= S_IFREG;
6877
6878 mdr->disable_lock_cache();
1e59de90 6879 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, false, S_ISREG(mode));
9f95a23c 6880 if (!dn)
6881 return;
6882
6883 CDir *dir = dn->get_dir();
6884 CInode *diri = dir->get_inode();
6885 if (!check_access(mdr, diri, MAY_WRITE))
6886 return;
20effc67
TL
6887 if (!check_fragment_space(mdr, dir))
6888 return;
6889 if (!check_dir_max_entries(mdr, dir))
6890 return;
6891
6892 ceph_assert(dn->get_projected_linkage()->is_null());
6893 if (req->get_alternate_name().size() > alternate_name_max) {
6894 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6895 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6896 return;
6897 }
6898 dn->set_alternate_name(req->get_alternate_name());
6899
6900 // set layout
6901 file_layout_t layout;
6902 if (mdr->dir_layout != file_layout_t())
6903 layout = mdr->dir_layout;
6904 else
6905 layout = mdcache->default_file_layout;
6906
6907 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6908 ceph_assert(newi);
6909
6910 dn->push_projected_linkage(newi);
6911
6912 auto _inode = newi->_get_inode();
6913 _inode->version = dn->pre_dirty();
6914 _inode->rdev = req->head.args.mknod.rdev;
6915 _inode->rstat.rfiles = 1;
6916 _inode->accounted_rstat = _inode->rstat;
7c673cae 6917 if (layout.pool_id != mdcache->default_file_layout.pool_id)
6918 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
6919 _inode->update_backtrace();
7c673cae 6920
6921 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6922 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6923 ceph_assert(follows >= realm->get_newest_seq());
6924
6925 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6926 // want to write to it (e.g., if they are reexporting NFS)
f67539c2 6927 if (S_ISREG(_inode->mode)) {
6928 // issue a cap on the file
6929 int cmode = CEPH_FILE_MODE_RDWR;
9f95a23c 6930 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
7c673cae
FG
6931 if (cap) {
6932 cap->set_wanted(0);
6933
6934 // put locks in excl mode
6935 newi->filelock.set_state(LOCK_EXCL);
6936 newi->authlock.set_state(LOCK_EXCL);
6937 newi->xattrlock.set_state(LOCK_EXCL);
6938
6939 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
6940 _inode->client_ranges[client].range.first = 0;
6941 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
6942 _inode->client_ranges[client].follows = follows;
f91f0fd5 6943 newi->mark_clientwriteable();
a8e16298 6944 cap->mark_clientwriteable();
6945 }
6946 }
6947
11fdf7f2 6948 ceph_assert(dn->first == follows + 1);
6949 newi->first = dn->first;
6950
f67539c2 6951 dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
6952
6953 // prepare finisher
6954 mdr->ls = mdlog->get_current_segment();
6955 EUpdate *le = new EUpdate(mdlog, "mknod");
6956 mdlog->start_entry(le);
6957 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6958 journal_allocated_inos(mdr, &le->metablob);
6959
6960 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6961 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6962 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6963
6964 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
92f5a8d4 6965 mds->balancer->maybe_fragment(dn->get_dir(), false);
6966}
6967
6968
6969
6970// MKDIR
6971/* This function takes responsibility for the passed mdr*/
6972void Server::handle_client_mkdir(MDRequestRef& mdr)
6973{
9f95a23c 6974 const cref_t<MClientRequest> &req = mdr->client_request;
91327a77 6975
6976 mdr->disable_lock_cache();
6977 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6978 if (!dn)
7c673cae 6979 return;
9f95a23c 6980
6981 CDir *dir = dn->get_dir();
6982 CInode *diri = dir->get_inode();
6983
6984 // mkdir check access
6985 if (!check_access(mdr, diri, MAY_WRITE))
6986 return;
6987
6988 if (!check_fragment_space(mdr, dir))
6989 return;
6990 if (!check_dir_max_entries(mdr, dir))
6991 return;
7c673cae 6992
6993 ceph_assert(dn->get_projected_linkage()->is_null());
6994 if (req->get_alternate_name().size() > alternate_name_max) {
6995 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6996 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6997 return;
6998 }
6999 dn->set_alternate_name(req->get_alternate_name());
7000
7c673cae 7001 // new inode
7002 unsigned mode = req->head.args.mkdir.mode;
7003 mode &= ~S_IFMT;
7004 mode |= S_IFDIR;
9f95a23c 7005 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
11fdf7f2 7006 ceph_assert(newi);
7007
7008 // it's a directory.
7009 dn->push_projected_linkage(newi);
7010
7011 auto _inode = newi->_get_inode();
7012 _inode->version = dn->pre_dirty();
7013 _inode->rstat.rsubdirs = 1;
7014 _inode->accounted_rstat = _inode->rstat;
7015 _inode->update_backtrace();
7c673cae 7016
7017 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
7018 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
7019 ceph_assert(follows >= realm->get_newest_seq());
7020
7c673cae 7021 dout(12) << " follows " << follows << dendl;
11fdf7f2 7022 ceph_assert(dn->first == follows + 1);
7023 newi->first = dn->first;
7024
7025 // ...and that new dir is empty.
7026 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
7027 newdir->state_set(CDir::STATE_CREATING);
7028 newdir->mark_complete();
f67539c2 7029 newdir->_get_fnode()->version = newdir->pre_dirty();
7030
7031 // prepare finisher
7032 mdr->ls = mdlog->get_current_segment();
7033 EUpdate *le = new EUpdate(mdlog, "mkdir");
7034 mdlog->start_entry(le);
7035 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
7036 journal_allocated_inos(mdr, &le->metablob);
7037 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7038 le->metablob.add_primary_dentry(dn, newi, true, true);
7039 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
7040
7041 // issue a cap on the directory
7042 int cmode = CEPH_FILE_MODE_RDWR;
9f95a23c 7043 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
7044 if (cap) {
7045 cap->set_wanted(0);
7046
7047 // put locks in excl mode
7048 newi->filelock.set_state(LOCK_EXCL);
7049 newi->authlock.set_state(LOCK_EXCL);
7050 newi->xattrlock.set_state(LOCK_EXCL);
7051 }
7052
7053 // make sure this inode gets into the journal
7054 le->metablob.add_opened_ino(newi->ino());
7055
7056 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
7057
7058 // We hit_dir (via hit_inode) in our finish callback, but by then we might
7059 // have overshot the split size (multiple mkdir in flight), so here is
7060 // an early chance to split the dir if this mkdir makes it oversized.
7061 mds->balancer->maybe_fragment(dir, false);
7062}
7063
7064
7065// SYMLINK
7066
7067void Server::handle_client_symlink(MDRequestRef& mdr)
7068{
7069 const auto& req = mdr->client_request;
7070
7071 mdr->disable_lock_cache();
7072 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
7073 if (!dn)
7c673cae 7074 return;
9f95a23c 7075
7076 CDir *dir = dn->get_dir();
7077 CInode *diri = dir->get_inode();
7078
7079 if (!check_access(mdr, diri, MAY_WRITE))
9f95a23c 7080 return;
7081 if (!check_fragment_space(mdr, dir))
7082 return;
7083 if (!check_dir_max_entries(mdr, dir))
7084 return;
7c673cae 7085
7086 ceph_assert(dn->get_projected_linkage()->is_null());
7087 if (req->get_alternate_name().size() > alternate_name_max) {
7088 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
7089 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
return; // fix: mknod/mkdir/link all return here; falling through would reply a second time
7090 }
7091 dn->set_alternate_name(req->get_alternate_name());
9f95a23c 7092
7c673cae 7093 unsigned mode = S_IFLNK | 0777;
9f95a23c 7094 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
11fdf7f2 7095 ceph_assert(newi);
7096
7097 // it's a symlink
7098 dn->push_projected_linkage(newi);
7099
11fdf7f2 7100 newi->symlink = req->get_path2();
7101 auto _inode = newi->_get_inode();
7102 _inode->version = dn->pre_dirty();
7103 _inode->size = newi->symlink.length();
7104 _inode->rstat.rbytes = _inode->size;
7105 _inode->rstat.rfiles = 1;
7106 _inode->accounted_rstat = _inode->rstat;
7107 _inode->update_backtrace();
7108
7109 newi->first = dn->first;
7110
7111 // prepare finisher
7112 mdr->ls = mdlog->get_current_segment();
7113 EUpdate *le = new EUpdate(mdlog, "symlink");
7114 mdlog->start_entry(le);
7115 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
7116 journal_allocated_inos(mdr, &le->metablob);
7117 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7118 le->metablob.add_primary_dentry(dn, newi, true, true);
7119
7120 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
92f5a8d4 7121 mds->balancer->maybe_fragment(dir, false);
7122
7123 // flush the journal as soon as possible
7124 if (g_conf()->mds_kill_skip_replaying_inotable) {
7125 mdlog->flush();
7126 }
7127}
7128
7129
7130
7131
7132
7133// LINK
7134
7135void Server::handle_client_link(MDRequestRef& mdr)
7136{
9f95a23c 7137 const cref_t<MClientRequest> &req = mdr->client_request;
7138
7139 dout(7) << "handle_client_link " << req->get_filepath()
7140 << " to " << req->get_filepath2()
7141 << dendl;
7142
9f95a23c 7143 mdr->disable_lock_cache();
7c673cae 7144
7145 CDentry *destdn;
7146 CInode *targeti;
7147
7148 if (req->get_filepath2().depth() == 0) {
7149 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
7150 if (!targeti) {
f67539c2 7151 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
7152 inodeno_t ino = req->get_filepath2().get_ino();
7153 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
7154 return;
7155 }
7156 mdr->pin(targeti);
7157
7158 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
7159 CDentry *pdn = targeti->get_projected_parent_dn();
7160 if (!pdn) {
7161 dout(7) << "target has no parent dn, failing..." << dendl;
f67539c2 7162 respond_to_request(mdr, -CEPHFS_EINVAL);
7163 return;
7164 }
7165 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
7166 return;
7167 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
7168 }
7169
7170 destdn = rdlock_path_xlock_dentry(mdr, false);
7171 if (!destdn)
7172 return;
7173 } else {
7174 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
7175 destdn = ret.first;
7176 if (!destdn)
7177 return;
7178
7179 if (!destdn->get_projected_linkage()->is_null()) {
f67539c2 7180 respond_to_request(mdr, -CEPHFS_EEXIST);
7181 return;
7182 }
7183
7184 targeti = ret.second->get_projected_linkage()->get_inode();
7185 }
7186
7187 ceph_assert(destdn->get_projected_linkage()->is_null());
7188 if (req->get_alternate_name().size() > alternate_name_max) {
7189 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
7190 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
7191 return;
7192 }
7193 destdn->set_alternate_name(req->get_alternate_name());
7194
7195 if (targeti->is_dir()) {
7196 dout(7) << "target is a dir, failing..." << dendl;
f67539c2 7197 respond_to_request(mdr, -CEPHFS_EINVAL);
7198 return;
7199 }
7200
7201 CDir *dir = destdn->get_dir();
7202 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
7c673cae 7203 dout(7) << "target is " << *targeti << dendl;
7204
7205 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7206 MutationImpl::LockOpVec lov;
7207 lov.add_xlock(&targeti->snaplock);
7208 lov.add_xlock(&targeti->linklock);
7209
7210 if (!mds->locker->acquire_locks(mdr, lov))
181888fb 7211 return;
7c673cae 7212
7213 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7214 }
7c673cae 7215
7216 if (targeti->get_projected_inode()->nlink == 0) {
7217 dout(7) << "target has no link, failing..." << dendl;
f67539c2 7218 respond_to_request(mdr, -CEPHFS_ENOENT);
20effc67 7219 return;
9f95a23c 7220 }
7221
7222 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7223 if (!check_access(mdr, targeti, MAY_WRITE))
7224 return;
7225
7226 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
7227 return;
7228
7229 if (!check_fragment_space(mdr, dir))
7230 return;
7231
7232 if (!check_dir_max_entries(mdr, dir))
7233 return;
7234 }
7235
7236 CInode* target_pin = targeti->get_projected_parent_dir()->inode;
7237 SnapRealm *target_realm = target_pin->find_snaprealm();
7238 if (target_pin != dir->inode &&
7239 target_realm->get_subvolume_ino() !=
7240 dir->inode->find_snaprealm()->get_subvolume_ino() &&
7241 /* The inode is temporarily located in the stray dir pending reintegration */
7242 !target_pin->is_stray()) {
adb31ebb 7243 dout(7) << "target is in different subvolume, failing..." << dendl;
f67539c2 7244 respond_to_request(mdr, -CEPHFS_EXDEV);
7245 return;
7246 }
7247
7c673cae 7248 // go!
11fdf7f2 7249 ceph_assert(g_conf()->mds_kill_link_at != 1);
7250
7251 // local or remote?
7252 if (targeti->is_auth())
adb31ebb 7253 _link_local(mdr, destdn, targeti, target_realm);
7c673cae 7254 else
9f95a23c 7255 _link_remote(mdr, true, destdn, targeti);
92f5a8d4 7256 mds->balancer->maybe_fragment(dir, false);
7257}
7258
7259
7260class C_MDS_link_local_finish : public ServerLogContext {
7261 CDentry *dn;
7262 CInode *targeti;
7263 version_t dnpv;
7264 version_t tipv;
11fdf7f2 7265 bool adjust_realm;
7266public:
7267 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
11fdf7f2 7268 version_t dnpv_, version_t tipv_, bool ar) :
7c673cae 7269 ServerLogContext(s, r), dn(d), targeti(ti),
11fdf7f2 7270 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
7c673cae 7271 void finish(int r) override {
7272 ceph_assert(r == 0);
7273 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
7274 }
7275};
7276
7277
adb31ebb 7278void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
7279{
7280 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
7281
7282 mdr->ls = mdlog->get_current_segment();
7283
7284 // predirty NEW dentry
7285 version_t dnpv = dn->pre_dirty();
7286 version_t tipv = targeti->pre_dirty();
7287
7288 // project inode update
7289 auto pi = targeti->project_inode(mdr);
7290 pi.inode->nlink++;
7291 pi.inode->ctime = mdr->get_op_stamp();
7292 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7293 pi.inode->rstat.rctime = mdr->get_op_stamp();
7294 pi.inode->change_attr++;
7295 pi.inode->version = tipv;
7c673cae 7296
11fdf7f2 7297 bool adjust_realm = false;
adb31ebb 7298 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
7299 sr_t *newsnap = targeti->project_snaprealm();
7300 targeti->mark_snaprealm_global(newsnap);
adb31ebb 7301 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
7302 adjust_realm = true;
7303 }
7304
7305 // log + wait
7306 EUpdate *le = new EUpdate(mdlog, "link_local");
7307 mdlog->start_entry(le);
7308 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7309 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
7310 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
7311 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7312 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
7313
7314 // do this after predirty_*, to avoid funky extra dnl arg
7315 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7316
7317 journal_and_reply(mdr, targeti, dn, le,
7318 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
7319}
7320
7321void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
11fdf7f2 7322 version_t dnpv, version_t tipv, bool adjust_realm)
7323{
7324 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
7325
7326 // link and unlock the NEW dentry
7327 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7328 if (!dnl->get_inode())
7329 dn->link_remote(dnl, targeti);
7330 dn->mark_dirty(dnpv, mdr->ls);
7331
7332 // target inode
7333 mdr->apply();
7334
7335 MDRequestRef null_ref;
7336 mdcache->send_dentry_link(dn, null_ref);
7337
7338 if (adjust_realm) {
7339 int op = CEPH_SNAP_OP_SPLIT;
7340 mds->mdcache->send_snap_update(targeti, 0, op);
7341 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7342 }
7343
7c673cae 7344 // bump target popularity
7345 mds->balancer->hit_inode(targeti, META_POP_IWR);
7346 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7347
7348 // reply
7349 respond_to_request(mdr, 0);
7350}
7351
7352
7353// link / unlink remote
7354
7355class C_MDS_link_remote_finish : public ServerLogContext {
7356 bool inc;
7357 CDentry *dn;
7358 CInode *targeti;
7359 version_t dpv;
7360public:
7361 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
7362 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
7363 dpv(d->get_projected_version()) {}
7364 void finish(int r) override {
11fdf7f2 7365 ceph_assert(r == 0);
7366 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
7367 }
7368};
7369
7370void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
7371{
7372 dout(10) << "_link_remote "
7373 << (inc ? "link ":"unlink ")
7374 << *dn << " to " << *targeti << dendl;
7375
7376 // 1. send LinkPrepare to dest (journal nlink++ prepare)
7377 mds_rank_t linkauth = targeti->authority().first;
7378 if (mdr->more()->witnessed.count(linkauth) == 0) {
7379 if (mds->is_cluster_degraded() &&
7380 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
7381 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
f67539c2 7382 if (mdr->more()->waiting_on_peer.empty())
7383 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
7384 return;
7385 }
7386
7387 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
7388 int op;
7389 if (inc)
f67539c2 7390 op = MMDSPeerRequest::OP_LINKPREP;
7c673cae 7391 else
7392 op = MMDSPeerRequest::OP_UNLINKPREP;
7393 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, op);
7394 targeti->set_object_info(req->get_object_info());
7395 req->op_stamp = mdr->get_op_stamp();
7396 if (auto& desti_srnode = mdr->more()->desti_srnode)
7397 encode(*desti_srnode, req->desti_snapbl);
7398 mds->send_message_mds(req, linkauth);
7399
7400 ceph_assert(mdr->more()->waiting_on_peer.count(linkauth) == 0);
7401 mdr->more()->waiting_on_peer.insert(linkauth);
7402 return;
7403 }
7404 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
7405
7406 ceph_assert(g_conf()->mds_kill_link_at != 2);
7407
7408 if (auto& desti_srnode = mdr->more()->desti_srnode) {
7409 delete desti_srnode;
7410 desti_srnode = NULL;
7411 }
7412
7413 mdr->set_mds_stamp(ceph_clock_now());
7414
7415 // add to event
7416 mdr->ls = mdlog->get_current_segment();
7417 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
7418 mdlog->start_entry(le);
7419 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7420 if (!mdr->more()->witnessed.empty()) {
f67539c2 7421 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7c673cae 7422 le->reqid = mdr->reqid;
7423 le->had_peers = true;
7424 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7425 }
7426
7427 if (inc) {
7428 dn->pre_dirty();
7429 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
7430 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7431 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7432 } else {
7433 dn->pre_dirty();
7434 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
7435 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7436 le->metablob.add_null_dentry(dn, true);
31f18b77 7437 dn->push_projected_linkage();
7438 }
7439
7440 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
7441 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
7442}
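// Illustrative summary (not normative): remote link/unlink is a two-phase
// update. Phase one sends OP_LINKPREP/OP_UNLINKPREP to the target inode's
// auth MDS and parks this request in waiting_on_peer; phase two, entered
// once that peer appears in 'witnessed', journals the EUpdate with had_peers
// set so the uncommitted-leader record can resolve the reqid on recovery.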
7443
7444void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
7445 CDentry *dn, CInode *targeti,
7446 version_t dpv)
7447{
7448 dout(10) << "_link_remote_finish "
7449 << (inc ? "link ":"unlink ")
7450 << *dn << " to " << *targeti << dendl;
7451
11fdf7f2 7452 ceph_assert(g_conf()->mds_kill_link_at != 3);
7453
7454 if (!mdr->more()->witnessed.empty())
f67539c2 7455 mdcache->logged_leader_update(mdr->reqid);
7456
7457 if (inc) {
7458 // link the new dentry
7459 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7460 if (!dnl->get_inode())
7461 dn->link_remote(dnl, targeti);
7462 dn->mark_dirty(dpv, mdr->ls);
7463 } else {
7464 // unlink main dentry
7465 dn->get_dir()->unlink_inode(dn);
31f18b77 7466 dn->pop_projected_linkage();
7467 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
7468 }
7469
7470 mdr->apply();
7471
7472 MDRequestRef null_ref;
aee94f69 7473 if (inc)
7c673cae 7474 mdcache->send_dentry_link(dn, null_ref);
aee94f69 7475 else
7c673cae 7476 mdcache->send_dentry_unlink(dn, NULL, null_ref);
aee94f69 7477
7c673cae 7478 // bump target popularity
7479 mds->balancer->hit_inode(targeti, META_POP_IWR);
7480 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7481
7482 // reply
7483 respond_to_request(mdr, 0);
7484
7485 if (!inc)
7486 // removing a new dn?
7487 dn->get_dir()->try_remove_unlinked_dn(dn);
7488}
7489
7490
7491// remote linking/unlinking
7492
f67539c2 7493class C_MDS_PeerLinkPrep : public ServerLogContext {
7c673cae 7494 CInode *targeti;
11fdf7f2 7495 bool adjust_realm;
7c673cae 7496public:
f67539c2 7497 C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
11fdf7f2 7498 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
7c673cae 7499 void finish(int r) override {
11fdf7f2 7500 ceph_assert(r == 0);
f67539c2 7501 server->_logged_peer_link(mdr, targeti, adjust_realm);
7502 }
7503};
7504
f67539c2 7505class C_MDS_PeerLinkCommit : public ServerContext {
7506 MDRequestRef mdr;
7507 CInode *targeti;
7508public:
f67539c2 7509 C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
7510 ServerContext(s), mdr(r), targeti(t) { }
7511 void finish(int r) override {
f67539c2 7512 server->_commit_peer_link(mdr, r, targeti);
7513 }
7514};
7515
f67539c2 7516void Server::handle_peer_link_prep(MDRequestRef& mdr)
7c673cae 7517{
7518 dout(10) << "handle_peer_link_prep " << *mdr
7519 << " on " << mdr->peer_request->get_object_info()
7520 << dendl;
7521
11fdf7f2 7522 ceph_assert(g_conf()->mds_kill_link_at != 4);
7c673cae 7523
f67539c2 7524 CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
11fdf7f2 7525 ceph_assert(targeti);
7526 dout(10) << "targeti " << *targeti << dendl;
7527 CDentry *dn = targeti->get_parent_dn();
7528 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2 7529 ceph_assert(dnl->is_primary());
7c673cae 7530
f67539c2 7531 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7532
7533 mdr->auth_pin(targeti);
7534
f67539c2 7535 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
11fdf7f2 7536 ceph_assert(g_conf()->mds_kill_link_at != 5);
7537
7538 // journal it
7539 mdr->ls = mdlog->get_current_segment();
7540 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
7541 EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
7542 mdlog->start_entry(le);
7543
f67539c2 7544 auto pi = dnl->get_inode()->project_inode(mdr);
7545
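  // Roughly: OP_LINKPREP bumps the target's nlink, OP_UNLINKPREP drops it.
  // When linking a target that is not inside a subvolume and not yet a
  // global snaprealm, a global snaprealm is projected and the parent
  // dentry recorded, so snapshot resolution keeps working once the inode
  // is reachable from multiple directories; adjust_realm then drives the
  // snap split/notify after the journal commits.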
7546 // update journaled target inode
7547 bool inc;
7548 bool adjust_realm = false;
7549 bool realm_projected = false;
f67539c2 7550 if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
7c673cae 7551 inc = true;
f67539c2 7552 pi.inode->nlink++;
7553
7554 CDentry *target_pdn = targeti->get_projected_parent_dn();
7555 SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
7556 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
7557 sr_t *newsnap = targeti->project_snaprealm();
7558 targeti->mark_snaprealm_global(newsnap);
adb31ebb 7559 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
7560 adjust_realm = true;
7561 realm_projected = true;
7562 }
7563 } else {
7564 inc = false;
f67539c2 7565 pi.inode->nlink--;
11fdf7f2 7566 if (targeti->is_projected_snaprealm_global()) {
7567 ceph_assert(mdr->peer_request->desti_snapbl.length());
7568 auto p = mdr->peer_request->desti_snapbl.cbegin();
7569
7570 sr_t *newsnap = targeti->project_snaprealm();
7571 decode(*newsnap, p);
7572
f67539c2 7573 if (pi.inode->nlink == 0)
7574 ceph_assert(!newsnap->is_parent_global());
7575
7576 realm_projected = true;
7577 } else {
f67539c2 7578 ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
11fdf7f2 7579 }
7580 }
7581
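  // The rollback blob captures just enough pre-op state to undo this
  // prepare if the leader aborts: the old inode ctime, the parent dir's
  // mtime/rctime, the direction of the nlink change, and (when projected)
  // the old snaprealm blob.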
7582 link_rollback rollback;
7583 rollback.reqid = mdr->reqid;
7584 rollback.ino = targeti->ino();
7585 rollback.old_ctime = targeti->get_inode()->ctime; // we hold versionlock xlock; no concurrent projections
7586 const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
7587 rollback.old_dir_mtime = pf->fragstat.mtime;
7588 rollback.old_dir_rctime = pf->rstat.rctime;
7589 rollback.was_inc = inc;
7590 if (realm_projected) {
7591 if (targeti->snaprealm) {
7592 encode(true, rollback.snapbl);
7593 targeti->encode_snap_blob(rollback.snapbl);
7594 } else {
7595 encode(false, rollback.snapbl);
7596 }
7597 }
7598 encode(rollback, le->rollback);
7599 mdr->more()->rollback_bl = le->rollback;
7600
7601 pi.inode->ctime = mdr->get_op_stamp();
7602 pi.inode->version = targeti->pre_dirty();
7c673cae 7603
f67539c2 7604 dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
7605
7606 // commit case
7607 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
7608 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
f67539c2 7609 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7610
7611 // set up commit waiter
f67539c2 7612 mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
7c673cae 7613
7614 mdr->more()->peer_update_journaled = true;
7615 submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
7616 mdr, __func__);
7617 mdlog->flush();
7618}
7619
f67539c2 7620void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
7c673cae 7621{
f67539c2 7622 dout(10) << "_logged_peer_link " << *mdr
7623 << " " << *targeti << dendl;
7624
11fdf7f2 7625 ceph_assert(g_conf()->mds_kill_link_at != 6);
7626
7627 // update the target
7628 mdr->apply();
7629
7630 // hit pop
11fdf7f2 7631 mds->balancer->hit_inode(targeti, META_POP_IWR);
7632
7633 // done.
f67539c2 7634 mdr->reset_peer_request();
7c673cae 7635
7636 if (adjust_realm) {
7637 int op = CEPH_SNAP_OP_SPLIT;
7638 mds->mdcache->send_snap_update(targeti, 0, op);
7639 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7640 }
7641
7642 // ack
7643 if (!mdr->aborted) {
7644 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_LINKPREPACK);
7645 mds->send_message_mds(reply, mdr->peer_to_mds);
7646 } else {
7647 dout(10) << " abort flag set, finishing" << dendl;
7648 mdcache->request_finish(mdr);
7649 }
7650}
7651
7652
7653struct C_MDS_CommittedPeer : public ServerLogContext {
7654 C_MDS_CommittedPeer(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
7c673cae 7655 void finish(int r) override {
f67539c2 7656 server->_committed_peer(mdr);
7657 }
7658};
7659
f67539c2 7660void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
7c673cae 7661{
f67539c2 7662 dout(10) << "_commit_peer_link " << *mdr
7663 << " r=" << r
7664 << " " << *targeti << dendl;
7665
11fdf7f2 7666 ceph_assert(g_conf()->mds_kill_link_at != 7);
7667
7668 if (r == 0) {
7669 // drop our pins, etc.
7670 mdr->cleanup();
7671
7672 // write a commit to the journal
7673 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid, mdr->peer_to_mds,
7674 EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
7c673cae 7675 mdlog->start_entry(le);
f67539c2 7676 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7677 mdlog->flush();
7678 } else {
f67539c2 7679 do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7680 }
7681}
7682
f67539c2 7683void Server::_committed_peer(MDRequestRef& mdr)
7c673cae 7684{
f67539c2 7685 dout(10) << "_committed_peer " << *mdr << dendl;
7c673cae 7686
11fdf7f2 7687 ceph_assert(g_conf()->mds_kill_link_at != 8);
7c673cae 7688
7689 bool assert_exist = mdr->more()->peer_update_journaled;
7690 mdcache->finish_uncommitted_peer(mdr->reqid, assert_exist);
7691 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_COMMITTED);
7692 mds->send_message_mds(req, mdr->peer_to_mds);
7693 mdcache->request_finish(mdr);
7694}
7695
7696struct C_MDS_LoggedLinkRollback : public ServerLogContext {
7697 MutationRef mut;
9f95a23c 7698 map<client_t,ref_t<MClientSnap>> splits;
11fdf7f2 7699 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
9f95a23c 7700 map<client_t,ref_t<MClientSnap>>&& _splits) :
7701 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
7702 }
7c673cae 7703 void finish(int r) override {
11fdf7f2 7704 server->_link_rollback_finish(mut, mdr, splits);
7705 }
7706};
7707
f67539c2 7708void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7709{
7710 link_rollback rollback;
7711 auto p = rbl.cbegin();
7712 decode(rollback, p);
7713
7714 dout(10) << "do_link_rollback on " << rollback.reqid
7715 << (rollback.was_inc ? " inc":" dec")
7716 << " ino " << rollback.ino
7717 << dendl;
7718
11fdf7f2 7719 ceph_assert(g_conf()->mds_kill_link_at != 9);
7c673cae 7720
f67539c2 7721 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
11fdf7f2 7722 ceph_assert(mdr || mds->is_resolve());
7723
7724 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7725 mut->ls = mds->mdlog->get_current_segment();
7726
7727 CInode *in = mdcache->get_inode(rollback.ino);
11fdf7f2 7728 ceph_assert(in);
7c673cae 7729 dout(10) << " target is " << *in << dendl;
f67539c2 7730 ceph_assert(!in->is_projected()); // live peer request holds versionlock xlock.
7c673cae 7731
7732 auto pi = in->project_inode(mut);
7733 pi.inode->version = in->pre_dirty();
7734
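  // Only restore the parent dir's mtime/rctime if they still carry the
  // ctime we stamped during prepare; if a later update advanced them,
  // keep the newer values.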
7735 // parent dir rctime
7736 CDir *parent = in->get_projected_parent_dn()->get_dir();
f67539c2 7737 auto pf = parent->project_fnode(mut);
7c673cae 7738 pf->version = parent->pre_dirty();
f67539c2 7739 if (pf->fragstat.mtime == pi.inode->ctime) {
7c673cae 7740 pf->fragstat.mtime = rollback.old_dir_mtime;
f67539c2 7741 if (pf->rstat.rctime == pi.inode->ctime)
7742 pf->rstat.rctime = rollback.old_dir_rctime;
7743 mut->add_updated_lock(&parent->get_inode()->filelock);
7744 mut->add_updated_lock(&parent->get_inode()->nestlock);
7745 }
7746
7747 // inode
f67539c2 7748 pi.inode->ctime = rollback.old_ctime;
7c673cae 7749 if (rollback.was_inc)
f67539c2 7750 pi.inode->nlink--;
7c673cae 7751 else
f67539c2 7752 pi.inode->nlink++;
7c673cae 7753
9f95a23c 7754 map<client_t,ref_t<MClientSnap>> splits;
7755 if (rollback.snapbl.length() && in->snaprealm) {
7756 bool hadrealm;
7757 auto p = rollback.snapbl.cbegin();
7758 decode(hadrealm, p);
7759 if (hadrealm) {
7760 if (!mds->is_resolve()) {
7761 sr_t *new_srnode = new sr_t();
7762 decode(*new_srnode, p);
7763 in->project_snaprealm(new_srnode);
7764 } else {
7765 decode(in->snaprealm->srnode, p);
7766 }
7767 } else {
7768 SnapRealm *realm = parent->get_inode()->find_snaprealm();
7769 if (!mds->is_resolve())
7770 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
7771 in->project_snaprealm(NULL);
7772 }
7773 }
7774
7c673cae 7775 // journal it
7776 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
7777 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
7778 mdlog->start_entry(le);
7779 le->commit.add_dir_context(parent);
7780 le->commit.add_dir(parent, true);
7781 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
7782
11fdf7f2 7783 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
7784 mdr, __func__);
7785 mdlog->flush();
7786}
7787
11fdf7f2 7788void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
9f95a23c 7789 map<client_t,ref_t<MClientSnap>>& splits)
7790{
7791 dout(10) << "_link_rollback_finish" << dendl;
7792
11fdf7f2 7793 ceph_assert(g_conf()->mds_kill_link_at != 10);
7794
7795 mut->apply();
7796
7797 if (!mds->is_resolve())
7798 mdcache->send_snaps(splits);
7799
7800 if (mdr)
7801 mdcache->request_finish(mdr);
7802
e306af50 7803 mdcache->finish_rollback(mut->reqid, mdr);
7804
7805 mut->cleanup();
7806}
7807
7808
f67539c2 7809void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
7c673cae 7810{
f67539c2 7811 dout(10) << "handle_peer_link_prep_ack " << *mdr
7812 << " " << *m << dendl;
7813 mds_rank_t from = mds_rank_t(m->get_source().num());
7814
11fdf7f2 7815 ceph_assert(g_conf()->mds_kill_link_at != 11);
7c673cae 7816
7817 // note peer
7818 mdr->more()->peers.insert(from);
7819
7820 // witnessed!
11fdf7f2 7821 ceph_assert(mdr->more()->witnessed.count(from) == 0);
7c673cae 7822 mdr->more()->witnessed.insert(from);
11fdf7f2 7823 ceph_assert(!m->is_not_journaled());
f67539c2 7824 mdr->more()->has_journaled_peers = true;
7825
7826 // remove from waiting list
7827 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7828 mdr->more()->waiting_on_peer.erase(from);
7c673cae 7829
f67539c2 7830 ceph_assert(mdr->more()->waiting_on_peer.empty());
7c673cae 7831
7832 dispatch_client_request(mdr); // go again!
7833}
7c673cae 7834
7835
7836
7837
7838
7839// UNLINK
7840
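// In outline: resolve and lock the dentry, check that rmdir/unlink matches
// the inode type, prepare a stray dentry when removing a primary link,
// gather witness MDSs if the dir has subtree roots replicated elsewhere,
// then take the local (_unlink_local) or remote (_link_remote) path.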
7841void Server::handle_client_unlink(MDRequestRef& mdr)
7842{
7843 const cref_t<MClientRequest> &req = mdr->client_request;
7844 client_t client = mdr->get_client();
7845
7846 // rmdir or unlink?
7847 bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
7848
7849 if (rmdir)
7850 mdr->disable_lock_cache();
7851 CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
7852 if (!dn)
7853 return;
7854
7855 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
11fdf7f2 7856 ceph_assert(!dnl->is_null());
9f95a23c 7857 CInode *in = dnl->get_inode();
7858
7859 if (rmdir) {
7860 dout(7) << "handle_client_rmdir on " << *dn << dendl;
7861 } else {
7862 dout(7) << "handle_client_unlink on " << *dn << dendl;
7863 }
7864 dout(7) << "dn links to " << *in << dendl;
7865
7866 // rmdir vs is_dir
7867 if (in->is_dir()) {
7868 if (rmdir) {
7869 // do empty directory checks
7870 if (_dir_is_nonempty_unlocked(mdr, in)) {
aee94f69 7871 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7872 return;
7873 }
7874 } else {
7875 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
f67539c2 7876 respond_to_request(mdr, -CEPHFS_EISDIR);
7877 return;
7878 }
7879 } else {
7880 if (rmdir) {
7881 // unlink
7882 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
f67539c2 7883 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7884 return;
7885 }
7886 }
7887
7888 CInode *diri = dn->get_dir()->get_inode();
7889 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
aee94f69 7890 if (!check_access(mdr, diri, MAY_WRITE))
7891 return;
7892 }
7893
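  // A primary link is not destroyed in place: the inode gets relinked
  // under a stray directory dentry so remaining remote links and open
  // handles stay valid until the stray is purged or reintegrated.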
7894 // -- create stray dentry? --
7895 CDentry *straydn = NULL;
7896 if (dnl->is_primary()) {
7897 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
7898 if (!straydn)
7899 return;
7900 dout(10) << " straydn is " << *straydn << dendl;
7901 } else if (mdr->straydn) {
7902 mdr->unpin(mdr->straydn);
7903 mdr->straydn = NULL;
7904 }
7905
7906 // lock
7907 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7908 MutationImpl::LockOpVec lov;
11fdf7f2 7909
7910 lov.add_xlock(&in->linklock);
7911 lov.add_xlock(&in->snaplock);
7912 if (in->is_dir())
7913 lov.add_rdlock(&in->filelock); // to verify it's empty
7914
7915 if (straydn) {
7916 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7917 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7918 lov.add_xlock(&straydn->lock);
7919 }
11fdf7f2 7920
7921 if (!mds->locker->acquire_locks(mdr, lov))
7922 return;
7c673cae 7923
7924 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7925 }
7926
7927 if (in->is_dir() &&
7928 _dir_is_nonempty(mdr, in)) {
f67539c2 7929 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7930 return;
7931 }
7932
7933 if (straydn)
7934 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7935
7936 if (!mdr->more()->desti_srnode) {
7937 if (in->is_projected_snaprealm_global()) {
7938 sr_t *new_srnode = in->prepare_new_srnode(0);
adb31ebb 7939 in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
7940 // dropping the last linkage or dropping the last remote linkage,
7941 // detach the inode from the global snaprealm
7942 auto nlink = in->get_projected_inode()->nlink;
7943 if (nlink == 1 ||
7944 (nlink == 2 && !dnl->is_primary() &&
7945 !in->get_projected_parent_dir()->inode->is_stray()))
7946 in->clear_snaprealm_global(new_srnode);
7947 mdr->more()->desti_srnode = new_srnode;
7948 } else if (dnl->is_primary()) {
f67539c2 7949 // prepare snaprealm blob for peer request
7950 SnapRealm *realm = in->find_snaprealm();
7951 snapid_t follows = realm->get_newest_seq();
7952 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
7953 sr_t *new_srnode = in->prepare_new_srnode(follows);
7954 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7955 mdr->more()->desti_srnode = new_srnode;
7956 }
7957 }
7958 }
7959
7960 // yay!
7961 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
7962 // subtree root auths need to be witnesses
7963 set<mds_rank_t> witnesses;
7964 in->list_replicas(witnesses);
7965 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7966
7967 for (set<mds_rank_t>::iterator p = witnesses.begin();
7968 p != witnesses.end();
7969 ++p) {
7970 if (mdr->more()->witnessed.count(*p)) {
7971 dout(10) << " already witnessed by mds." << *p << dendl;
f67539c2 7972 } else if (mdr->more()->waiting_on_peer.count(*p)) {
7973 dout(10) << " already waiting on witness mds." << *p << dendl;
7974 } else {
9f95a23c 7975 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
7976 return;
7977 }
7978 }
f67539c2 7979 if (!mdr->more()->waiting_on_peer.empty())
7980 return; // we're waiting for a witness.
7981 }
7982
7983 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
7984 mds->locker->create_lock_cache(mdr, diri);
7985
7986 // ok!
7987 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
7988 _link_remote(mdr, false, dn, dnl->get_inode());
7989 else
7990 _unlink_local(mdr, dn, straydn);
7991}
7992
7993class C_MDS_unlink_local_finish : public ServerLogContext {
7994 CDentry *dn;
7995 CDentry *straydn;
7996 version_t dnpv; // deleted dentry
7997public:
7998 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
7999 ServerLogContext(s, r), dn(d), straydn(sd),
8000 dnpv(d->get_projected_version()) {}
8001 void finish(int r) override {
11fdf7f2 8002 ceph_assert(r == 0);
8003 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
8004 }
8005};
8006
8007void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
8008{
8009 dout(10) << "_unlink_local " << *dn << dendl;
8010
8011 CDentry::linkage_t *dnl = dn->get_projected_linkage();
8012 CInode *in = dnl->get_inode();
8013
8014
8015 // ok, let's do it.
8016 mdr->ls = mdlog->get_current_segment();
8017
8018 // prepare log entry
8019 EUpdate *le = new EUpdate(mdlog, "unlink_local");
8020 mdlog->start_entry(le);
8021 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
8022 if (!mdr->more()->witnessed.empty()) {
f67539c2 8023 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7c673cae 8024 le->reqid = mdr->reqid;
8025 le->had_peers = true;
8026 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
8027 }
8028
8029 if (straydn) {
11fdf7f2 8030 ceph_assert(dnl->is_primary());
7c673cae 8031 straydn->push_projected_linkage(in);
8032 }
8033
8034 // the unlinked dentry
8035 dn->pre_dirty();
8036
f67539c2 8037 auto pi = in->project_inode(mdr);
8038 {
8039 std::string t;
8040 dn->make_path_string(t, true);
8041 pi.inode->stray_prior_path = std::move(t);
8042 }
8043 pi.inode->version = in->pre_dirty();
8044 pi.inode->ctime = mdr->get_op_stamp();
8045 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
8046 pi.inode->rstat.rctime = mdr->get_op_stamp();
8047 pi.inode->change_attr++;
8048 pi.inode->nlink--;
8049 if (pi.inode->nlink == 0)
8050 in->state_set(CInode::STATE_ORPHAN);
8051
8052 if (mdr->more()->desti_srnode) {
8053 auto& desti_srnode = mdr->more()->desti_srnode;
8054 in->project_snaprealm(desti_srnode);
8055 desti_srnode = NULL;
8056 }
8057
8058 if (straydn) {
8059 // will manually pop projected inode
8060
7c673cae 8061 // primary link. add stray dentry.
8062 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
8063 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
8064
f67539c2 8065 pi.inode->update_backtrace();
8066 le->metablob.add_primary_dentry(straydn, in, true, true);
8067 } else {
8068 // remote link. update remote inode.
8069 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
8070 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
8071 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
8072 }
8073
8074 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
8075 le->metablob.add_null_dentry(dn, true);
8076
8077 if (in->is_dir()) {
8078 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8079 le->metablob.renamed_dirino = in->ino();
8080 }
8081
8082 dn->push_projected_linkage();
8083
8084 if (straydn) {
8085 ceph_assert(in->first <= straydn->first);
8086 in->first = straydn->first;
8087 }
8088
7c673cae 8089 if (in->is_dir()) {
11fdf7f2 8090 ceph_assert(straydn);
8091 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
8092 }
8093
8094 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
8095}
8096
8097void Server::_unlink_local_finish(MDRequestRef& mdr,
8098 CDentry *dn, CDentry *straydn,
8099 version_t dnpv)
8100{
8101 dout(10) << "_unlink_local_finish " << *dn << dendl;
8102
8103 if (!mdr->more()->witnessed.empty())
f67539c2 8104 mdcache->logged_leader_update(mdr->reqid);
7c673cae 8105
8106 CInode *strayin = NULL;
8107 bool hadrealm = false;
8108 if (straydn) {
8109 // if there is newly created snaprealm, need to split old snaprealm's
8110 // inodes_with_caps. So pop snaprealm before linkage changes.
8111 strayin = dn->get_linkage()->get_inode();
8112 hadrealm = strayin->snaprealm ? true : false;
8113 strayin->early_pop_projected_snaprealm();
8114 }
8115
8116 // unlink main dentry
8117 dn->get_dir()->unlink_inode(dn);
8118 dn->pop_projected_linkage();
f67539c2 8119 dn->mark_dirty(dnpv, mdr->ls);
8120
8121 // relink as stray? (i.e. was primary link?)
8122 if (straydn) {
8123 dout(20) << " straydn is " << *straydn << dendl;
11fdf7f2 8124 straydn->pop_projected_linkage();
8125 mdcache->touch_dentry_bottom(straydn);
8126 }
8127
7c673cae 8128 mdr->apply();
aee94f69 8129
7c673cae 8130 mdcache->send_dentry_unlink(dn, straydn, mdr);
aee94f69 8131
8132 if (straydn) {
8133 // update subtree map?
8134 if (strayin->is_dir())
8135 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
8136
8137 if (strayin->snaprealm && !hadrealm)
8138 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
8139 }
8140
8141 // bump pop
11fdf7f2 8142 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
8143
8144 // reply
8145 respond_to_request(mdr, 0);
aee94f69 8146
8147 // removing a new dn?
8148 dn->get_dir()->try_remove_unlinked_dn(dn);
8149
8150 // clean up ?
8151 // respond_to_request() drops locks. So stray reintegration can race with us.
8152 if (straydn && !straydn->get_projected_linkage()->is_null()) {
8153 // Tip off the MDCache that this dentry is a stray that
8154 // might be eligible for purge.
8155 mdcache->notify_stray(straydn);
8156 }
8157}
8158
8159bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
8160{
8161 if (mds->is_cluster_degraded() &&
8162 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
8163 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
f67539c2 8164 if (mdr->more()->waiting_on_peer.empty())
8165 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
8166 return false;
8167 }
8168
8169 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
f67539c2 8170 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREP);
8171 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
8172 for (auto dn : trace)
94b18763 8173 req->srcdnpath.push_dentry(dn->get_name());
9f95a23c 8174 mdcache->encode_replica_stray(straydn, who, req->straybl);
8175 if (mdr->more()->desti_srnode)
8176 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
8177
8178 req->op_stamp = mdr->get_op_stamp();
8179 mds->send_message_mds(req, who);
8180
8181 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
8182 mdr->more()->waiting_on_peer.insert(who);
8183 return true;
8184}
8185
f67539c2 8186struct C_MDS_PeerRmdirPrep : public ServerLogContext {
7c673cae 8187 CDentry *dn, *straydn;
f67539c2 8188 C_MDS_PeerRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
8189 : ServerLogContext(s, r), dn(d), straydn(st) {}
8190 void finish(int r) override {
f67539c2 8191 server->_logged_peer_rmdir(mdr, dn, straydn);
8192 }
8193};
8194
f67539c2 8195struct C_MDS_PeerRmdirCommit : public ServerContext {
7c673cae 8196 MDRequestRef mdr;
31f18b77 8197 CDentry *straydn;
f67539c2 8198 C_MDS_PeerRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
31f18b77 8199 : ServerContext(s), mdr(r), straydn(sd) { }
7c673cae 8200 void finish(int r) override {
f67539c2 8201 server->_commit_peer_rmdir(mdr, r, straydn);
8202 }
8203};
8204
f67539c2 8205void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
7c673cae 8206{
8207 dout(10) << "handle_peer_rmdir_prep " << *mdr
8208 << " " << mdr->peer_request->srcdnpath
8209 << " to " << mdr->peer_request->destdnpath
8210 << dendl;
8211
8212 vector<CDentry*> trace;
f67539c2 8213 filepath srcpath(mdr->peer_request->srcdnpath);
8214 dout(10) << " src " << srcpath << dendl;
8215 CInode *in;
f67539c2 8216 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
8217 int r = mdcache->path_traverse(mdr, cf, srcpath,
8218 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
8219 &trace, &in);
7c673cae 8220 if (r > 0) return;
f67539c2 8221 if (r == -CEPHFS_ESTALE) {
7c673cae 8222 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
f67539c2 8223 mdr->peer_to_mds, true);
8224 return;
8225 }
11fdf7f2 8226 ceph_assert(r == 0);
91327a77 8227 CDentry *dn = trace.back();
8228 dout(10) << " dn " << *dn << dendl;
8229 mdr->pin(dn);
8230
11fdf7f2 8231 ceph_assert(mdr->straydn);
8232 CDentry *straydn = mdr->straydn;
8233 dout(10) << " straydn " << *straydn << dendl;
8234
f67539c2 8235 mdr->set_op_stamp(mdr->peer_request->op_stamp);
8236
8237 rmdir_rollback rollback;
8238 rollback.reqid = mdr->reqid;
8239 rollback.src_dir = dn->get_dir()->dirfrag();
11fdf7f2 8240 rollback.src_dname = dn->get_name();
7c673cae 8241 rollback.dest_dir = straydn->get_dir()->dirfrag();
11fdf7f2 8242 rollback.dest_dname = straydn->get_name();
f67539c2 8243 if (mdr->peer_request->desti_snapbl.length()) {
8244 if (in->snaprealm) {
8245 encode(true, rollback.snapbl);
8246 in->encode_snap_blob(rollback.snapbl);
8247 } else {
8248 encode(false, rollback.snapbl);
8249 }
8250 }
8251 encode(rollback, mdr->more()->rollback_bl);
8252 // FIXME: rollback snaprealm
8253 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
8254
8255 // set up commit waiter
f67539c2 8256 mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);
7c673cae 8257
8258 straydn->push_projected_linkage(in);
8259 dn->push_projected_linkage();
7c673cae 8260
8261 ceph_assert(straydn->first >= in->first);
8262 in->first = straydn->first;
7c673cae 8263
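  // Fast path: if this rank holds no auth subtree under the dir, nothing
  // durable changes here, so skip the journal and let the ack carry
  // mark_not_journaled() -- the leader then knows no peer commit is needed.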
8264 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
8265 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
f67539c2 8266 _logged_peer_rmdir(mdr, dn, straydn);
8267 return;
8268 }
8269
e306af50 8270 mdr->ls = mdlog->get_current_segment();
8271 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
8272 EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
8273 mdlog->start_entry(le);
8274 le->rollback = mdr->more()->rollback_bl;
8275
8276 le->commit.add_dir_context(straydn->get_dir());
8277 le->commit.add_primary_dentry(straydn, in, true);
f67539c2 8278 // peer: no need to journal original dentry
8279
8280 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8281 le->commit.renamed_dirino = in->ino();
8282
8283 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
f67539c2 8284 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7c673cae 8285
8286 mdr->more()->peer_update_journaled = true;
8287 submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
8288 mdr, __func__);
8289 mdlog->flush();
8290}
8291
f67539c2 8292void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7c673cae 8293{
f67539c2 8294 dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
8295 CInode *in = dn->get_linkage()->get_inode();
8296
8297 bool new_realm;
f67539c2 8298 if (mdr->peer_request->desti_snapbl.length()) {
11fdf7f2 8299 new_realm = !in->snaprealm;
f67539c2 8300 in->decode_snap_blob(mdr->peer_request->desti_snapbl);
11fdf7f2 8301 ceph_assert(in->snaprealm);
8302 } else {
8303 new_realm = false;
8304 }
8305
8306 // update our cache now, so we are consistent with what is in the journal
8307 // when we journal a subtree map
8308 dn->get_dir()->unlink_inode(dn);
8309 straydn->pop_projected_linkage();
8310 dn->pop_projected_linkage();
11fdf7f2 8311
f67539c2 8312 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->peer_update_journaled);
8313
8314 if (new_realm)
8315 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
8316
8317 // done.
f67539c2 8318 mdr->reset_peer_request();
8319 mdr->straydn = 0;
8320
8321 if (!mdr->aborted) {
8322 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREPACK);
8323 if (!mdr->more()->peer_update_journaled)
11fdf7f2 8324 reply->mark_not_journaled();
f67539c2 8325 mds->send_message_mds(reply, mdr->peer_to_mds);
8326 } else {
8327 dout(10) << " abort flag set, finishing" << dendl;
8328 mdcache->request_finish(mdr);
8329 }
8330}
8331
f67539c2 8332void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7c673cae 8333{
f67539c2 8334 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
8335 << " " << *ack << dendl;
8336
8337 mds_rank_t from = mds_rank_t(ack->get_source().num());
8338
f67539c2 8339 mdr->more()->peers.insert(from);
8340 mdr->more()->witnessed.insert(from);
8341 if (!ack->is_not_journaled())
f67539c2 8342 mdr->more()->has_journaled_peers = true;
8343
8344 // remove from waiting list
8345 ceph_assert(mdr->more()->waiting_on_peer.count(from));
8346 mdr->more()->waiting_on_peer.erase(from);
7c673cae 8347
f67539c2 8348 if (mdr->more()->waiting_on_peer.empty())
8349 dispatch_client_request(mdr); // go again!
8350 else
f67539c2 8351 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
8352}
8353
f67539c2 8354void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
7c673cae 8355{
f67539c2 8356 dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;
e306af50 8357
7c673cae 8358 if (r == 0) {
f67539c2 8359 if (mdr->more()->peer_update_journaled) {
8360 CInode *strayin = straydn->get_projected_linkage()->get_inode();
8361 if (strayin && !strayin->snaprealm)
8362 mdcache->clear_dirty_bits_for_stray(strayin);
8363 }
8364
8365 mdr->cleanup();
8366
f67539c2 8367 if (mdr->more()->peer_update_journaled) {
7c673cae 8368 // write a commit to the journal
8369 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
8370 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
8371 EPeerUpdate::RMDIR);
7c673cae 8372 mdlog->start_entry(le);
f67539c2 8373 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
8374 mdlog->flush();
8375 } else {
f67539c2 8376 _committed_peer(mdr);
8377 }
8378 } else {
8379 // abort
f67539c2 8380 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
8381 }
8382}
8383
8384struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
8385 metareqid_t reqid;
8386 CDentry *dn;
8387 CDentry *straydn;
8388 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
8389 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
8390 void finish(int r) override {
8391 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
8392 }
8393};
8394
f67539c2 8395void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
8396{
8397 // unlike the other rollback methods, the rmdir rollback is only
8398 // needed to record the subtree changes in the journal for inode
8399 // replicas who are auth for empty dirfrags. no actual changes to
8400 // the file system are taking place here, so there is no Mutation.
8401
8402 rmdir_rollback rollback;
8403 auto p = rbl.cbegin();
8404 decode(rollback, p);
8405
8406 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
f67539c2 8407 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
11fdf7f2 8408 ceph_assert(mdr || mds->is_resolve());
8409
8410 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
8411 if (!dir)
8412 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
11fdf7f2 8413 ceph_assert(dir);
7c673cae 8414 CDentry *dn = dir->lookup(rollback.src_dname);
11fdf7f2 8415 ceph_assert(dn);
7c673cae 8416 dout(10) << " dn " << *dn << dendl;
8417 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
8418 ceph_assert(straydir);
8419 CDentry *straydn = straydir->lookup(rollback.dest_dname);
8420 ceph_assert(straydn);
8421 dout(10) << " straydn " << *straydn << dendl;
8422 CInode *in = straydn->get_linkage()->get_inode();
8423
8424 dn->push_projected_linkage(in);
8425 straydn->push_projected_linkage();
7c673cae 8426
8427 if (rollback.snapbl.length() && in->snaprealm) {
8428 bool hadrealm;
8429 auto p = rollback.snapbl.cbegin();
8430 decode(hadrealm, p);
8431 if (hadrealm) {
8432 decode(in->snaprealm->srnode, p);
8433 } else {
8434 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
8435 }
8436 }
7c673cae 8437
f67539c2 8438 if (mdr && !mdr->more()->peer_update_journaled) {
11fdf7f2 8439 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
7c673cae 8440
11fdf7f2 8441 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
8442 return;
8443 }
8444
7c673cae 8445
8446 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
8447 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
8448 mdlog->start_entry(le);
8449
8450 le->commit.add_dir_context(dn->get_dir());
8451 le->commit.add_primary_dentry(dn, in, true);
f67539c2 8452 // peer: no need to journal straydn
8453
8454 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
8455 le->commit.renamed_dirino = in->ino();
8456
8457 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
8458
8459 submit_mdlog_entry(le,
8460 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
8461 dn, straydn),
8462 mdr, __func__);
8463 mdlog->flush();
8464}
8465
8466void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
8467{
8468 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
8469
8470 straydn->get_dir()->unlink_inode(straydn);
8471 dn->pop_projected_linkage();
8472 straydn->pop_projected_linkage();
8473
8474 CInode *in = dn->get_linkage()->get_inode();
11fdf7f2 8475 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
f67539c2 8476 !mdr || mdr->more()->peer_update_journaled);
11fdf7f2 8477
8478 if (mds->is_resolve()) {
8479 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
8480 mdcache->try_trim_non_auth_subtree(root);
8481 }
8482
8483 if (mdr)
8484 mdcache->request_finish(mdr);
8485
e306af50 8486 mdcache->finish_rollback(reqid, mdr);
8487}
8488
8489
8490/** _dir_is_nonempty[_unlocked]
8491 *
8492 * check if a directory is non-empty (i.e. we can rmdir it).
8493 *
8494 * the unlocked variant is a fastpath check. we can't really be
8495 * sure until we rdlock the filelock.
8496 */
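// Note that emptiness must hold across every dirfrag. The locked variant
// additionally compares the accumulated fragstats against the inode's
// dirstat; a size mismatch means updates are still in flight, so the dir
// is treated as non-empty.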
8497bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
8498{
8499 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
11fdf7f2 8500 ceph_assert(in->is_auth());
7c673cae 8501
8502 if (in->filelock.is_cached())
8503 return false; // there can be pending async create/unlink. don't know.
8504 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
8505 return true; // in a snapshot!
8506
8507 auto&& ls = in->get_dirfrags();
8508 for (const auto& dir : ls) {
8509 // is the frag obviously non-empty?
8510 if (dir->is_auth()) {
8511 if (dir->get_projected_fnode()->fragstat.size()) {
8512 dout(10) << "dir_is_nonempty_unlocked dirstat has "
8513 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
8514 return true;
8515 }
8516 }
8517 }
8518
8519 return false;
8520}
8521
8522bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
8523{
8524 dout(10) << "dir_is_nonempty " << *in << dendl;
8525 ceph_assert(in->is_auth());
8526 ceph_assert(in->filelock.can_read(mdr->get_client()));
8527
8528 frag_info_t dirstat;
8529 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
8530
8531 auto&& ls = in->get_dirfrags();
8532 for (const auto& dir : ls) {
f67539c2 8533 const auto& pf = dir->get_projected_fnode();
8534 if (pf->fragstat.size()) {
8535 dout(10) << "dir_is_nonempty dirstat has "
8536 << pf->fragstat.size() << " items " << *dir << dendl;
8537 return true;
8538 }
8539
8540 if (pf->accounted_fragstat.version == dirstat_version)
8541 dirstat.add(pf->accounted_fragstat);
8542 else
8543 dirstat.add(pf->fragstat);
8544 }
8545
8546 return dirstat.size() != in->get_projected_inode()->dirstat.size();
8547}
8548
8549
8550// ======================================================
8551
8552
8553class C_MDS_rename_finish : public ServerLogContext {
8554 CDentry *srcdn;
8555 CDentry *destdn;
8556 CDentry *straydn;
8557public:
8558 C_MDS_rename_finish(Server *s, MDRequestRef& r,
8559 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
8560 ServerLogContext(s, r),
8561 srcdn(sdn), destdn(ddn), straydn(stdn) { }
8562 void finish(int r) override {
11fdf7f2 8563 ceph_assert(r == 0);
8564 server->_rename_finish(mdr, srcdn, destdn, straydn);
8565 }
8566};
8567
8568
8569/** handle_client_rename
8570 *
f67539c2 8571 * rename leader is the destdn auth. this is because cached inodes
8572 * must remain connected. thus, any replica of srci, must also
8573 * replicate destdn, and possibly straydn, so that srci (and
8574 * destdn->inode) remain connected during the rename.
8575 *
f67539c2 8576 * to do this, we freeze srci, then leader (destdn auth) verifies that
8577 * all other nodes have also replicated destdn and straydn. note that
8578 * destdn replicas need not also replicate srci. this only works when
f67539c2 8579 * destdn is leader.
8580 *
8581 * This function takes responsibility for the passed mdr.
8582 */
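// Rough shape of the function below: resolve both paths and take all
// locks, run the sanity and access checks, prepare snaprealms plus a stray
// dentry for any clobbered primary link, collect witness/peer acks (srcdn
// auth last), then journal an EUpdate("rename") and finish via
// C_MDS_rename_finish.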
8583void Server::handle_client_rename(MDRequestRef& mdr)
8584{
f67539c2 8585 const auto& req = mdr->client_request;
8586 dout(7) << "handle_client_rename " << *req << dendl;
8587
8588 filepath destpath = req->get_filepath();
8589 filepath srcpath = req->get_filepath2();
91327a77 8590 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
8591 respond_to_request(mdr, -CEPHFS_EBUSY);
8592 return;
8593 }
8594
8595 if (req->get_alternate_name().size() > alternate_name_max) {
8596 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
8597 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
8598 return;
8599 }
8600
8601 auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
8602 if (!destdn)
8603 return;
7c673cae 8604
7c673cae 8605 dout(10) << " destdn " << *destdn << dendl;
7c673cae 8606 CDir *destdir = destdn->get_dir();
11fdf7f2 8607 ceph_assert(destdir->is_auth());
9f95a23c 8608 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7c673cae 8609
7c673cae 8610 dout(10) << " srcdn " << *srcdn << dendl;
11fdf7f2 8611 CDir *srcdir = srcdn->get_dir();
8612 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8613 CInode *srci = srcdnl->get_inode();
8614 dout(10) << " srci " << *srci << dendl;
8615
8616 // -- some sanity checks --
8617 if (destdn == srcdn) {
8618 dout(7) << "rename src=dest, noop" << dendl;
8619 respond_to_request(mdr, 0);
8620 return;
8621 }
8622
8623 // dest a child of src?
8624 // e.g. mv /usr /usr/foo
8625 if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
8626 dout(7) << "cannot rename item to be a child of itself" << dendl;
f67539c2 8627 respond_to_request(mdr, -CEPHFS_EINVAL);
8628 return;
8629 }
8630
8631 // is this a stray migration, reintegration or merge? (sanity checks!)
8632 if (mdr->reqid.name.is_mds() &&
8633 !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
8634 MDS_INO_IS_STRAY(destpath.get_ino())) &&
8635 !(destdnl->is_remote() &&
8636 destdnl->get_remote_ino() == srci->ino())) {
f67539c2 8637 respond_to_request(mdr, -CEPHFS_EINVAL); // actually, this won't reply, but whatev.
8638 return;
8639 }
8640
8641 CInode *oldin = 0;
8642 if (!destdnl->is_null()) {
8643 //dout(10) << "dest dn exists " << *destdn << dendl;
8644 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
8645 if (!oldin) return;
8646 dout(10) << " oldin " << *oldin << dendl;
8647
8648 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8649 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
f67539c2 8650 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8651 return;
8652 }
181888fb 8653
8654 // mv /some/thing /to/some/existing_other_thing
8655 if (oldin->is_dir() && !srci->is_dir()) {
f67539c2 8656 respond_to_request(mdr, -CEPHFS_EISDIR);
8657 return;
8658 }
8659 if (!oldin->is_dir() && srci->is_dir()) {
f67539c2 8660 respond_to_request(mdr, -CEPHFS_ENOTDIR);
8661 return;
8662 }
8663 if (srci == oldin && !srcdir->inode->is_stray()) {
8664 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
8665 return;
7c673cae 8666 }
8667 if (destdn->get_alternate_name() != req->get_alternate_name()) {
8668 /* the dentry exists but the alternate_names do not match, fail... */
8669 respond_to_request(mdr, -CEPHFS_EINVAL);
8670 return;
8671 }
8672 }
8673
8674 vector<CDentry*>& srctrace = mdr->dn[1];
8675 vector<CDentry*>& desttrace = mdr->dn[0];
8676
8677 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8678 if (destpath.get_ino() != srcpath.get_ino() &&
8679 !(req->get_source().is_mds() &&
9f95a23c 8680 MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
8681 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
8682 CInode *destbase = desttrace[0]->get_dir()->get_inode();
8683 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8684 while (srcbase != destbase &&
8685 !srcbase->is_projected_ancestor_of(destbase)) {
8686 CDentry *pdn = srcbase->get_projected_parent_dn();
8687 srctrace.insert(srctrace.begin(), pdn);
8688 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
8689 srcbase = pdn->get_dir()->get_inode();
8690 }
8691
8692 // then, extend destpath until it shares the same parent inode as srcpath.
8693 while (destbase != srcbase) {
8694 CDentry *pdn = destbase->get_projected_parent_dn();
8695 desttrace.insert(desttrace.begin(), pdn);
8696 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
8697 destbase = pdn->get_dir()->get_inode();
8698 }
8699 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
8700 }
8701
7c673cae 8702
11fdf7f2 8703 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8704 if (linkmerge)
8705 dout(10) << " this is a link merge" << dendl;
8706
8707 // -- create stray dentry? --
8708 CDentry *straydn = NULL;
8709 if (destdnl->is_primary() && !linkmerge) {
8710 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
8711 if (!straydn)
8712 return;
8713 dout(10) << " straydn is " << *straydn << dendl;
8714 } else if (mdr->straydn) {
8715 mdr->unpin(mdr->straydn);
8716 mdr->straydn = NULL;
8717 }
8718
8719
8720 // -- locks --
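  // All locks are gathered into a single LockOpVec; when src and dest
  // could be locked in either order, the vector is reversed into a
  // canonical order (path comparison, then ino) so concurrent renames
  // over the same inodes cannot deadlock against each other.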
8721 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
8722 MutationImpl::LockOpVec lov;
7c673cae 8723
8724 // we need to update srci's ctime. xlock its least contended lock to do that...
8725 lov.add_xlock(&srci->linklock);
8726 lov.add_xlock(&srci->snaplock);
7c673cae 8727
8728 if (oldin) {
8729 // xlock oldin (for nlink--)
8730 lov.add_xlock(&oldin->linklock);
8731 lov.add_xlock(&oldin->snaplock);
8732 if (oldin->is_dir()) {
8733 ceph_assert(srci->is_dir());
11fdf7f2 8734 lov.add_rdlock(&oldin->filelock); // to verify it's empty
7c673cae 8735
8736 // adjust locking order?
8737 int cmp = mdr->compare_paths();
8738 if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
8739 std::reverse(lov.begin(), lov.end());
8740 } else {
8741 ceph_assert(!srci->is_dir());
8742 // adjust locking order;
8743 if (srci->ino() > oldin->ino())
8744 std::reverse(lov.begin(), lov.end());
8745 }
8746 }
8747
8748 // straydn?
8749 if (straydn) {
8750 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
8751 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
8752 lov.add_xlock(&straydn->lock);
8753 }
8754
8755 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
8756 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
8757 return;
8758
8759 mdr->locking_state |= MutationImpl::ALL_LOCKED;
8760 }
7c673cae 8761
8762 if (linkmerge)
8763 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
8764
7c673cae 8765 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
11fdf7f2 8766 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
8767 return;
8768
8769 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
8770 return;
8771
8772 if (!linkmerge && !check_fragment_space(mdr, destdn->get_dir()))
8773 return;
8774
8775 if (!linkmerge && !check_dir_max_entries(mdr, destdn->get_dir()))
8776 return;
8777
8778 if (!check_access(mdr, srci, MAY_WRITE))
8779 return;
8780 }
8781
8782 // with read lock, really verify oldin is empty
8783 if (oldin &&
8784 oldin->is_dir() &&
8785 _dir_is_nonempty(mdr, oldin)) {
f67539c2 8786 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8787 return;
8788 }
8789
11fdf7f2 8790 /* project_snaprealm_past_parent() will do this job
8791 *
8792 // moving between snaprealms?
8793 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8794 SnapRealm *srcrealm = srci->find_snaprealm();
8795 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8796 if (srcrealm != destrealm &&
8797 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8798 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8799 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8800 mdcache->snaprealm_create(mdr, srci);
8801 return;
8802 }
8803 }
8804 */
8805
8806 SnapRealm *dest_realm = nullptr;
8807 SnapRealm *src_realm = nullptr;
8808 if (!linkmerge) {
8809 dest_realm = destdir->inode->find_snaprealm();
8810 if (srcdir->inode == destdir->inode)
8811 src_realm = dest_realm;
8812 else
8813 src_realm = srcdir->inode->find_snaprealm();
8814 if (src_realm != dest_realm &&
8815 src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
f67539c2 8816 respond_to_request(mdr, -CEPHFS_EXDEV);
8817 return;
8818 }
8819 }
8820
11fdf7f2 8821 ceph_assert(g_conf()->mds_kill_rename_at != 1);
8822
8823 // -- open all srcdn inode frags, if any --
8824 // we need these open so that auth can properly delegate from inode to dirfrags
8825 // after the inode is _ours_.
8826 if (srcdnl->is_primary() &&
8827 !srcdn->is_auth() &&
8828 srci->is_dir()) {
8829 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
8830 mdr->set_stickydirs(srci);
8831
8832 frag_vec_t leaves;
8833 srci->dirfragtree.get_leaves(leaves);
8834 for (const auto& leaf : leaves) {
8835 CDir *dir = srci->get_dirfrag(leaf);
7c673cae 8836 if (!dir) {
8837 dout(10) << " opening " << leaf << " under " << *srci << dendl;
8838 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
8839 return;
8840 }
8841 }
8842 }
8843
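  // Snaprealm prep, in outline: a link merge folding the last remote link
  // back onto the primary drops the inode's global snaprealm; otherwise
  // fresh srnodes for srci/oldin record past or new parents so snapshots
  // taken under the old location remain resolvable after the rename.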
8844 // -- prepare snaprealm ---
8845
8846 if (linkmerge) {
8847 if (!mdr->more()->srci_srnode &&
8848 srci->get_projected_inode()->nlink == 1 &&
8849 srci->is_projected_snaprealm_global()) {
8850 sr_t *new_srnode = srci->prepare_new_srnode(0);
adb31ebb 8851 srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
8852
8853 srci->clear_snaprealm_global(new_srnode);
8854 mdr->more()->srci_srnode = new_srnode;
8855 }
8856 } else {
8857 if (oldin && !mdr->more()->desti_srnode) {
8858 if (oldin->is_projected_snaprealm_global()) {
8859 sr_t *new_srnode = oldin->prepare_new_srnode(0);
adb31ebb 8860 oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
8861 // dropping the last linkage or dropping the last remote linkage,
8862 // detach the inode from the global snaprealm
8863 auto nlink = oldin->get_projected_inode()->nlink;
8864 if (nlink == 1 ||
8865 (nlink == 2 && !destdnl->is_primary() &&
8866 !oldin->get_projected_parent_dir()->inode->is_stray()))
8867 oldin->clear_snaprealm_global(new_srnode);
8868 mdr->more()->desti_srnode = new_srnode;
8869 } else if (destdnl->is_primary()) {
8870 snapid_t follows = dest_realm->get_newest_seq();
8871 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
8872 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
8873 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
8874 mdr->more()->desti_srnode = new_srnode;
8875 }
8876 }
8877 }
8878 if (!mdr->more()->srci_srnode) {
8879 if (srci->is_projected_snaprealm_global()) {
8880 sr_t *new_srnode = srci->prepare_new_srnode(0);
adb31ebb 8881 srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
8882 mdr->more()->srci_srnode = new_srnode;
8883 } else if (srcdnl->is_primary()) {
8884 snapid_t follows = src_realm->get_newest_seq();
8885 if (src_realm != dest_realm &&
8886 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
8887 sr_t *new_srnode = srci->prepare_new_srnode(follows);
8888 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
8889 mdr->more()->srci_srnode = new_srnode;
8890 }
8891 }
8892 }
8893 }
8894
8895 // -- prepare witnesses --
8896
8897 /*
8898 * NOTE: we use _all_ replicas as witnesses.
8899 * this probably isn't totally necessary (esp for file renames),
8900 * but if/when we change that, we have to make sure rejoin is
8901 * sufficiently robust to handle strong rejoins from survivors
8902 * with totally wrong dentry->inode linkage.
8903 * (currently, it can ignore rename effects, because the resolve
8904 * stage will sort them out.)
8905 */
8906 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
8907 if (srcdn->is_auth())
8908 srcdn->list_replicas(witnesses);
8909 else
8910 witnesses.insert(srcdn->authority().first);
8911 if (srcdnl->is_remote() && !srci->is_auth())
8912 witnesses.insert(srci->authority().first);
8913 destdn->list_replicas(witnesses);
8914 if (destdnl->is_remote() && !oldin->is_auth())
8915 witnesses.insert(oldin->authority().first);
8916 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
8917
8918 if (!witnesses.empty()) {
8919 // Replicas can't see projected dentry linkages and will get confused.
8920 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8921 // can't project these inodes' linkages.
8922 bool need_flush = false;
8923 for (auto& dn : srctrace) {
8924 if (dn->is_projected()) {
8925 need_flush = true;
8926 break;
8927 }
8928 }
8929 if (!need_flush) {
8930 CDentry *dn = destdn;
8931 do {
8932 if (dn->is_projected()) {
8933 need_flush = true;
8934 break;
8935 }
8936 CInode *diri = dn->get_dir()->get_inode();
8937 dn = diri->get_projected_parent_dn();
8938 } while (dn);
8939 }
8940 if (need_flush) {
8941 mdlog->wait_for_safe(
8942 new MDSInternalContextWrapper(mds,
8943 new C_MDS_RetryRequest(mdcache, mdr)));
8944 mdlog->flush();
8945 return;
8946 }
8947 }
8948
8949 // do srcdn auth last
8950 mds_rank_t last = MDS_RANK_NONE;
8951 if (!srcdn->is_auth()) {
8952 last = srcdn->authority().first;
8953 mdr->more()->srcdn_auth_mds = last;
8954 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8955 // are involved in the rename operation.
8956 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
8957 dout(10) << " preparing ambiguous auth for srci" << dendl;
8958 ceph_assert(mdr->more()->is_remote_frozen_authpin);
8959 ceph_assert(mdr->more()->rename_inode == srci);
8960 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8961 return;
8962 }
8963 }
8964
8965 for (set<mds_rank_t>::iterator p = witnesses.begin();
8966 p != witnesses.end();
8967 ++p) {
8968 if (*p == last) continue; // do it last!
8969 if (mdr->more()->witnessed.count(*p)) {
8970 dout(10) << " already witnessed by mds." << *p << dendl;
f67539c2 8971 } else if (mdr->more()->waiting_on_peer.count(*p)) {
8972 dout(10) << " already waiting on witness mds." << *p << dendl;
8973 } else {
8974 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
8975 return;
8976 }
8977 }
f67539c2 8978 if (!mdr->more()->waiting_on_peer.empty())
8979 return; // we're waiting for a witness.
8980
8981 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
8982 dout(10) << " preparing last witness (srcdn auth)" << dendl;
f67539c2 8983 ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
8984 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8985 return;
8986 }
8987
8988 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
8989 if (!mdr->more()->peers.empty() && !srci->is_dir())
11fdf7f2 8990 ceph_assert(g_conf()->mds_kill_rename_at != 3);
f67539c2 8991 if (!mdr->more()->peers.empty() && srci->is_dir())
11fdf7f2 8992 ceph_assert(g_conf()->mds_kill_rename_at != 4);
7c673cae
FG
8993
8994 // -- declare now --
8995 mdr->set_mds_stamp(ceph_clock_now());
8996
8997 // -- prepare journal entry --
8998 mdr->ls = mdlog->get_current_segment();
8999 EUpdate *le = new EUpdate(mdlog, "rename");
9000 mdlog->start_entry(le);
f67539c2 9001 le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
7c673cae 9002 if (!mdr->more()->witnessed.empty()) {
f67539c2 9003 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7c673cae
FG
9004
9005 le->reqid = mdr->reqid;
f67539c2 9006 le->had_peers = true;
7c673cae 9007
f67539c2 9008 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7c673cae
FG
9009 // no need to send frozen auth pin to recovering auth MDS of srci
9010 mdr->more()->is_remote_frozen_authpin = false;
9011 }
9012
f67539c2 9013 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
7c673cae
FG
9014 if (le->client_map.length())
9015 le->cmapv = mds->sessionmap.get_projected();
9016
9017 // -- commit locally --
9018 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
9019
9020 journal_and_reply(mdr, srci, destdn, le, fin);
81eedcae 9021 mds->balancer->maybe_fragment(destdn->get_dir(), false);
7c673cae
FG
9022}
9023
9024
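// Leader-side completion, called once the EUpdate is safe: apply the
// projected rename, broadcast the new dentry link, finish any pending cap
// import on the renamed inode, and reply to the client.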
9025void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9026{
9027 dout(10) << "_rename_finish " << *mdr << dendl;
9028
9029 if (!mdr->more()->witnessed.empty())
f67539c2 9030 mdcache->logged_leader_update(mdr->reqid);
7c673cae
FG
9031
9032 // apply
9033 _rename_apply(mdr, srcdn, destdn, straydn);
9034
9035 mdcache->send_dentry_link(destdn, mdr);
9036
9037 CDentry::linkage_t *destdnl = destdn->get_linkage();
9038 CInode *in = destdnl->get_inode();
9039 bool need_eval = mdr->more()->cap_imports.count(in);
9040
f67539c2
TL
9041 // test hack: test peer commit
9042 if (!mdr->more()->peers.empty() && !in->is_dir())
11fdf7f2 9043 ceph_assert(g_conf()->mds_kill_rename_at != 5);
f67539c2 9044 if (!mdr->more()->peers.empty() && in->is_dir())
11fdf7f2 9045 ceph_assert(g_conf()->mds_kill_rename_at != 6);
7c673cae
FG
9046
9047 // bump popularity
11fdf7f2 9048 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
7c673cae 9049 if (destdnl->is_remote() && in->is_auth())
11fdf7f2 9050 mds->balancer->hit_inode(in, META_POP_IWR);
7c673cae
FG
9051
9052 // did we import srci? if so, explicitly ack that import before we unlock and reply.
9053
11fdf7f2 9054 ceph_assert(g_conf()->mds_kill_rename_at != 7);
7c673cae
FG
9055
9056 // reply
9057 respond_to_request(mdr, 0);
9058
9059 if (need_eval)
9060 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
9061
9062 // clean up?
9063 // respond_to_request() drops locks. So stray reintegration can race with us.
9064 if (straydn && !straydn->get_projected_linkage()->is_null()) {
9065 mdcache->notify_stray(straydn);
9066 }
9067}
9068
9069
9070
9071// helpers
9072
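// Send an OP_RENAMEPREP peer request to 'who', carrying the src/dest
// dentry paths, the expected witness list, the replicated stray dentry
// (if any) and the projected snaprealm blobs, then record 'who' in
// waiting_on_peer so the caller can wait for the ack.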
9073 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
9074 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
9075{
f67539c2
TL
9076 const auto& client_req = mdr->client_request;
9077 ceph_assert(client_req);
9078
7c673cae
FG
9079 if (mds->is_cluster_degraded() &&
9080 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
9081 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
f67539c2 9082 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
9083 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
9084 return false;
9085 }
9086
9087 dout(10) << "_rename_prepare_witness mds." << who << dendl;
f67539c2 9088 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);
7c673cae
FG
9089
9090 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
9091 for (auto dn : srctrace)
94b18763 9092 req->srcdnpath.push_dentry(dn->get_name());
7c673cae
FG
9093 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
9094 for (auto dn : dsttrace)
94b18763 9095 req->destdnpath.push_dentry(dn->get_name());
f67539c2 9096 req->alternate_name = client_req->alternate_name;
7c673cae 9097 if (straydn)
9f95a23c 9098 mdcache->encode_replica_stray(straydn, who, req->straybl);
11fdf7f2
TL
9099
9100 if (mdr->more()->srci_srnode)
9101 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
9102 if (mdr->more()->desti_srnode)
9103 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
31f18b77
FG
9104
9105 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
7c673cae
FG
9106
9107 // srcdn auth will verify our current witness list is sufficient
9108 req->witnesses = witnesses;
9109
9110 req->op_stamp = mdr->get_op_stamp();
9111 mds->send_message_mds(req, who);
9112
f67539c2
TL
9113 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
9114 mdr->more()->waiting_on_peer.insert(who);
7c673cae
FG
9115 return true;
9116}
9117
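// Decode the inode export bundle sent by the srcdn auth: the client map
// and client metadata (forcing those sessions open for the imported
// caps), then the inode itself. Returns the pre-import inode version.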
9118version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
9119{
9120 version_t oldpv = mdr->more()->inode_import_v;
9121
9122 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9123
9124 /* import node */
11fdf7f2 9125 auto blp = mdr->more()->inode_import.cbegin();
7c673cae
FG
9126
9127 // imported caps
28e407b8 9128 map<client_t,entity_inst_t> client_map;
11fdf7f2 9129 map<client_t, client_metadata_t> client_metadata_map;
28e407b8 9130 decode(client_map, blp);
11fdf7f2
TL
9131 decode(client_metadata_map, blp);
9132 prepare_force_open_sessions(client_map, client_metadata_map,
9133 mdr->more()->imported_session_map);
28e407b8 9134 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
11fdf7f2 9135 encode(client_metadata_map, *client_map_bl);
7c673cae
FG
9136
9137 list<ScatterLock*> updated_scatterlocks;
9138 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
9139 mdr->more()->cap_imports, updated_scatterlocks);
9140
9141 // hack: force back to !auth and clean, temporarily
9142 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
9143 srcdnl->get_inode()->mark_clean();
9144
9145 return oldpv;
9146}
9147
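// Decide whether a non-auth dentry update must still be journaled here:
// true if any dirfrag of diri (the 'empty' case) or any subtree nested
// beneath one of its dirfrags is an auth subtree on this rank.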
9148bool Server::_need_force_journal(CInode *diri, bool empty)
9149{
9f95a23c 9150 auto&& dirs = diri->get_dirfrags();
7c673cae
FG
9151
9152 bool force_journal = false;
9153 if (empty) {
11fdf7f2
TL
9154 for (const auto& dir : dirs) {
9155 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
9156 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
7c673cae
FG
9157 force_journal = true;
9158 break;
9159 } else
11fdf7f2 9160 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
7c673cae
FG
9161 }
9162 } else {
9163 // see if any children of our frags are auth subtrees.
11fdf7f2
TL
9164 std::vector<CDir*> subtrees;
9165 mdcache->get_subtrees(subtrees);
9166 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
9167 for (const auto& dir : dirs) {
9168 for (const auto& subtree : subtrees) {
9169 if (dir->contains(subtree)) {
9170 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
9171 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
9172 << *subtree << dendl;
7c673cae
FG
9173 force_journal = true;
9174 break;
9175 } else
11fdf7f2 9176 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
7c673cae 9177 } else
11fdf7f2 9178 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
7c673cae
FG
9179 }
9180 if (force_journal)
9181 break;
9182 }
9183 }
9184 return force_journal;
9185}
9186
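// Project the whole rename into the metablob: unlink or nlink-- the
// rename target, push the new dest linkage and a null linkage for srcdn,
// predirty every affected parent, then journal the dirtied dentries and
// inodes in an order that is safe for journal replay.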
9187void Server::_rename_prepare(MDRequestRef& mdr,
9188 EMetaBlob *metablob, bufferlist *client_map_bl,
f67539c2
TL
9189 CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
9190 CDentry *straydn)
7c673cae
FG
9191{
9192 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
9193 if (straydn)
9194 dout(10) << " straydn " << *straydn << dendl;
9195
9196 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9197 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9198 CInode *srci = srcdnl->get_inode();
9199 CInode *oldin = destdnl->get_inode();
9200
9201 // primary+remote link merge?
11fdf7f2
TL
9202 bool linkmerge = (srci == oldin);
9203 if (linkmerge)
9204 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
7c673cae
FG
9205 bool silent = srcdn->get_dir()->inode->is_stray();
9206
9207 bool force_journal_dest = false;
9208 if (srci->is_dir() && !destdn->is_auth()) {
9209 if (srci->is_auth()) {
9210 // if we are auth for srci and exporting it, force journal because journal replay needs
9211 // the source inode to create auth subtrees.
9212 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
9213 force_journal_dest = true;
9214 } else
9215 force_journal_dest = _need_force_journal(srci, false);
9216 }
9217
9218 bool force_journal_stray = false;
9219 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
9220 force_journal_stray = _need_force_journal(oldin, true);
9221
9222 if (linkmerge)
9223 dout(10) << " merging remote and primary links to the same inode" << dendl;
9224 if (silent)
9225 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
9226 if (force_journal_dest)
9227 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
9228 if (force_journal_stray)
9229 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
9230
9231 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
9232 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
9233 metablob->renamed_dirino = srci->ino();
9234 } else if (oldin && oldin->is_dir() && force_journal_stray) {
9235 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
9236 metablob->renamed_dirino = oldin->ino();
9237 }
9238
9239 // prepare
94b18763
FG
9240 CInode::mempool_inode *spi = nullptr; // renamed inode
9241 CInode::mempool_inode *tpi = nullptr; // target/overwritten inode
7c673cae
FG
9242
9243 // target inode
9244 if (!linkmerge) {
9245 if (destdnl->is_primary()) {
11fdf7f2 9246 ceph_assert(straydn); // moving to straydn.
7c673cae
FG
9247 // link--, and move.
9248 if (destdn->is_auth()) {
f67539c2
TL
9249 auto pi = oldin->project_inode(mdr); // project_snaprealm
9250 pi.inode->version = straydn->pre_dirty(pi.inode->version);
9251 pi.inode->update_backtrace();
9252 tpi = pi.inode.get();
7c673cae
FG
9253 }
9254 straydn->push_projected_linkage(oldin);
9255 } else if (destdnl->is_remote()) {
9256 // nlink-- targeti
9257 if (oldin->is_auth()) {
f67539c2
TL
9258 auto pi = oldin->project_inode(mdr);
9259 pi.inode->version = oldin->pre_dirty();
9260 tpi = pi.inode.get();
7c673cae
FG
9261 }
9262 }
9263 }
9264
9265 // dest
f67539c2
TL
9266 if (destdnl->is_null()) {
9267 /* handle_client_rename checks that alternate_name matches for existing destdn */
9268 destdn->set_alternate_name(alternate_name);
9269 }
7c673cae
FG
9270 if (srcdnl->is_remote()) {
9271 if (!linkmerge) {
9272 // destdn
9273 if (destdn->is_auth())
9274 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
9275 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
9276 // srci
9277 if (srci->is_auth()) {
f67539c2
TL
9278 auto pi = srci->project_inode(mdr);
9279 pi.inode->version = srci->pre_dirty();
9280 spi = pi.inode.get();
7c673cae
FG
9281 }
9282 } else {
9283 dout(10) << " will merge remote onto primary link" << dendl;
9284 if (destdn->is_auth()) {
f67539c2
TL
9285 auto pi = oldin->project_inode(mdr);
9286 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
9287 spi = pi.inode.get();
7c673cae
FG
9288 }
9289 }
9290 } else { // primary
9291 if (destdn->is_auth()) {
9292 version_t oldpv;
9293 if (srcdn->is_auth())
9294 oldpv = srci->get_projected_version();
9295 else {
9296 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
9297
9298 // note which dirfrags have child subtrees in the journal
9299 // event, so that we can open those (as bounds) during replay.
9300 if (srci->is_dir()) {
9f95a23c
TL
9301 auto&& ls = srci->get_dirfrags();
9302 for (const auto& dir : ls) {
7c673cae
FG
9303 if (!dir->is_auth())
9304 metablob->renamed_dir_frags.push_back(dir->get_frag());
9305 }
9306 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
9307 }
9308 }
f67539c2 9309 auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
7c673cae 9310 // & srcdnl->snaprealm
f67539c2
TL
9311 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
9312 pi.inode->update_backtrace();
9313 spi = pi.inode.get();
7c673cae
FG
9314 }
9315 destdn->push_projected_linkage(srci);
9316 }
9317
9318 // src
9319 if (srcdn->is_auth())
9320 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
9321 srcdn->push_projected_linkage(); // push null linkage
9322
9323 if (!silent) {
94b18763 9324 if (spi) {
91327a77
AA
9325 spi->ctime = mdr->get_op_stamp();
9326 if (mdr->get_op_stamp() > spi->rstat.rctime)
9327 spi->rstat.rctime = mdr->get_op_stamp();
94b18763 9328 spi->change_attr++;
7c673cae 9329 if (linkmerge)
94b18763 9330 spi->nlink--;
7c673cae
FG
9331 }
9332 if (tpi) {
91327a77
AA
9333 tpi->ctime = mdr->get_op_stamp();
9334 if (mdr->get_op_stamp() > tpi->rstat.rctime)
9335 tpi->rstat.rctime = mdr->get_op_stamp();
7c673cae 9336 tpi->change_attr++;
94b18763
FG
9337 {
9338 std::string t;
9339 destdn->make_path_string(t, true);
11fdf7f2 9340 tpi->stray_prior_path = std::move(t);
94b18763 9341 }
7c673cae
FG
9342 tpi->nlink--;
9343 if (tpi->nlink == 0)
9344 oldin->state_set(CInode::STATE_ORPHAN);
9345 }
9346 }
9347
9348 // prepare nesting, mtime updates
9349 int predirty_dir = silent ? 0:PREDIRTY_DIR;
9350
9351 // guarantee stray dir is processed first during journal replay. unlink the old inode,
9352 // then link the source inode to destdn
9353 if (destdnl->is_primary()) {
11fdf7f2 9354 ceph_assert(straydn);
7c673cae
FG
9355 if (straydn->is_auth()) {
9356 metablob->add_dir_context(straydn->get_dir());
9357 metablob->add_dir(straydn->get_dir(), true);
9358 }
9359 }
9360
f67539c2
TL
9361 if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
9362 CDir *oldin_dir = oldin->get_projected_parent_dir();
9363 if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
9364 mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
9365 }
9366
7c673cae
FG
9367 // sub off target
9368 if (destdn->is_auth() && !destdnl->is_null()) {
9369 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
9370 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
224ce89b 9371 if (destdnl->is_primary()) {
11fdf7f2 9372 ceph_assert(straydn);
7c673cae
FG
9373 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
9374 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
224ce89b 9375 }
7c673cae 9376 }
f67539c2
TL
9377
9378 if (srcdnl->is_remote() && srci->is_auth()) {
9379 CDir *srci_dir = srci->get_projected_parent_dir();
9380 if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
9381 mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
9382 }
7c673cae
FG
9383
9384 // move srcdn
9385 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
9386 int flags = predirty_dir | predirty_primary;
9387 if (srcdn->is_auth())
9388 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
9389 if (destdn->is_auth())
9390 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
9391
7c673cae
FG
9392 // add it all to the metablob
9393 // target inode
9394 if (!linkmerge) {
9395 if (destdnl->is_primary()) {
11fdf7f2 9396 ceph_assert(straydn);
7c673cae
FG
9397 if (destdn->is_auth()) {
9398 // project snaprealm, too
11fdf7f2
TL
9399 if (auto& desti_srnode = mdr->more()->desti_srnode) {
9400 oldin->project_snaprealm(desti_srnode);
9401 if (tpi->nlink == 0)
9402 ceph_assert(!desti_srnode->is_parent_global());
9403 desti_srnode = NULL;
9404 }
9405 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae
FG
9406 metablob->add_primary_dentry(straydn, oldin, true, true);
9407 } else if (force_journal_stray) {
9408 dout(10) << " forced journaling straydn " << *straydn << dendl;
9409 metablob->add_dir_context(straydn->get_dir());
9410 metablob->add_primary_dentry(straydn, oldin, true);
9411 }
9412 } else if (destdnl->is_remote()) {
9413 if (oldin->is_auth()) {
11fdf7f2 9414 sr_t *new_srnode = NULL;
f67539c2
TL
9415 if (mdr->peer_request) {
9416 if (mdr->peer_request->desti_snapbl.length() > 0) {
11fdf7f2 9417 new_srnode = new sr_t();
f67539c2 9418 auto p = mdr->peer_request->desti_snapbl.cbegin();
11fdf7f2
TL
9419 decode(*new_srnode, p);
9420 }
9421 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9422 new_srnode = desti_srnode;
9423 desti_srnode = NULL;
9424 }
9425 if (new_srnode) {
9426 oldin->project_snaprealm(new_srnode);
9427 if (tpi->nlink == 0)
9428 ceph_assert(!new_srnode->is_parent_global());
9429 }
7c673cae 9430 // auth for targeti
f67539c2
TL
9431 CDentry *oldin_pdn = oldin->get_projected_parent_dn();
9432 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
9433 metablob->add_primary_dentry(oldin_pdn, oldin, true);
7c673cae
FG
9434 }
9435 }
9436 }
9437
9438 // dest
9439 if (srcdnl->is_remote()) {
11fdf7f2
TL
9440 ceph_assert(!linkmerge);
9441 if (destdn->is_auth() && !destdnl->is_null())
9442 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
9443 else
9444 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae 9445
11fdf7f2
TL
9446 if (destdn->is_auth())
9447 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
9448
9449 if (srci->is_auth()) { // it's remote
f67539c2
TL
9450 if (mdr->peer_request) {
9451 if (mdr->peer_request->srci_snapbl.length() > 0) {
11fdf7f2 9452 sr_t *new_srnode = new sr_t();
f67539c2 9453 auto p = mdr->peer_request->srci_snapbl.cbegin();
11fdf7f2
TL
9454 decode(*new_srnode, p);
9455 srci->project_snaprealm(new_srnode);
9456 }
9457 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9458 srci->project_snaprealm(srci_srnode);
9459 srci_srnode = NULL;
7c673cae 9460 }
7c673cae 9461
11fdf7f2 9462 CDentry *srci_pdn = srci->get_projected_parent_dn();
f67539c2 9463 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
11fdf7f2 9464 metablob->add_primary_dentry(srci_pdn, srci, true);
7c673cae
FG
9465 }
9466 } else if (srcdnl->is_primary()) {
9467 // project snap parent update?
11fdf7f2
TL
9468 if (destdn->is_auth()) {
9469 if (auto& srci_srnode = mdr->more()->srci_srnode) {
9470 srci->project_snaprealm(srci_srnode);
9471 srci_srnode = NULL;
9472 }
9473 }
7c673cae
FG
9474
9475 if (destdn->is_auth() && !destdnl->is_null())
9476 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
11fdf7f2
TL
9477
9478 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
1e59de90
TL
9479 {
9480 auto do_corruption = inject_rename_corrupt_dentry_first;
9481 if (unlikely(do_corruption > 0.0)) {
9482 auto r = ceph::util::generate_random_number(0.0, 1.0);
9483 if (r < do_corruption) {
9484 dout(0) << "corrupting dn: " << *destdn << dendl;
9485 destdn->first = -10;
9486 }
9487 }
9488 }
7c673cae
FG
9489
9490 if (destdn->is_auth())
9491 metablob->add_primary_dentry(destdn, srci, true, true);
9492 else if (force_journal_dest) {
9493 dout(10) << " forced journaling destdn " << *destdn << dendl;
9494 metablob->add_dir_context(destdn->get_dir());
9495 metablob->add_primary_dentry(destdn, srci, true);
9496 if (srcdn->is_auth() && srci->is_dir()) {
9497 // journal new subtrees root dirfrags
9f95a23c
TL
9498 auto&& ls = srci->get_dirfrags();
9499 for (const auto& dir : ls) {
7c673cae
FG
9500 if (dir->is_auth())
9501 metablob->add_dir(dir, true);
9502 }
9503 }
9504 }
9505 }
9506
9507 // src
9508 if (srcdn->is_auth()) {
9509 dout(10) << " journaling srcdn " << *srcdn << dendl;
9510 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
f67539c2 9511 // also journal the inode in case we need to do peer rename rollback. It is OK to add
7c673cae
FG
9512 // both primary and NULL dentries, because during journal replay the null
9513 // dentry is processed after the primary dentry.
9514 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
9515 metablob->add_primary_dentry(srcdn, srci, true);
9516 metablob->add_null_dentry(srcdn, true);
9517 } else
9518 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
9519
9520 // make the renamed inode's 'first' track the destdn's
11fdf7f2
TL
9521 if (srcdnl->is_primary() && destdn->is_auth()) {
9522 ceph_assert(srci->first <= destdn->first);
9523 srci->first = destdn->first;
9524 }
9525 // make the stray inode's 'first' track the straydn's
9526 if (straydn && straydn->is_auth()) {
9527 ceph_assert(oldin->first <= straydn->first);
9528 oldin->first = straydn->first;
9529 }
7c673cae 9530
224ce89b 9531 if (oldin && oldin->is_dir()) {
11fdf7f2 9532 ceph_assert(straydn);
7c673cae 9533 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
224ce89b 9534 }
7c673cae
FG
9535 if (srci->is_dir())
9536 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
9537
9538}
9539
9540
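// Apply the projected rename to the in-memory cache. The ordering matters:
// pop projected snaprealms first (they may split caps), unlink the target
// and source, pop the projected linkages, then finish any cap import and
// auth-bit fixup if srci just migrated to us.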
9541void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9542{
9543 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
9544 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
9545
9546 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9547 CDentry::linkage_t *destdnl = destdn->get_linkage();
9548
9549 CInode *oldin = destdnl->get_inode();
7c673cae
FG
9550
9551 // primary+remote link merge?
11fdf7f2
TL
9552 bool linkmerge = (srcdnl->get_inode() == oldin);
9553 if (linkmerge)
aee94f69 9554 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
11fdf7f2
TL
9555
9556 bool new_in_snaprealm = false;
9557 bool new_oldin_snaprealm = false;
7c673cae
FG
9558
9559 // target inode
9560 if (!linkmerge) {
9561 if (destdnl->is_primary()) {
11fdf7f2 9562 ceph_assert(straydn);
7c673cae 9563 dout(10) << "straydn is " << *straydn << dendl;
11fdf7f2
TL
9564
9565 // if there is newly created snaprealm, need to split old snaprealm's
9566 // inodes_with_caps. So pop snaprealm before linkage changes.
9567 if (destdn->is_auth()) {
9568 bool hadrealm = (oldin->snaprealm ? true : false);
9569 oldin->early_pop_projected_snaprealm();
9570 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
9571 } else {
f67539c2
TL
9572 ceph_assert(mdr->peer_request);
9573 if (mdr->peer_request->desti_snapbl.length()) {
11fdf7f2 9574 new_oldin_snaprealm = !oldin->snaprealm;
f67539c2 9575 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
11fdf7f2 9576 ceph_assert(oldin->snaprealm);
11fdf7f2
TL
9577 }
9578 }
9579
31f18b77 9580 destdn->get_dir()->unlink_inode(destdn, false);
7c673cae
FG
9581
9582 straydn->pop_projected_linkage();
f67539c2 9583 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 9584 ceph_assert(!straydn->is_projected()); // no other projected
7c673cae
FG
9585
9586 // nlink-- targeti
11fdf7f2 9587 if (destdn->is_auth())
f67539c2 9588 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
11fdf7f2
TL
9589
9590 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
7c673cae 9591 } else if (destdnl->is_remote()) {
31f18b77 9592 destdn->get_dir()->unlink_inode(destdn, false);
11fdf7f2 9593 if (oldin->is_auth()) {
f67539c2
TL
9594 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9595 } else if (mdr->peer_request) {
9596 if (mdr->peer_request->desti_snapbl.length() > 0) {
11fdf7f2 9597 ceph_assert(oldin->snaprealm);
f67539c2 9598 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
11fdf7f2
TL
9599 }
9600 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
9601 delete desti_srnode;
9602 desti_srnode = NULL;
9603 }
7c673cae
FG
9604 }
9605 }
9606
9607 // unlink src before we relink it at dest
9608 CInode *in = srcdnl->get_inode();
11fdf7f2 9609 ceph_assert(in);
7c673cae
FG
9610
9611 bool srcdn_was_remote = srcdnl->is_remote();
11fdf7f2
TL
9612 if (!srcdn_was_remote) {
9613 // if there is newly created snaprealm, need to split old snaprealm's
9614 // inodes_with_caps. So pop snaprealm before linkage changes.
9615 if (destdn->is_auth()) {
9616 bool hadrealm = (in->snaprealm ? true : false);
9617 in->early_pop_projected_snaprealm();
9618 new_in_snaprealm = (in->snaprealm && !hadrealm);
9619 } else {
f67539c2
TL
9620 ceph_assert(mdr->peer_request);
9621 if (mdr->peer_request->srci_snapbl.length()) {
11fdf7f2 9622 new_in_snaprealm = !in->snaprealm;
f67539c2 9623 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
11fdf7f2 9624 ceph_assert(in->snaprealm);
11fdf7f2
TL
9625 }
9626 }
9627 }
9628
7c673cae
FG
9629 srcdn->get_dir()->unlink_inode(srcdn);
9630
9631 // dest
9632 if (srcdn_was_remote) {
9633 if (!linkmerge) {
9634 // destdn
9635 destdnl = destdn->pop_projected_linkage();
f67539c2 9636 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 9637 ceph_assert(!destdn->is_projected()); // no other projected
7c673cae
FG
9638
9639 destdn->link_remote(destdnl, in);
9640 if (destdn->is_auth())
9641 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
9642 // in
11fdf7f2 9643 if (in->is_auth()) {
f67539c2
TL
9644 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9645 } else if (mdr->peer_request) {
9646 if (mdr->peer_request->srci_snapbl.length() > 0) {
11fdf7f2 9647 ceph_assert(in->snaprealm);
f67539c2 9648 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
11fdf7f2
TL
9649 }
9650 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9651 delete srci_srnode;
9652 srci_srnode = NULL;
9653 }
7c673cae
FG
9654 } else {
9655 dout(10) << "merging remote onto primary link" << dendl;
f67539c2 9656 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
7c673cae
FG
9657 }
9658 } else { // primary
9659 if (linkmerge) {
9660 dout(10) << "merging primary onto remote link" << dendl;
31f18b77 9661 destdn->get_dir()->unlink_inode(destdn, false);
7c673cae
FG
9662 }
9663 destdnl = destdn->pop_projected_linkage();
f67539c2 9664 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 9665 ceph_assert(!destdn->is_projected()); // no other projected
7c673cae
FG
9666
9667 // srcdn inode import?
9668 if (!srcdn->is_auth() && destdn->is_auth()) {
11fdf7f2 9669 ceph_assert(mdr->more()->inode_import.length() > 0);
7c673cae
FG
9670
9671 map<client_t,Capability::Import> imported_caps;
9672
9673 // finish cap imports
28e407b8 9674 finish_force_open_sessions(mdr->more()->imported_session_map);
7c673cae
FG
9675 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
9676 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
28e407b8
AA
9677 mdr->more()->srcdn_auth_mds, true,
9678 mdr->more()->imported_session_map,
9679 mdr->more()->cap_imports[destdnl->get_inode()],
9680 imported_caps);
7c673cae
FG
9681 }
9682
9683 mdr->more()->inode_import.clear();
11fdf7f2 9684 encode(imported_caps, mdr->more()->inode_import);
7c673cae
FG
9685
9686 /* hack: add an auth pin for each xlock we hold. These were
9687 * remote xlocks previously but now they're local and
9688 * we're going to try to unpin them when we xlock_finish. */
11fdf7f2
TL
9689
9690 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
9691 i != mdr->locks.end();
9692 ++i) {
9693 SimpleLock *lock = i->lock;
9694 if (lock->get_parent() != destdnl->get_inode())
9695 break;
9696 if (i->is_xlock() && !lock->is_locallock())
9697 mds->locker->xlock_import(lock);
9698 }
7c673cae
FG
9699
9700 // hack: fix auth bit
9701 in->state_set(CInode::STATE_AUTH);
7c673cae
FG
9702
9703 mdr->clear_ambiguous_auth();
9704 }
9705
11fdf7f2 9706 if (destdn->is_auth())
f67539c2 9707 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
7c673cae
FG
9708 }
9709
9710 // src
9711 if (srcdn->is_auth())
9712 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
9713 srcdn->pop_projected_linkage();
f67539c2 9714 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
11fdf7f2 9715 ceph_assert(!srcdn->is_projected()); // no other projected
7c673cae
FG
9716
9717 // apply remaining projected inodes (nested)
9718 mdr->apply();
9719
9720 // update subtree map?
11fdf7f2 9721 if (destdnl->is_primary() && in->is_dir())
224ce89b 9722 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
7c673cae
FG
9723
9724 if (straydn && oldin->is_dir())
9725 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
9726
11fdf7f2
TL
9727 if (new_oldin_snaprealm)
9728 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
9729 if (new_in_snaprealm)
9730 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
9731
7c673cae
FG
9732 // removing a new dn?
9733 if (srcdn->is_auth())
9734 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
9735}
9736
9737
9738
9739// ------------
f67539c2 9740// PEER
7c673cae 9741
f67539c2 9742class C_MDS_PeerRenamePrep : public ServerLogContext {
7c673cae
FG
9743 CDentry *srcdn, *destdn, *straydn;
9744public:
f67539c2 9745 C_MDS_PeerRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7c673cae
FG
9746 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
9747 void finish(int r) override {
f67539c2 9748 server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
7c673cae
FG
9749 }
9750};
9751
f67539c2 9752class C_MDS_PeerRenameCommit : public ServerContext {
7c673cae
FG
9753 MDRequestRef mdr;
9754 CDentry *srcdn, *destdn, *straydn;
9755public:
f67539c2 9756 C_MDS_PeerRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
7c673cae
FG
9757 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
9758 void finish(int r) override {
f67539c2 9759 server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
7c673cae
FG
9760 }
9761};
9762
f67539c2 9763class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
7c673cae
FG
9764 MDRequestRef mdr;
9765public:
f67539c2 9766 C_MDS_PeerRenameSessionsFlushed(Server *s, MDRequestRef& r) :
7c673cae
FG
9767 ServerContext(s), mdr(r) {}
9768 void finish(int r) override {
f67539c2 9769 server->_peer_rename_sessions_flushed(mdr);
7c673cae
FG
9770 }
9771};
9772
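// Peer-side prepare: discover srcdn/destdn, freeze srci and set ambiguous
// auth when we are the srcdn auth, verify the leader's witness list is
// sufficient, encode a rollback record of the original state, and journal
// an EPeerUpdate OP_PREPARE before acking.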
f67539c2 9773void Server::handle_peer_rename_prep(MDRequestRef& mdr)
7c673cae 9774{
f67539c2
TL
9775 dout(10) << "handle_peer_rename_prep " << *mdr
9776 << " " << mdr->peer_request->srcdnpath
9777 << " to " << mdr->peer_request->destdnpath
7c673cae 9778 << dendl;
31f18b77 9779
f67539c2
TL
9780 if (mdr->peer_request->is_interrupted()) {
9781 dout(10) << " peer request interrupted, sending noop reply" << dendl;
9782 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
31f18b77 9783 reply->mark_interrupted();
f67539c2
TL
9784 mds->send_message_mds(reply, mdr->peer_to_mds);
9785 mdr->reset_peer_request();
31f18b77
FG
9786 return;
9787 }
9788
7c673cae 9789 // discover destdn
f67539c2 9790 filepath destpath(mdr->peer_request->destdnpath);
7c673cae
FG
9791 dout(10) << " dest " << destpath << dendl;
9792 vector<CDentry*> trace;
f67539c2 9793 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
9f95a23c
TL
9794 int r = mdcache->path_traverse(mdr, cf, destpath,
9795 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
9796 &trace);
7c673cae 9797 if (r > 0) return;
f67539c2 9798 if (r == -CEPHFS_ESTALE) {
7c673cae 9799 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
f67539c2 9800 mdr->peer_to_mds, true);
7c673cae
FG
9801 return;
9802 }
11fdf7f2 9803 ceph_assert(r == 0); // we shouldn't get an error here!
7c673cae 9804
91327a77 9805 CDentry *destdn = trace.back();
7c673cae
FG
9806 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9807 dout(10) << " destdn " << *destdn << dendl;
9808 mdr->pin(destdn);
9809
9810 // discover srcdn
f67539c2 9811 filepath srcpath(mdr->peer_request->srcdnpath);
7c673cae
FG
9812 dout(10) << " src " << srcpath << dendl;
9813 CInode *srci = nullptr;
9f95a23c
TL
9814 r = mdcache->path_traverse(mdr, cf, srcpath,
9815 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
9816 &trace, &srci);
7c673cae 9817 if (r > 0) return;
11fdf7f2 9818 ceph_assert(r == 0);
7c673cae 9819
91327a77 9820 CDentry *srcdn = trace.back();
7c673cae
FG
9821 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9822 dout(10) << " srcdn " << *srcdn << dendl;
9823 mdr->pin(srcdn);
9824 mdr->pin(srci);
9825
9826 // stray?
11fdf7f2
TL
9827 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
9828 if (linkmerge)
9829 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
7c673cae
FG
9830 CDentry *straydn = mdr->straydn;
9831 if (destdnl->is_primary() && !linkmerge)
11fdf7f2 9832 ceph_assert(straydn);
7c673cae 9833
f67539c2 9834 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7c673cae
FG
9835 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
9836
9837 // set up commit waiter (early, to clean up any freezing etc we do)
f67539c2
TL
9838 if (!mdr->more()->peer_commit)
9839 mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);
7c673cae
FG
9840
9841 // am i srcdn auth?
9842 if (srcdn->is_auth()) {
9843 set<mds_rank_t> srcdnrep;
9844 srcdn->list_replicas(srcdnrep);
9845
9846 bool reply_witness = false;
9847 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
9848 // freeze?
9849 // we need this to
9850 // - avoid conflicting lock state changes
9851 // - avoid concurrent updates to the inode
9852 // (this could also be accomplished with the versionlock)
11fdf7f2 9853 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
7c673cae
FG
9854 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
9855 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
9856
9857 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9858 if (srcdnl->get_inode()->is_frozen_auth_pin())
9859 mdr->unfreeze_auth_pin();
9860
9861 if (!frozen_inode) {
9862 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
9863 return;
9864 }
9865
9866 /*
9867 * set ambiguous auth for srci
9868 * NOTE: we don't worry about ambiguous cache expire as we do
f67539c2 9869 * with subtree migrations because all peers will pin
7c673cae
FG
9870 * srcdn->get_inode() for the duration of this rename.
9871 */
9872 mdr->set_ambiguous_auth(srcdnl->get_inode());
9873
9874 // just mark the source inode as ambiguous auth if more than two MDS are involved.
f67539c2
TL
9875 // the leader will send another OP_RENAMEPREP peer request later.
9876 if (mdr->peer_request->witnesses.size() > 1) {
7c673cae
FG
9877 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
9878 reply_witness = true;
9879 }
9880
9881 // make sure bystanders have received all lock related messages
9882 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
f67539c2 9883 if (*p == mdr->peer_to_mds ||
7c673cae
FG
9884 (mds->is_cluster_degraded() &&
9885 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
9886 continue;
f67539c2 9887 auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
7c673cae 9888 mds->send_message_mds(notify, *p);
f67539c2 9889 mdr->more()->waiting_on_peer.insert(*p);
7c673cae
FG
9890 }
9891
9892 // make sure clients have received all cap related messages
9893 set<client_t> export_client_set;
9894 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
9895
9896 MDSGatherBuilder gather(g_ceph_context);
9897 flush_client_sessions(export_client_set, gather);
9898 if (gather.has_subs()) {
f67539c2
TL
9899 mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
9900 gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
7c673cae
FG
9901 gather.activate();
9902 }
9903 }
9904
9905 // is witness list sufficient?
9906 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
f67539c2
TL
9907 if (*p == mdr->peer_to_mds ||
9908 mdr->peer_request->witnesses.count(*p)) continue;
7c673cae
FG
9909 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
9910 reply_witness = true;
9911 break;
9912 }
9913
9914 if (reply_witness) {
11fdf7f2 9915 ceph_assert(!srcdnrep.empty());
f67539c2 9916 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
7c673cae 9917 reply->witnesses.swap(srcdnrep);
f67539c2
TL
9918 mds->send_message_mds(reply, mdr->peer_to_mds);
9919 mdr->reset_peer_request();
7c673cae
FG
9920 return;
9921 }
9922 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
f67539c2 9923 if (!mdr->more()->waiting_on_peer.empty()) {
7c673cae 9924 dout(10) << " still waiting for rename notify acks from "
f67539c2 9925 << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
9926 return;
9927 }
9928 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
9929 // set ambiguous auth for srci on witnesses
9930 mdr->set_ambiguous_auth(srcdnl->get_inode());
9931 }
9932
9933 // encode everything we'd need to roll this back... basically, just the original state.
9934 rename_rollback rollback;
9935
9936 rollback.reqid = mdr->reqid;
9937
9938 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
9939 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9940 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2 9941 rollback.orig_src.dname = srcdn->get_name();
7c673cae
FG
9942 if (srcdnl->is_primary())
9943 rollback.orig_src.ino = srcdnl->get_inode()->ino();
9944 else {
11fdf7f2 9945 ceph_assert(srcdnl->is_remote());
7c673cae
FG
9946 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
9947 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
9948 }
9949
9950 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
9951 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9952 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2 9953 rollback.orig_dest.dname = destdn->get_name();
7c673cae
FG
9954 if (destdnl->is_primary())
9955 rollback.orig_dest.ino = destdnl->get_inode()->ino();
9956 else if (destdnl->is_remote()) {
9957 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
9958 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
9959 }
9960
9961 if (straydn) {
9962 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
9963 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
9964 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2
TL
9965 rollback.stray.dname = straydn->get_name();
9966 }
f67539c2 9967 if (mdr->peer_request->desti_snapbl.length()) {
11fdf7f2
TL
9968 CInode *oldin = destdnl->get_inode();
9969 if (oldin->snaprealm) {
9970 encode(true, rollback.desti_snapbl);
9971 oldin->encode_snap_blob(rollback.desti_snapbl);
9972 } else {
9973 encode(false, rollback.desti_snapbl);
9974 }
9975 }
f67539c2 9976 if (mdr->peer_request->srci_snapbl.length()) {
11fdf7f2
TL
9977 if (srci->snaprealm) {
9978 encode(true, rollback.srci_snapbl);
9979 srci->encode_snap_blob(rollback.srci_snapbl);
9980 } else {
9981 encode(false, rollback.srci_snapbl);
9982 }
7c673cae 9983 }
11fdf7f2
TL
9984 encode(rollback, mdr->more()->rollback_bl);
9985 // FIXME: rollback snaprealm
7c673cae
FG
9986 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
9987
9988 // journal.
9989 mdr->ls = mdlog->get_current_segment();
f67539c2
TL
9990 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
9991 EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
7c673cae
FG
9992 mdlog->start_entry(le);
9993 le->rollback = mdr->more()->rollback_bl;
9994
f67539c2
TL
9995 bufferlist blah; // inode import data... obviously not used if we're the peer
9996 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);
7c673cae
FG
9997
9998 if (le->commit.empty()) {
9999 dout(10) << " empty metablob, skipping journal" << dendl;
10000 mdlog->cancel_entry(le);
10001 mdr->ls = NULL;
f67539c2 10002 _logged_peer_rename(mdr, srcdn, destdn, straydn);
7c673cae 10003 } else {
f67539c2
TL
10004 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
10005 mdr->more()->peer_update_journaled = true;
10006 submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
7c673cae
FG
10007 mdr, __func__);
10008 mdlog->flush();
10009 }
10010}
10011
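// Runs once the peer's prepare is journaled (or was skipped): build the
// OP_RENAMEPREPACK reply, bundling an inode export when we were auth for
// a primary srcdn, apply the rename locally, then ack the leader.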
f67539c2 10012void Server::_logged_peer_rename(MDRequestRef& mdr,
7c673cae
FG
10013 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
10014{
f67539c2 10015 dout(10) << "_logged_peer_rename " << *mdr << dendl;
7c673cae
FG
10016
10017 // prepare ack
f67539c2 10018 ref_t<MMDSPeerRequest> reply;
7c673cae 10019 if (!mdr->aborted) {
f67539c2
TL
10020 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
10021 if (!mdr->more()->peer_update_journaled)
7c673cae
FG
10022 reply->mark_not_journaled();
10023 }
10024
10025 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7c673cae
FG
10026 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
10027
10028 // export srci?
10029 if (srcdn->is_auth() && srcdnl->is_primary()) {
10030 // set export bounds for CInode::encode_export()
11fdf7f2 10031 if (reply) {
9f95a23c 10032 std::vector<CDir*> bounds;
11fdf7f2
TL
10033 if (srcdnl->get_inode()->is_dir()) {
10034 srcdnl->get_inode()->get_dirfrags(bounds);
9f95a23c
TL
10035 for (const auto& bound : bounds) {
10036 bound->state_set(CDir::STATE_EXPORTBOUND);
10037 }
11fdf7f2 10038 }
7c673cae 10039
11fdf7f2
TL
10040 map<client_t,entity_inst_t> exported_client_map;
10041 map<client_t, client_metadata_t> exported_client_metadata_map;
10042 bufferlist inodebl;
10043 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
10044 exported_client_map,
10045 exported_client_metadata_map);
7c673cae 10046
9f95a23c
TL
10047 for (const auto& bound : bounds) {
10048 bound->state_clear(CDir::STATE_EXPORTBOUND);
10049 }
7c673cae 10050
11fdf7f2
TL
10051 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
10052 encode(exported_client_metadata_map, reply->inode_export);
7c673cae 10053 reply->inode_export.claim_append(inodebl);
f67539c2 10054 reply->inode_export_v = srcdnl->get_inode()->get_version();
7c673cae
FG
10055 }
10056
10057 // remove mdr auth pin
10058 mdr->auth_unpin(srcdnl->get_inode());
10059 mdr->more()->is_inode_exporter = true;
10060
10061 if (srcdnl->get_inode()->is_dirty())
10062 srcdnl->get_inode()->mark_clean();
10063
10064 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
10065 }
10066
10067 // apply
10068 _rename_apply(mdr, srcdn, destdn, straydn);
11fdf7f2
TL
10069
10070 CDentry::linkage_t *destdnl = destdn->get_linkage();
7c673cae
FG
10071
10072 // bump popularity
11fdf7f2 10073 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
7c673cae 10074 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
11fdf7f2 10075 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
7c673cae
FG
10076
10077 // done.
f67539c2 10078 mdr->reset_peer_request();
7c673cae
FG
10079 mdr->straydn = 0;
10080
10081 if (reply) {
f67539c2 10082 mds->send_message_mds(reply, mdr->peer_to_mds);
7c673cae 10083 } else {
11fdf7f2 10084 ceph_assert(mdr->aborted);
7c673cae
FG
10085 dout(10) << " abort flag set, finishing" << dendl;
10086 mdcache->request_finish(mdr);
10087 }
10088}
10089
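// Called when the leader commits (r == 0) or aborts the prepared rename.
// On commit: finish the inode export, drop ambiguous auth, and journal an
// OP_COMMIT. On abort: roll back using the rollback blob encoded during
// prepare.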
f67539c2 10090void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
7c673cae
FG
10091 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
10092{
f67539c2 10093 dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;
7c673cae 10094
f64942e4
AA
10095 CInode *in = destdn->get_linkage()->get_inode();
10096
10097 inodeno_t migrated_stray;
10098 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
10099 migrated_stray = in->ino();
7c673cae 10100
11fdf7f2 10101 MDSContext::vec finished;
7c673cae
FG
10102 if (r == 0) {
10103 // unfreeze+singleauth inode
10104 // hmm, do i really need to delay this?
10105 if (mdr->more()->is_inode_exporter) {
7c673cae
FG
10106 // drop our pins
10107 // we exported, clear out any xlocks that we moved to another MDS
7c673cae 10108
11fdf7f2
TL
10109 for (auto i = mdr->locks.lower_bound(&in->versionlock);
10110 i != mdr->locks.end(); ) {
10111 SimpleLock *lock = i->lock;
10112 if (lock->get_parent() != in)
10113 break;
7c673cae 10114 // we only care about xlocks on the exported inode
11fdf7f2
TL
10115 if (i->is_xlock() && !lock->is_locallock())
10116 mds->locker->xlock_export(i++, mdr.get());
10117 else
10118 ++i;
7c673cae
FG
10119 }
10120
10121 map<client_t,Capability::Import> peer_imported;
11fdf7f2
TL
10122 auto bp = mdr->more()->inode_import.cbegin();
10123 decode(peer_imported, bp);
7c673cae 10124
f64942e4 10125 dout(10) << " finishing inode export on " << *in << dendl;
f67539c2 10126 mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
7c673cae
FG
10127 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
10128
10129 // unfreeze
11fdf7f2 10130 ceph_assert(in->is_frozen_inode());
f64942e4 10131 in->unfreeze_inode(finished);
7c673cae
FG
10132 }
10133
10134 // singleauth
10135 if (mdr->more()->is_ambiguous_auth) {
10136 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10137 mdr->more()->is_ambiguous_auth = false;
10138 }
10139
f67539c2 10140 if (straydn && mdr->more()->peer_update_journaled) {
31f18b77
FG
10141 CInode *strayin = straydn->get_projected_linkage()->get_inode();
10142 if (strayin && !strayin->snaprealm)
10143 mdcache->clear_dirty_bits_for_stray(strayin);
10144 }
7c673cae
FG
10145
10146 mds->queue_waiters(finished);
10147 mdr->cleanup();
10148
f67539c2 10149 if (mdr->more()->peer_update_journaled) {
7c673cae 10150 // write a commit to the journal
f67539c2
TL
10151 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
10152 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
10153 EPeerUpdate::RENAME);
7c673cae 10154 mdlog->start_entry(le);
f67539c2 10155 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7c673cae
FG
10156 mdlog->flush();
10157 } else {
f67539c2 10158 _committed_peer(mdr);
7c673cae
FG
10159 }
10160 } else {
10161
10162 // abort
10163 // rollback_bl may be empty if we froze the inode but had to provide an expanded
f67539c2 10164 // witness list to the leader, and it failed before we tried prep again.
7c673cae
FG
10165 if (mdr->more()->rollback_bl.length()) {
10166 if (mdr->more()->is_inode_exporter) {
f64942e4
AA
10167 dout(10) << " reversing inode export of " << *in << dendl;
10168 in->abort_export();
7c673cae 10169 }
f67539c2
TL
10170 if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
10171 mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
10172 // rollback but preserve the peer request
10173 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
7c673cae
FG
10174 mdr->more()->rollback_bl.clear();
10175 } else
f67539c2 10176 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
7c673cae 10177 } else {
f67539c2 10178 dout(10) << " rollback_bl empty, not rolling back rename (leader failed after getting extra witnesses?)" << dendl;
7c673cae
FG
10179 // singleauth
10180 if (mdr->more()->is_ambiguous_auth) {
10181 if (srcdn->is_auth())
10182 mdr->more()->rename_inode->unfreeze_inode(finished);
10183
10184 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10185 mdr->more()->is_ambiguous_auth = false;
10186 }
10187 mds->queue_waiters(finished);
10188 mdcache->request_finish(mdr);
10189 }
10190 }
f64942e4
AA
10191
10192 if (migrated_stray && mds->is_stopping())
10193 mdcache->shutdown_export_stray_finish(migrated_stray);
7c673cae
FG
10194}
10195
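// Undo the stat effects of one unlinked entry on a dirfrag: re-add it to
// fragstat (nsubdirs/nfiles), re-add its accounted rstat, and restore the
// saved mtime/rctime if nothing newer has touched the dirfrag since.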
f67539c2
TL
10196static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
10197 rename_rollback::drec &r, utime_t ctime,
10198 bool isdir, const nest_info_t &rstat)
7c673cae 10199{
f67539c2 10200 auto pf = dir->project_fnode(mut);
7c673cae
FG
10201 pf->version = dir->pre_dirty();
10202
10203 if (isdir) {
f67539c2 10204 pf->fragstat.nsubdirs += 1;
7c673cae 10205 } else {
f67539c2 10206 pf->fragstat.nfiles += 1;
7c673cae
FG
10207 }
10208 if (r.ino) {
f67539c2
TL
10209 pf->rstat.rbytes += rstat.rbytes;
10210 pf->rstat.rfiles += rstat.rfiles;
10211 pf->rstat.rsubdirs += rstat.rsubdirs;
10212 pf->rstat.rsnaps += rstat.rsnaps;
7c673cae
FG
10213 }
10214 if (pf->fragstat.mtime == ctime) {
10215 pf->fragstat.mtime = r.dirfrag_old_mtime;
10216 if (pf->rstat.rctime == ctime)
10217 pf->rstat.rctime = r.dirfrag_old_rctime;
10218 }
10219 mut->add_updated_lock(&dir->get_inode()->filelock);
10220 mut->add_updated_lock(&dir->get_inode()->nestlock);
10221}
10222
10223struct C_MDS_LoggedRenameRollback : public ServerLogContext {
10224 MutationRef mut;
10225 CDentry *srcdn;
10226 version_t srcdnpv;
10227 CDentry *destdn;
10228 CDentry *straydn;
9f95a23c 10229 map<client_t,ref_t<MClientSnap>> splits[2];
7c673cae
FG
10230 bool finish_mdr;
10231 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
11fdf7f2 10232 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
9f95a23c 10233 map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
7c673cae 10234 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
11fdf7f2
TL
10235 straydn(st), finish_mdr(f) {
10236 splits[0].swap(_splits[0]);
10237 splits[1].swap(_splits[1]);
10238 }
7c673cae
FG
10239 void finish(int r) override {
10240 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
11fdf7f2 10241 destdn, straydn, splits, finish_mdr);
7c673cae
FG
10242 }
10243};
10244
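// Reconstruct the pre-rename state from the rollback blob: relink srcdn,
// restore the dest/stray linkages, ctimes and snaprealms (projected or
// in place, depending on auth), repair the parent dirfrag stats, and
// journal an EPeerUpdate OP_ROLLBACK describing the repairs.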
f67539c2 10245void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
7c673cae
FG
10246 bool finish_mdr)
10247{
10248 rename_rollback rollback;
11fdf7f2
TL
10249 auto p = rbl.cbegin();
10250 decode(rollback, p);
7c673cae
FG
10251
10252 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
10253 // need to finish this update before sending resolve to claim the subtree
f67539c2 10254 mdcache->add_rollback(rollback.reqid, leader);
7c673cae
FG
10255
10256 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
10257 mut->ls = mds->mdlog->get_current_segment();
10258
10259 CDentry *srcdn = NULL;
10260 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
10261 if (!srcdir)
10262 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
10263 if (srcdir) {
10264 dout(10) << " srcdir " << *srcdir << dendl;
10265 srcdn = srcdir->lookup(rollback.orig_src.dname);
10266 if (srcdn) {
10267 dout(10) << " srcdn " << *srcdn << dendl;
11fdf7f2 10268 ceph_assert(srcdn->get_linkage()->is_null());
7c673cae
FG
10269 } else
10270 dout(10) << " srcdn not found" << dendl;
10271 } else
10272 dout(10) << " srcdir not found" << dendl;
10273
10274 CDentry *destdn = NULL;
10275 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
10276 if (!destdir)
10277 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
10278 if (destdir) {
10279 dout(10) << " destdir " << *destdir << dendl;
10280 destdn = destdir->lookup(rollback.orig_dest.dname);
10281 if (destdn)
10282 dout(10) << " destdn " << *destdn << dendl;
10283 else
10284 dout(10) << " destdn not found" << dendl;
10285 } else
10286 dout(10) << " destdir not found" << dendl;
10287
10288 CInode *in = NULL;
10289 if (rollback.orig_src.ino) {
10290 in = mdcache->get_inode(rollback.orig_src.ino);
10291 if (in && in->is_dir())
11fdf7f2 10292 ceph_assert(srcdn && destdn);
7c673cae
FG
10293 } else
10294 in = mdcache->get_inode(rollback.orig_src.remote_ino);
10295
10296 CDir *straydir = NULL;
10297 CDentry *straydn = NULL;
10298 if (rollback.stray.dirfrag.ino) {
10299 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
10300 if (straydir) {
10301 dout(10) << "straydir " << *straydir << dendl;
10302 straydn = straydir->lookup(rollback.stray.dname);
10303 if (straydn) {
10304 dout(10) << " straydn " << *straydn << dendl;
11fdf7f2 10305 ceph_assert(straydn->get_linkage()->is_primary());
7c673cae
FG
10306 } else
10307 dout(10) << " straydn not found" << dendl;
10308 } else
10309 dout(10) << "straydir not found" << dendl;
10310 }
10311
10312 CInode *target = NULL;
10313 if (rollback.orig_dest.ino) {
10314 target = mdcache->get_inode(rollback.orig_dest.ino);
10315 if (target)
11fdf7f2 10316 ceph_assert(destdn && straydn);
7c673cae
FG
10317 } else if (rollback.orig_dest.remote_ino)
10318 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
10319
10320 // can't use is_auth() in the resolve stage
10321 mds_rank_t whoami = mds->get_nodeid();
f67539c2 10322 // peer
11fdf7f2
TL
10323 ceph_assert(!destdn || destdn->authority().first != whoami);
10324 ceph_assert(!straydn || straydn->authority().first != whoami);
7c673cae
FG
10325
10326 bool force_journal_src = false;
10327 bool force_journal_dest = false;
10328 if (in && in->is_dir() && srcdn->authority().first != whoami)
10329 force_journal_src = _need_force_journal(in, false);
10330 if (in && target && target->is_dir())
10331 force_journal_dest = _need_force_journal(in, true);
10332
10333 version_t srcdnpv = 0;
10334 // repair src
10335 if (srcdn) {
10336 if (srcdn->authority().first == whoami)
10337 srcdnpv = srcdn->pre_dirty();
10338 if (rollback.orig_src.ino) {
11fdf7f2 10339 ceph_assert(in);
7c673cae
FG
10340 srcdn->push_projected_linkage(in);
10341 } else
10342 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
10343 rollback.orig_src.remote_d_type);
10344 }
10345
9f95a23c 10346 map<client_t,ref_t<MClientSnap>> splits[2];
11fdf7f2 10347
f67539c2 10348 const CInode::mempool_inode *pip = nullptr;
7c673cae 10349 if (in) {
11fdf7f2 10350 bool projected;
f67539c2
TL
10351 CDir *pdir = in->get_projected_parent_dir();
10352 if (pdir->authority().first == whoami) {
10353 auto pi = in->project_inode(mut);
10354 pi.inode->version = in->pre_dirty();
10355 if (pdir != srcdir) {
10356 auto pf = pdir->project_fnode(mut);
10357 pf->version = pdir->pre_dirty();
10358 }
10359 if (pi.inode->ctime == rollback.ctime)
10360 pi.inode->ctime = rollback.orig_src.old_ctime;
11fdf7f2
TL
10361 projected = true;
10362 } else {
f67539c2
TL
10363 if (in->get_inode()->ctime == rollback.ctime) {
10364 auto _inode = CInode::allocate_inode(*in->get_inode());
10365 _inode->ctime = rollback.orig_src.old_ctime;
10366 in->reset_inode(_inode);
10367 }
11fdf7f2
TL
10368 projected = false;
10369 }
f67539c2 10370 pip = in->get_projected_inode().get();
11fdf7f2
TL
10371
10372 if (rollback.srci_snapbl.length() && in->snaprealm) {
10373 bool hadrealm;
10374 auto p = rollback.srci_snapbl.cbegin();
10375 decode(hadrealm, p);
10376 if (hadrealm) {
10377 if (projected && !mds->is_resolve()) {
10378 sr_t *new_srnode = new sr_t();
10379 decode(*new_srnode, p);
10380 in->project_snaprealm(new_srnode);
10381 } else
10382 decode(in->snaprealm->srnode, p);
10383 } else {
10384 SnapRealm *realm;
10385 if (rollback.orig_src.ino) {
10386 ceph_assert(srcdir);
10387 realm = srcdir->get_inode()->find_snaprealm();
10388 } else {
10389 realm = in->snaprealm->parent;
10390 }
10391 if (!mds->is_resolve())
10392 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
10393 if (projected)
10394 in->project_snaprealm(NULL);
10395 else
10396 in->snaprealm->merge_to(realm);
10397 }
10398 }
7c673cae
FG
10399 }
10400
7c673cae
FG
10401 // repair dest
10402 if (destdn) {
10403 if (rollback.orig_dest.ino && target) {
10404 destdn->push_projected_linkage(target);
10405 } else if (rollback.orig_dest.remote_ino) {
10406 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
10407 rollback.orig_dest.remote_d_type);
10408 } else {
10409 // the dentry will be trimmed soon; it's OK to have a wrong linkage
10410 if (rollback.orig_dest.ino)
11fdf7f2 10411 ceph_assert(mds->is_resolve());
10412 destdn->push_projected_linkage();
10413 }
10414 }
10415
10416 if (straydn)
10417 straydn->push_projected_linkage();
10418
10419 if (target) {
11fdf7f2 10420 bool projected;
10421 CInode::inode_ptr ti;
10422 CDir *pdir = target->get_projected_parent_dir();
10423 if (pdir->authority().first == whoami) {
10424 auto pi = target->project_inode(mut);
10425 pi.inode->version = target->pre_dirty();
10426 if (pdir != srcdir) {
10427 auto pf = pdir->project_fnode(mut);
10428 pf->version = pdir->pre_dirty();
10429 }
10430 ti = pi.inode;
10431 projected = true;
10432 } else {
f67539c2 10433 ti = CInode::allocate_inode(*target->get_inode());
10434 projected = false;
10435 }
f67539c2 10436
7c673cae 10437 if (ti->ctime == rollback.ctime)
91327a77 10438 ti->ctime = rollback.orig_dest.old_ctime;
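 // the rename being rolled back had unlinked the target (or moved it into a
 // stray dir), so its link count is restored below, unless the original
 // operation involved stray dirfrags, where the asserts pin down the
 // expected remote linkage instead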
10439 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
10440 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
11fdf7f2 10441 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
7c673cae 10442 else
11fdf7f2 10443 ceph_assert(rollback.orig_dest.remote_ino &&
10444 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
10445 } else
10446 ti->nlink++;
11fdf7f2 10447
10448 if (!projected)
10449 target->reset_inode(ti);
10450
10451 if (rollback.desti_snapbl.length() && target->snaprealm) {
10452 bool hadrealm;
10453 auto p = rollback.desti_snapbl.cbegin();
10454 decode(hadrealm, p);
10455 if (hadrealm) {
10456 if (projected && !mds->is_resolve()) {
10457 sr_t *new_srnode = new sr_t();
10458 decode(*new_srnode, p);
10459 target->project_snaprealm(new_srnode);
10460 } else
10461 decode(target->snaprealm->srnode, p);
10462 } else {
10463 SnapRealm *realm;
10464 if (rollback.orig_dest.ino) {
10465 ceph_assert(destdir);
10466 realm = destdir->get_inode()->find_snaprealm();
10467 } else {
10468 realm = target->snaprealm->parent;
10469 }
10470 if (!mds->is_resolve())
10471 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
10472 if (projected)
10473 target->project_snaprealm(NULL);
10474 else
10475 target->snaprealm->merge_to(realm);
10476 }
10477 }
10478 }
10479
10480 if (srcdn && srcdn->authority().first == whoami) {
10481 nest_info_t blah;
10482 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
10483 in && in->is_dir(), pip ? pip->accounted_rstat : blah);
10484 }
10485
10486 if (srcdn)
10487 dout(0) << " srcdn back to " << *srcdn << dendl;
10488 if (in)
10489 dout(0) << " srci back to " << *in << dendl;
10490 if (destdn)
10491 dout(0) << " destdn back to " << *destdn << dendl;
10492 if (target)
10493 dout(0) << " desti back to " << *target << dendl;
10494
10495 // journal it
10496 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
10497 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
10498 mdlog->start_entry(le);
10499
10500 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
10501 le->commit.add_dir_context(srcdir);
10502 if (rollback.orig_src.ino)
10503 le->commit.add_primary_dentry(srcdn, 0, true);
10504 else
10505 le->commit.add_remote_dentry(srcdn, true);
10506 }
10507
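 // if the source dentry was only a remote link, the inode's primary dentry
 // lives in some other dirfrag; when we are auth for the inode, journal that
 // primary dentry too so the restored linkage survives replay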
10508 if (!rollback.orig_src.ino && // remote linkage
10509 in && in->authority().first == whoami) {
10510 le->commit.add_dir_context(in->get_projected_parent_dir());
10511 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
10512 }
10513
10514 if (force_journal_dest) {
11fdf7f2 10515 ceph_assert(rollback.orig_dest.ino);
10516 le->commit.add_dir_context(destdir);
10517 le->commit.add_primary_dentry(destdn, 0, true);
10518 }
10519
f67539c2 10520 // peer: no need to journal straydn
10521
10522 if (target && target != in && target->authority().first == whoami) {
11fdf7f2 10523 ceph_assert(rollback.orig_dest.remote_ino);
10524 le->commit.add_dir_context(target->get_projected_parent_dir());
10525 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
10526 }
10527
10528 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
10529 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
10530 le->commit.renamed_dirino = in->ino();
10531 if (srcdn->authority().first == whoami) {
10532 auto&& ls = in->get_dirfrags();
10533 for (const auto& dir : ls) {
10534 if (!dir->is_auth())
10535 le->commit.renamed_dir_frags.push_back(dir->get_frag());
10536 }
10537 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
10538 }
10539 } else if (force_journal_dest) {
10540 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
10541 le->commit.renamed_dirino = target->ino();
10542 }
10543
10544 if (target && target->is_dir()) {
11fdf7f2 10545 ceph_assert(destdn);
10546 mdcache->project_subtree_rename(target, straydir, destdir);
10547 }
10548
10549 if (in && in->is_dir()) {
11fdf7f2 10550 ceph_assert(srcdn);
10551 mdcache->project_subtree_rename(in, destdir, srcdir);
10552 }
10553
f67539c2 10554 if (mdr && !mdr->more()->peer_update_journaled) {
11fdf7f2 10555 ceph_assert(le->commit.empty());
10556 mdlog->cancel_entry(le);
10557 mut->ls = NULL;
11fdf7f2 10558 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
7c673cae 10559 } else {
11fdf7f2 10560 ceph_assert(!le->commit.empty());
7c673cae 10561 if (mdr)
f67539c2 10562 mdr->more()->peer_update_journaled = false;
10563 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
10564 srcdn, srcdnpv, destdn, straydn,
10565 splits, finish_mdr);
10566 submit_mdlog_entry(le, fin, mdr, __func__);
10567 mdlog->flush();
10568 }
10569}
10570
10571 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
11fdf7f2 10572 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
9f95a23c 10573 map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
10574 {
10575 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
10576
10577 if (straydn) {
10578 straydn->get_dir()->unlink_inode(straydn);
10579 straydn->pop_projected_linkage();
10580 }
10581 if (destdn) {
10582 destdn->get_dir()->unlink_inode(destdn);
10583 destdn->pop_projected_linkage();
10584 }
10585 if (srcdn) {
10586 srcdn->pop_projected_linkage();
11fdf7f2 10587 if (srcdn->authority().first == mds->get_nodeid()) {
7c673cae 10588 srcdn->mark_dirty(srcdnpv, mut->ls);
10589 if (srcdn->get_linkage()->is_primary())
10590 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
10591 }
10592 }
10593
10594 mut->apply();
10595
10596 if (srcdn && srcdn->get_linkage()->is_primary()) {
10597 CInode *in = srcdn->get_linkage()->get_inode();
7c673cae 10598 if (in && in->is_dir()) {
11fdf7f2 10599 ceph_assert(destdn);
10600 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
10601 }
10602 }
10603
10604 if (destdn) {
10605 CInode *oldin = destdn->get_linkage()->get_inode();
10606 // update subtree map?
10607 if (oldin && oldin->is_dir()) {
11fdf7f2 10608 ceph_assert(straydn);
10609 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
10610 }
10611 }
10612
10613 if (mds->is_resolve()) {
10614 CDir *root = NULL;
10615 if (straydn)
10616 root = mdcache->get_subtree_root(straydn->get_dir());
10617 else if (destdn)
10618 root = mdcache->get_subtree_root(destdn->get_dir());
10619 if (root)
10620 mdcache->try_trim_non_auth_subtree(root);
10621 } else {
10622 mdcache->send_snaps(splits[1]);
10623 mdcache->send_snaps(splits[0]);
10624 }
10625
10626 if (mdr) {
11fdf7f2 10627 MDSContext::vec finished;
10628 if (mdr->more()->is_ambiguous_auth) {
10629 if (srcdn->is_auth())
10630 mdr->more()->rename_inode->unfreeze_inode(finished);
10631
10632 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
10633 mdr->more()->is_ambiguous_auth = false;
10634 }
10635 mds->queue_waiters(finished);
10636 if (finish_mdr || mdr->aborted)
10637 mdcache->request_finish(mdr);
10638 else
f67539c2 10639 mdr->more()->peer_rolling_back = false;
10640 }
10641
e306af50 10642 mdcache->finish_rollback(mut->reqid, mdr);
10643
10644 mut->cleanup();
10645}
10646
f67539c2 10647 void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7c673cae 10648 {
f67539c2 10649 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10650 << " witnessed by " << ack->get_source()
10651 << " " << *ack << dendl;
10652 mds_rank_t from = mds_rank_t(ack->get_source().num());
10653
10654 // note peer
10655 mdr->more()->peers.insert(from);
10656 if (mdr->more()->srcdn_auth_mds == from &&
10657 mdr->more()->is_remote_frozen_authpin &&
10658 !mdr->more()->is_ambiguous_auth) {
10659 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
10660 }
10661
10662 // witnessed? or add extra witnesses?
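 // an empty witness list in the ack means the peer itself witnessed (and
 // possibly journaled) the prep; a non-empty list names additional srcdn
 // replicas the leader still has to ask before proceeding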
11fdf7f2 10663 ceph_assert(mdr->more()->witnessed.count(from) == 0);
31f18b77 10664 if (ack->is_interrupted()) {
f67539c2 10665 dout(10) << " peer request interrupted, noop" << dendl;
31f18b77 10666 } else if (ack->witnesses.empty()) {
10667 mdr->more()->witnessed.insert(from);
10668 if (!ack->is_not_journaled())
f67539c2 10669 mdr->more()->has_journaled_peers = true;
10670 } else {
10671 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
11fdf7f2 10672 mdr->more()->extra_witnesses = ack->witnesses;
10673 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
10674 }
10675
10676 // srci import?
10677 if (ack->inode_export.length()) {
10678 dout(10) << " got srci import" << dendl;
11fdf7f2 10679 mdr->more()->inode_import.share(ack->inode_export);
10680 mdr->more()->inode_import_v = ack->inode_export_v;
10681 }
10682
10683 // remove from waiting list
10684 ceph_assert(mdr->more()->waiting_on_peer.count(from));
10685 mdr->more()->waiting_on_peer.erase(from);
7c673cae 10686
f67539c2 10687 if (mdr->more()->waiting_on_peer.empty())
10688 dispatch_client_request(mdr); // go again!
10689 else
f67539c2 10690 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
10691}
10692
f67539c2 10693 void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7c673cae 10694 {
f67539c2 10695 dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
7c673cae 10696 << ack->get_source() << dendl;
f67539c2 10697 ceph_assert(mdr->is_peer());
10698 mds_rank_t from = mds_rank_t(ack->get_source().num());
10699
10700 if (mdr->more()->waiting_on_peer.count(from)) {
10701 mdr->more()->waiting_on_peer.erase(from);
7c673cae 10702
10703 if (mdr->more()->waiting_on_peer.empty()) {
10704 if (mdr->peer_request)
10705 dispatch_peer_request(mdr);
10706 } else
10707 dout(10) << " still waiting for rename notify acks from "
f67539c2 10708 << mdr->more()->waiting_on_peer << dendl;
10709 }
10710}
10711
f67539c2 10712 void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
7c673cae 10713 {
f67539c2 10714 dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
7c673cae 10715
10716 if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
10717 mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);
7c673cae 10718
10719 if (mdr->more()->waiting_on_peer.empty()) {
10720 if (mdr->peer_request)
10721 dispatch_peer_request(mdr);
10722 } else
10723 dout(10) << " still waiting for rename notify acks from "
f67539c2 10724 << mdr->more()->waiting_on_peer << dendl;
10725 }
10726}
10727
10728 // snaps
10729 /* This function takes responsibility for the passed mdr*/
10730 void Server::handle_client_lssnap(MDRequestRef& mdr)
10731 {
9f95a23c 10732 const cref_t<MClientRequest> &req = mdr->client_request;
10733
10734 // traverse to path
10735 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10736 if (!diri)
7c673cae 10737 return;
9f95a23c 10738
7c673cae 10739 if (!diri->is_dir()) {
f67539c2 10740 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10741 return;
10742 }
10743 dout(10) << "lssnap on " << *diri << dendl;
10744
10745 // lock snap
9f95a23c 10746 if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
10747 return;
10748
10749 if (!check_access(mdr, diri, MAY_READ))
10750 return;
10751
10752 SnapRealm *realm = diri->find_snaprealm();
11fdf7f2 10753 map<snapid_t,const SnapInfo*> infomap;
10754 realm->get_snap_info(infomap, diri->get_oldest_snap());
10755
10756 unsigned max_entries = req->head.args.readdir.max_entries;
10757 if (!max_entries)
10758 max_entries = infomap.size();
10759 int max_bytes = req->head.args.readdir.max_bytes;
10760 if (!max_bytes)
10761 // make sure at least one item can be encoded
11fdf7f2 10762 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
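 // floor: half a MiB plus the configured xattr budget, so even a single
 // entry carrying a maximal xattr map can be returned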
10763
10764 __u64 last_snapid = 0;
10765 string offset_str = req->get_path2();
10766 if (!offset_str.empty())
10767 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
10768
11fdf7f2 10769 //Empty DirStat
7c673cae 10770 bufferlist dirbl;
10771 static DirStat empty;
10772 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
10773
10774 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
10775
10776 __u32 num = 0;
10777 bufferlist dnbl;
11fdf7f2 10778 auto p = infomap.upper_bound(last_snapid);
10779 for (; p != infomap.end() && num < max_entries; ++p) {
10780 dout(10) << p->first << " -> " << *p->second << dendl;
10781
10782 // actual
10783 string snap_name;
10784 if (p->second->ino == diri->ino())
11fdf7f2 10785 snap_name = p->second->name;
7c673cae 10786 else
11fdf7f2 10787 snap_name = p->second->get_long_name();
10788
10789 unsigned start_len = dnbl.length();
10790 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
10791 break;
10792
10793 encode(snap_name, dnbl);
10794 //infinite lease
9f95a23c 10795 LeaseStat e(CEPH_LEASE_VALID, -1, 0);
10796 mds->locker->encode_lease(dnbl, mdr->session->info, e);
10797 dout(20) << "encode_infinite_lease" << dendl;
10798
10799 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
10800 if (r < 0) {
10801 bufferlist keep;
10802 keep.substr_of(dnbl, 0, start_len);
10803 dnbl.swap(keep);
10804 break;
10805 }
10806 ++num;
10807 }
10808
11fdf7f2 10809 encode(num, dirbl);
10810 __u16 flags = 0;
10811 if (p == infomap.end()) {
10812 flags = CEPH_READDIR_FRAG_END;
10813 if (last_snapid == 0)
10814 flags |= CEPH_READDIR_FRAG_COMPLETE;
10815 }
11fdf7f2 10816 encode(flags, dirbl);
10817 dirbl.claim_append(dnbl);
10818
10819 mdr->reply_extra_bl = dirbl;
10820 mdr->tracei = diri;
10821 respond_to_request(mdr, 0);
10822}
10823
10824
10825 // MKSNAP
10826
10827 struct C_MDS_mksnap_finish : public ServerLogContext {
10828 CInode *diri;
10829 SnapInfo info;
10830 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
10831 ServerLogContext(s, r), diri(di), info(i) {}
10832 void finish(int r) override {
10833 server->_mksnap_finish(mdr, diri, info);
10834 }
10835};
10836
10837 /* This function takes responsibility for the passed mdr*/
10838 void Server::handle_client_mksnap(MDRequestRef& mdr)
10839 {
9f95a23c 10840 const cref_t<MClientRequest> &req = mdr->client_request;
10841 // make sure we have as new a map as the client
10842 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
10843 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
10844 return;
10845 }
10846 if (!mds->mdsmap->allows_snaps()) {
10847 // snapshot creation stays disabled until the fs flag allowing new snaps is set
522d829b 10848 dout(5) << "new snapshots are disabled for this fs" << dendl;
f67539c2 10849 respond_to_request(mdr, -CEPHFS_EPERM);
10850 return;
10851 }
10852
10853 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10854 if (!diri)
7c673cae 10855 return;
10856
10857 // dir only
10858 if (!diri->is_dir()) {
f67539c2 10859 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10860 return;
10861 }
10862 if (diri->is_system() && !diri->is_root()) {
10863 // no snaps in system dirs (root is ok)
522d829b 10864 dout(5) << "is an internal system dir" << dendl;
f67539c2 10865 respond_to_request(mdr, -CEPHFS_EPERM);
10866 return;
10867 }
10868
11fdf7f2 10869 std::string_view snapname = req->get_filepath().last_dentry();
7c673cae 10870
11fdf7f2 10871 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
7c673cae 10872 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
f67539c2 10873 respond_to_request(mdr, -CEPHFS_EPERM);
10874 return;
10875 }
10876
10877 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
10878
10879 // lock snap
10880 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10881 MutationImpl::LockOpVec lov;
10882 lov.add_xlock(&diri->snaplock);
10883 if (!mds->locker->acquire_locks(mdr, lov))
10884 return;
7c673cae 10885
10886 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10887 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10888 return;
10889 }
10890 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10891 }
7c673cae 10892
9f95a23c 10893 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10894 return;
10895
10896 if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
10897 (subvol_ino && subvol_ino != diri->ino())) {
522d829b 10898 dout(5) << "is a descendant of a subvolume dir" << dendl;
f67539c2 10899 respond_to_request(mdr, -CEPHFS_EPERM);
10900 return;
10901 }
10902
10903 // check if we can create any more snapshots
10904 // we don't allow any more if we are already at or beyond the limit
10905 if (diri->snaprealm &&
10906 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
f67539c2 10907 respond_to_request(mdr, -CEPHFS_EMLINK);
7c673cae 10908 return;
9f95a23c 10909 }
10910
10911 // make sure name is unique
10912 if (diri->snaprealm &&
10913 diri->snaprealm->exists(snapname)) {
f67539c2 10914 respond_to_request(mdr, -CEPHFS_EEXIST);
10915 return;
10916 }
10917 if (snapname.length() == 0 ||
1e59de90 10918 snapname.length() > snapshot_name_max ||
7c673cae 10919 snapname[0] == '_') {
f67539c2 10920 respond_to_request(mdr, -CEPHFS_EINVAL);
10921 return;
10922 }
10923
10924 // allocate a snapid
10925 if (!mdr->more()->stid) {
10926 // prepare an stid
10927 mds->snapclient->prepare_create(diri->ino(), snapname,
10928 mdr->get_mds_stamp(),
10929 &mdr->more()->stid, &mdr->more()->snapidbl,
10930 new C_MDS_RetryRequest(mdcache, mdr));
10931 return;
10932 }
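 // second pass: prepare_create() has stashed the snaptable transaction id
 // (stid) and the newly allocated snapid in the mdr; decode them, journal
 // the inode/snaprealm changes, and commit the table transaction from
 // _mksnap_finish()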
10933
10934 version_t stid = mdr->more()->stid;
10935 snapid_t snapid;
10936 auto p = mdr->more()->snapidbl.cbegin();
10937 decode(snapid, p);
10938 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
10939
10940 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10941
10942 SnapPayload payload;
10943 if (req->get_data().length()) {
10944 try {
10945 auto iter = req->get_data().cbegin();
10946 decode(payload, iter);
10947 } catch (const ceph::buffer::error &e) {
10948 // backward compat -- client sends xattr bufferlist. however,
10949 // that is not used anywhere -- so (log and) ignore.
10950 dout(20) << ": no metadata in payload (old client?)" << dendl;
10951 }
10952 }
10953
10954 // journal
10955 SnapInfo info;
10956 info.ino = diri->ino();
10957 info.snapid = snapid;
11fdf7f2 10958 info.name = snapname;
7c673cae 10959 info.stamp = mdr->get_op_stamp();
f67539c2 10960 info.metadata = payload.metadata;
7c673cae 10961
10962 auto pi = diri->project_inode(mdr, false, true);
10963 pi.inode->ctime = info.stamp;
10964 if (info.stamp > pi.inode->rstat.rctime)
10965 pi.inode->rstat.rctime = info.stamp;
10966 pi.inode->rstat.rsnaps++;
10967 pi.inode->version = diri->pre_dirty();
10968
10969 // project the snaprealm
10970 auto &newsnap = *pi.snapnode;
10971 newsnap.created = snapid;
10972 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
10973 if (!em.second)
10974 em.first->second = info;
10975 newsnap.seq = snapid;
10976 newsnap.last_created = snapid;
10977 newsnap.last_modified = info.stamp;
10978 newsnap.change_attr++;
10979
10980 // journal the inode changes
10981 mdr->ls = mdlog->get_current_segment();
10982 EUpdate *le = new EUpdate(mdlog, "mksnap");
10983 mdlog->start_entry(le);
10984
10985 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10986 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10987 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10988 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10989
10990 // journal the snaprealm changes
10991 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
10992 mdr, __func__);
10993 mdlog->flush();
10994}
10995
10996void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
10997{
10998 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
10999
11000 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
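 // if the directory already had a snaprealm this is just another snap in it
 // (CREATE); otherwise this request created the realm, and clients have to
 // split caps out of the parent realm (SPLIT)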
11001
11002 mdr->apply();
11003
11004 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
11005
11006 // create snap
11007 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
11008
11009 // notify other mds
11010 mdcache->send_snap_update(diri, mdr->more()->stid, op);
11011
11012 mdcache->do_realm_invalidate_and_update_notify(diri, op);
11013
11014 // yay
11015 mdr->in[0] = diri;
11016 mdr->snapid = info.snapid;
11017 mdr->tracei = diri;
11018 respond_to_request(mdr, 0);
11019}
11020
11021
11022 // RMSNAP
11023
11024 struct C_MDS_rmsnap_finish : public ServerLogContext {
11025 CInode *diri;
11026 snapid_t snapid;
11027 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
11028 ServerLogContext(s, r), diri(di), snapid(sn) {}
11029 void finish(int r) override {
11030 server->_rmsnap_finish(mdr, diri, snapid);
11031 }
11032};
11033
11034 /* This function takes responsibility for the passed mdr*/
11035 void Server::handle_client_rmsnap(MDRequestRef& mdr)
11036 {
9f95a23c 11037 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 11038
11039 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
11040 if (!diri)
7c673cae 11041 return;
9f95a23c 11042
7c673cae 11043 if (!diri->is_dir()) {
f67539c2 11044 respond_to_request(mdr, -CEPHFS_ENOTDIR);
11045 return;
11046 }
11047
11fdf7f2 11048 std::string_view snapname = req->get_filepath().last_dentry();
7c673cae 11049
11fdf7f2 11050 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
7c673cae 11051 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
f67539c2 11052 respond_to_request(mdr, -CEPHFS_EPERM);
11053 return;
11054 }
11055
11056 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
11057
11058 // does snap exist?
11059 if (snapname.length() == 0 || snapname[0] == '_') {
f67539c2 11060 respond_to_request(mdr, -CEPHFS_EINVAL); // can't prune a parent snap, currently.
11061 return;
11062 }
11063 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
f67539c2 11064 respond_to_request(mdr, -CEPHFS_ENOENT);
11065 return;
11066 }
11067 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
11068 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
11069 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
11070 MutationImpl::LockOpVec lov;
11071 lov.add_xlock(&diri->snaplock);
11072 if (!mds->locker->acquire_locks(mdr, lov))
11073 return;
11074 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
11075 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
11076 return;
11077 }
11078 mdr->locking_state |= MutationImpl::ALL_LOCKED;
11079 }
7c673cae 11080
11fdf7f2 11081 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
11082 return;
11083
11084 // prepare
11085 if (!mdr->more()->stid) {
11086 mds->snapclient->prepare_destroy(diri->ino(), snapid,
11087 &mdr->more()->stid, &mdr->more()->snapidbl,
11088 new C_MDS_RetryRequest(mdcache, mdr));
11089 return;
11090 }
11091 version_t stid = mdr->more()->stid;
11fdf7f2 11092 auto p = mdr->more()->snapidbl.cbegin();
7c673cae 11093 snapid_t seq;
11fdf7f2 11094 decode(seq, p);
11095 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
11096
11097 ceph_assert(mds->snapclient->get_cached_version() >= stid);
11098
7c673cae 11099 // journal
11100 auto pi = diri->project_inode(mdr, false, true);
11101 pi.inode->version = diri->pre_dirty();
11102 pi.inode->ctime = mdr->get_op_stamp();
11103 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
11104 pi.inode->rstat.rctime = mdr->get_op_stamp();
11105 pi.inode->rstat.rsnaps--;
11106
11107 mdr->ls = mdlog->get_current_segment();
11108 EUpdate *le = new EUpdate(mdlog, "rmsnap");
11109 mdlog->start_entry(le);
11110
11111 // project the snaprealm
11112 auto &newnode = *pi.snapnode;
11113 newnode.snaps.erase(snapid);
11114 newnode.seq = seq;
11115 newnode.last_destroyed = seq;
11116 newnode.last_modified = mdr->get_op_stamp();
11117 newnode.change_attr++;
11118
11119 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
11120 le->metablob.add_table_transaction(TABLE_SNAP, stid);
11121 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
11122 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
11123
11124 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
11125 mdr, __func__);
11126 mdlog->flush();
11127}
11128
11129 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
11130 {
11131 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
11132 snapid_t stid = mdr->more()->stid;
7c673cae 11133
11134 mdr->apply();
11135
11136 mds->snapclient->commit(stid, mdr->ls);
11137
11138 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
11139
11140 // notify other mds
11141 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
11142
11143 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
11144
11145 // yay
11146 mdr->in[0] = diri;
11147 mdr->tracei = diri;
11148 mdr->snapid = snapid;
11149 respond_to_request(mdr, 0);
11150
11151 // purge snapshot data
f67539c2 11152 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
11153}
11154
11155 struct C_MDS_renamesnap_finish : public ServerLogContext {
11156 CInode *diri;
11157 snapid_t snapid;
11158 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
11159 ServerLogContext(s, r), diri(di), snapid(sn) {}
11160 void finish(int r) override {
11161 server->_renamesnap_finish(mdr, diri, snapid);
11162 }
11163};
11164
11165 /* This function takes responsibility for the passed mdr*/
11166 void Server::handle_client_renamesnap(MDRequestRef& mdr)
11167 {
9f95a23c 11168 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 11169 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
f67539c2 11170 respond_to_request(mdr, -CEPHFS_EINVAL);
11171 return;
11172 }
11173
11174 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
11175 if (!diri)
7c673cae 11176 return;
11177
11178 if (!diri->is_dir()) { // dir only
f67539c2 11179 respond_to_request(mdr, -CEPHFS_ENOTDIR);
11180 return;
11181 }
11182
11183 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
11184 mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
f67539c2 11185 respond_to_request(mdr, -CEPHFS_EPERM);
11186 return;
11187 }
11188
11189 std::string_view dstname = req->get_filepath().last_dentry();
11190 std::string_view srcname = req->get_filepath2().last_dentry();
11191 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
11192
11193 if (srcname.length() == 0 || srcname[0] == '_') {
f67539c2 11194 respond_to_request(mdr, -CEPHFS_EINVAL); // can't rename a parent snap.
11195 return;
11196 }
11197 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
f67539c2 11198 respond_to_request(mdr, -CEPHFS_ENOENT);
11199 return;
11200 }
11201 if (dstname.length() == 0 || dstname[0] == '_') {
f67539c2 11202 respond_to_request(mdr, -CEPHFS_EINVAL);
11203 return;
11204 }
11205 if (diri->snaprealm->exists(dstname)) {
f67539c2 11206 respond_to_request(mdr, -CEPHFS_EEXIST);
11207 return;
11208 }
11209
11210 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
11211
11212 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
11213
11214 // lock snap
11215 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
11216 MutationImpl::LockOpVec lov;
11217 lov.add_xlock(&diri->snaplock);
11218 if (!mds->locker->acquire_locks(mdr, lov))
11219 return;
11220 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
11221 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
11222 return;
11223 }
11224 mdr->locking_state |= MutationImpl::ALL_LOCKED;
11225 }
7c673cae 11226
11fdf7f2 11227 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
11228 return;
11229
11230 // prepare
11231 if (!mdr->more()->stid) {
11232 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
11fdf7f2 11233 &mdr->more()->stid,
11234 new C_MDS_RetryRequest(mdcache, mdr));
11235 return;
11236 }
11237
11238 version_t stid = mdr->more()->stid;
11239 dout(10) << " stid is " << stid << dendl;
11240
11241 ceph_assert(mds->snapclient->get_cached_version() >= stid);
11242
11243 // journal
11244 auto pi = diri->project_inode(mdr, false, true);
11245 pi.inode->ctime = mdr->get_op_stamp();
11246 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
11247 pi.inode->rstat.rctime = mdr->get_op_stamp();
11248 pi.inode->version = diri->pre_dirty();
11249
11250 // project the snaprealm
11251 auto &newsnap = *pi.snapnode;
11252 auto it = newsnap.snaps.find(snapid);
11253 ceph_assert(it != newsnap.snaps.end());
11254 it->second.name = dstname;
11255 newsnap.last_modified = mdr->get_op_stamp();
11256 newsnap.change_attr++;
11257
11258 // journal the inode changes
11259 mdr->ls = mdlog->get_current_segment();
11260 EUpdate *le = new EUpdate(mdlog, "renamesnap");
11261 mdlog->start_entry(le);
11262
11263 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
11264 le->metablob.add_table_transaction(TABLE_SNAP, stid);
11265 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
11266 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
11267
11268 // journal the snaprealm changes
11269 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
11270 mdr, __func__);
11271 mdlog->flush();
11272}
11273
11274 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
11275 {
11276 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
11277
11278 mdr->apply();
11279
11280 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
11281
11282 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
11283
11284 // notify other mds
11285 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
11286
11287 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
11288
11289 // yay
11290 mdr->in[0] = diri;
11291 mdr->tracei = diri;
11292 mdr->snapid = snapid;
11293 respond_to_request(mdr, 0);
11294}
11295
11296 void Server::handle_client_readdir_snapdiff(MDRequestRef& mdr)
11297 {
11298 const cref_t<MClientRequest>& req = mdr->client_request;
11299 Session* session = mds->get_session(req);
11300 MutationImpl::LockOpVec lov;
11301 CInode* diri = rdlock_path_pin_ref(mdr, false, true);
11302 if (!diri) return;
11303
11304 // it's a directory, right?
11305 if (!diri->is_dir()) {
11306 // not a dir
11307 dout(10) << "reply to " << *req << " snapdiff -CEPHFS_ENOTDIR" << dendl;
11308 respond_to_request(mdr, -CEPHFS_ENOTDIR);
11309 return;
11310 }
11311
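 // cap-acquisition throttle: a client that is already near
 // max_caps_per_client and still acquiring caps quickly gets its snapdiff
 // readdir delayed and retried instead of being granted more caps at once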
11312 auto num_caps = session->get_num_caps();
11313 auto session_cap_acquisition = session->get_cap_acquisition();
11314
11315 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
11316 dout(20) << "snapdiff throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
11317 << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
11318 if (logger)
11319 logger->inc(l_mdss_cap_acquisition_throttle);
11320
11321 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
11322 return;
11323 }
11324
11325 lov.add_rdlock(&diri->filelock);
11326 lov.add_rdlock(&diri->dirfragtreelock);
11327
11328 if (!mds->locker->acquire_locks(mdr, lov))
11329 return;
11330
11331 if (!check_access(mdr, diri, MAY_READ))
11332 return;
11333
11334 // which frag?
11335 frag_t fg = (__u32)req->head.args.snapdiff.frag;
11336 unsigned req_flags = (__u32)req->head.args.snapdiff.flags;
11337 string offset_str = req->get_path2();
11338
11339 __u32 offset_hash = 0;
11340 if (!offset_str.empty()) {
11341 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
11342 } else {
11343 offset_hash = (__u32)req->head.args.snapdiff.offset_hash;
11344 }
11345
11346 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
11347 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
11348
11349 // does the frag exist?
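 // (it may have been split or merged since the client learned about it, in
 // which case we redirect to the fragment currently covering the client's
 // position)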
11350 if (diri->dirfragtree[fg.value()] != fg) {
11351 frag_t newfg;
11352 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
11353 if (fg.contains((unsigned)offset_hash)) {
11354 newfg = diri->dirfragtree[offset_hash];
11355 } else {
11356 // client actually wants next frag
11357 newfg = diri->dirfragtree[fg.value()];
11358 }
11359 } else {
11360 offset_str.clear();
11361 newfg = diri->dirfragtree[fg.value()];
11362 }
11363 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
11364 fg = newfg;
11365 }
11366
11367 CDir* dir = try_open_auth_dirfrag(diri, fg, mdr);
11368 if (!dir) return;
11369
11370 // ok!
11371 dout(10) << __func__<< " on " << *dir << dendl;
11372 ceph_assert(dir->is_auth());
11373
11374 if (!dir->is_complete()) {
11375 if (dir->is_frozen()) {
11376 dout(7) << "dir is frozen " << *dir << dendl;
11377 mds->locker->drop_locks(mdr.get());
11378 mdr->drop_local_auth_pins();
11379 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
11380 return;
11381 }
11382 // fetch
11383 dout(10) << " incomplete dir contents for snapdiff on " << *dir << ", fetching" << dendl;
11384 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
11385 return;
11386 }
11387
11388 #ifdef MDS_VERIFY_FRAGSTAT
11389 dir->verify_fragstat();
11390 #endif
11391
11392 utime_t now = ceph_clock_now();
11393 mdr->set_mds_stamp(now);
11394
11395 mdr->snapid_diff_other = (uint64_t)req->head.args.snapdiff.snap_other;
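 // a snapdiff readdir only makes sense between two distinct snapshots;
 // CEPH_NOSNAP on either side means a live (non-snapshot) path was supplied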
11396 if (mdr->snapid_diff_other == mdr->snapid ||
11397 mdr->snapid == CEPH_NOSNAP ||
11398 mdr->snapid_diff_other == CEPH_NOSNAP) {
11399 dout(10) << "reply to " << *req << " snapdiff -CEPHFS_EINVAL" << dendl;
11400 respond_to_request(mdr, -CEPHFS_EINVAL);
 return; // fix: without returning, the handler would keep using mdr after replying
11401 }
11402
11403 dout(10) << __func__
11404 << " snap " << mdr->snapid
11405 << " vs. snap " << mdr->snapid_diff_other
11406 << dendl;
11407
11408 SnapRealm* realm = diri->find_snaprealm();
11409
11410 unsigned max = req->head.args.snapdiff.max_entries;
11411 if (!max)
11412 max = dir->get_num_any(); // whatever, something big.
11413 unsigned max_bytes = req->head.args.snapdiff.max_bytes;
11414 if (!max_bytes)
11415 // make sure at least one item can be encoded
11416 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
11417
11418 // start final blob
11419 bufferlist dirbl;
11420 DirStat ds;
11421 ds.frag = dir->get_frag();
11422 ds.auth = dir->get_dir_auth().first;
11423 if (dir->is_auth() && !forward_all_requests_to_auth)
11424 dir->get_dist_spec(ds.dist, mds->get_nodeid());
11425
11426 dir->encode_dirstat(dirbl, mdr->session->info, ds);
11427
11428 // count bytes available.
11429 // this isn't perfect, but we should capture the main variable/unbounded size items!
11430 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8) * 2;
11431 int bytes_left = max_bytes - front_bytes;
11432 bytes_left -= get_snap_trace(session, realm).length();
11433
11434 _readdir_diff(
11435 now,
11436 mdr,
11437 diri,
11438 dir,
11439 realm,
11440 max,
11441 bytes_left,
11442 offset_str,
11443 offset_hash,
11444 req_flags,
11445 dirbl);
11446}
11447
11448
11449 /**
11450 * Return true if server is in state RECONNECT and this
11451 * client has not yet reconnected.
11452 */
11453 bool Server::waiting_for_reconnect(client_t c) const
11454 {
11455 return client_reconnect_gather.count(c) > 0;
11456}
11457
11458 void Server::dump_reconnect_status(Formatter *f) const
11459 {
11460 f->open_object_section("reconnect_status");
11461 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
11462 f->close_section();
11463}
11464
11465 const bufferlist& Server::get_snap_trace(Session *session, SnapRealm *realm) const {
11466 ceph_assert(session);
11467 ceph_assert(realm);
11468 if (session->info.has_feature(CEPHFS_FEATURE_NEW_SNAPREALM_INFO)) {
11469 return realm->get_snap_trace_new();
11470 } else {
11471 return realm->get_snap_trace();
11472 }
11473}
11474
11475 const bufferlist& Server::get_snap_trace(client_t client, SnapRealm *realm) const {
11476 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
11477 return get_snap_trace(session, realm);
11478}
11479
11480void Server::_readdir_diff(
11481 utime_t now,
11482 MDRequestRef& mdr,
11483 CInode* diri,
11484 CDir* dir,
11485 SnapRealm* realm,
11486 unsigned max_entries,
11487 int bytes_left,
11488 const string& offset_str,
11489 uint32_t offset_hash,
11490 unsigned req_flags,
11491 bufferlist& dirbl)
11492{
11493 // build dir contents
11494 bufferlist dnbl;
11495 __u32 numfiles = 0;
11496
11497 snapid_t snapid = mdr->snapid;
11498 snapid_t snapid_prev = mdr->snapid_diff_other;
11499 if (snapid < snapid_prev) {
11500 std::swap(snapid, snapid_prev);
11501 }
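 // normalize so that 'snapid' is the newer snapshot and 'snapid_prev' the
 // older one, whichever order the client passed them in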
11502 bool from_the_beginning = !offset_hash && offset_str.empty();
11503 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
11504 dentry_key_t skip_key(snapid_prev, offset_str.c_str(), offset_hash);
11505
11506 bool end = build_snap_diff(
11507 mdr,
11508 dir,
11509 bytes_left,
11510 from_the_beginning ? nullptr : & skip_key,
11511 snapid_prev,
11512 snapid,
11513 dnbl,
11514 [&](CDentry* dn, CInode* in, bool exists) {
11515 string name;
11516 snapid_t effective_snapid;
11517 const auto& dn_name = dn->get_name();
11518 // provide the first snapid for removed entries and
11519 // the last one for existent ones
11520 effective_snapid = exists ? snapid : snapid_prev;
11521 name.append(dn_name);
11522 if ((int)(dnbl.length() + name.length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
11523 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
11524 return false;
11525 }
11526
11527 auto diri = dir->get_inode();
11528 auto hash = ceph_frag_value(diri->hash_dentry_name(dn_name));
11529 unsigned start_len = dnbl.length();
11530 dout(10) << "inc dn " << *dn << " as " << name
11531 << std::hex << " hash 0x" << hash << std::dec
11532 << dendl;
11533 encode(name, dnbl);
11534 mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
11535
11536 // inode
11537 dout(10) << "inc inode " << *in << " snap " << effective_snapid << dendl;
11538 int r = in->encode_inodestat(dnbl, mdr->session, realm, effective_snapid, bytes_left - (int)dnbl.length());
11539 if (r < 0) {
11540 // chop off dn->name, lease
11541 dout(10) << " ran out of room, stopping at "
11542 << start_len << " < " << bytes_left << dendl;
11543 bufferlist keep;
11544 keep.substr_of(dnbl, 0, start_len);
11545 dnbl.swap(keep);
11546 return false;
11547 }
11548
11549 // touch dn
11550 mdcache->lru.lru_touch(dn);
11551 ++numfiles;
11552 return true;
11553 });
11554
11555 __u16 flags = 0;
11556 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
11557 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
11558 }
11559
11560 std::swap(mdr->snapid, mdr->snapid_diff_other); // we want the other snapid to be used for tracei
11561
11562 _finalize_readdir(mdr, diri, dir, from_the_beginning, end, flags, numfiles,
11563 dirbl, dnbl);
11564}
11565
11566 bool Server::build_snap_diff(
11567 MDRequestRef& mdr,
11568 CDir* dir,
11569 int bytes_left,
11570 dentry_key_t* skip_key,
11571 snapid_t snapid_prev,
11572 snapid_t snapid,
11573 const bufferlist& dnbl,
11574 std::function<bool (CDentry*, CInode*, bool)> add_result_cb)
11575{
11576 client_t client = mdr->client_request->get_source().num();
11577
11578 struct EntryInfo {
11579 CDentry* dn = nullptr;
11580 CInode* in = nullptr;
11581 utime_t mtime;
11582
11583 void reset() {
11584 *this = EntryInfo();
11585 }
11586 } before;
11587
11588 auto insert_deleted = [&](EntryInfo& ei) {
11589 dout(20) << "build_snap_diff deleted file " << ei.dn->get_name() << " "
11590 << ei.dn->first << "/" << ei.dn->last << dendl;
11591 int r = add_result_cb(ei.dn, ei.in, false);
11592 ei.reset();
11593 return r;
11594 };
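 // core of the diff walk for plain files: an entry visible in the older
 // snapshot is buffered in 'before'; if the next dentry with the same name
 // reaches the newer snapshot the file may have been modified (compare
 // mtimes), otherwise the buffered entry is emitted as deleted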
11595
11596 auto it = !skip_key ? dir->begin() : dir->lower_bound(*skip_key);
11597
11598 while(it != dir->end()) {
11599 CDentry* dn = it->second;
11600 dout(20) << __func__ << " " << it->first << "->" << *dn << dendl;
11601 ++it;
11602 if (dn->state_test(CDentry::STATE_PURGING))
11603 continue;
11604
11605 bool dnp = dn->use_projected(client, mdr);
11606 CDentry::linkage_t* dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
11607
11608 if (dnl->is_null()) {
11609 dout(20) << __func__ << " linkage is null, skipping" << dendl;
11610 continue;
11611 }
11612
11613 if (dn->last < snapid_prev || dn->first > snapid) {
11614 dout(20) << __func__ << " not in range, skipping" << dendl;
11615 continue;
11616 }
11617 if (skip_key) {
11618 skip_key->snapid = dn->last;
11619 if (!(*skip_key < dn->key()))
11620 continue;
11621 }
11622
11623 CInode* in = dnl->get_inode();
11624 if (in && in->ino() == CEPH_INO_CEPH)
11625 continue;
11626
11627 // remote link?
11628 // better for the MDS to do the work, if we think the client will stat any of these files.
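 // if the inode isn't in cache we either cut the reply short (when caps or
 // leases were already issued) or drop locks and retry once the remote
 // dentry has been opened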
11629 if (dnl->is_remote() && !in) {
11630 in = mdcache->get_inode(dnl->get_remote_ino());
 // fix: dereference 'in' only after the null check; get_inode() may return nullptr
11631 if (in) {
11632 dout(20) << __func__ << " remote in: " << *in << " ino " << std::hex << dnl->get_remote_ino() << std::dec << dendl;
11633 dn->link_remote(dnl, in);
11634 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
11635 dout(10) << "skipping bad remote ino on " << *dn << dendl;
11636 continue;
11637 } else {
11638 // touch everything i _do_ have
11639 for (auto& p : *dir) {
11640 if (!p.second->get_linkage()->is_null())
11641 mdcache->lru.lru_touch(p.second);
11642 }
11643
11644 // already issued caps and leases, reply immediately.
11645 if (dnbl.length() > 0) {
11646 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
11647 dout(10) << " open remote dentry after caps were issued, stopping at "
11648 << dnbl.length() << " < " << bytes_left << dendl;
11649 } else {
11650 mds->locker->drop_locks(mdr.get());
11651 mdr->drop_local_auth_pins();
11652 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
11653 }
11654 return false;
11655 }
11656 }
11657 ceph_assert(in);
11658
11659 utime_t mtime = in->get_inode()->mtime;
11660
11661 if (in->is_dir()) {
11662
11663 // we need to maintain the order of entries (determined by their name hashes)
11664 // hence we must flush any buffered 'before' entry immediately.
11665 if (before.dn) {
11666 if (!insert_deleted(before)) {
11667 break;
11668 }
11669 }
11670
11671 bool exists = true;
11672 if (snapid_prev < dn->first && dn->last < snapid) {
11673 dout(20) << __func__ << " skipping inner " << dn->get_name() << " "
11674 << dn->first << "/" << dn->last << dendl;
11675 continue;
11676 } else if (dn->first <= snapid_prev && dn->last < snapid) {
11677 // dir deleted
11678 dout(20) << __func__ << " deleted dir " << dn->get_name() << " "
11679 << dn->first << "/" << dn->last << dendl;
11680 exists = false;
11681 }
11682 bool r = add_result_cb(dn, in, exists);
11683 if (!r) {
11684 break;
11685 }
11686 } else {
11687 if (snapid_prev >= dn->first && snapid <= dn->last) {
11688 dout(20) << __func__ << " skipping unchanged " << dn->get_name() << " "
11689 << dn->first << "/" << dn->last << dendl;
11690 continue;
11691 } else if (snapid_prev < dn->first && snapid > dn->last) {
11692 dout(20) << __func__ << " skipping inner modification " << dn->get_name() << " "
11693 << dn->first << "/" << dn->last << dendl;
11694 continue;
11695 }
11696 string_view name_before =
11697 before.dn ? string_view(before.dn->get_name()) : string_view();
11698 if (before.dn && dn->get_name() != name_before) {
11699 if (!insert_deleted(before)) {
11700 break;
11701 }
11702 before.reset();
11703 }
11704 if (snapid_prev >= dn->first && snapid_prev <= dn->last) {
11705 dout(30) << __func__ << " dn_before " << dn->get_name() << " "
11706 << dn->first << "/" << dn->last << dendl;
11707 before = EntryInfo {dn, in, mtime};
11708 continue;
11709 } else {
11710 if (before.dn && dn->get_name() == name_before) {
11711 if (mtime == before.mtime) {
11712 dout(30) << __func__ << " timestamp not changed " << dn->get_name() << " "
11713 << dn->first << "/" << dn->last
11714 << " " << mtime
11715 << dendl;
11716 before.reset();
11717 continue;
11718 } else {
11719 dout(30) << __func__ << " timestamp changed " << dn->get_name() << " "
11720 << dn->first << "/" << dn->last
11721 << " " << before.mtime << " vs. " << mtime
11722 << dendl;
11723 before.reset();
11724 }
11725 }
11726 dout(20) << __func__ << " new file " << dn->get_name() << " "
11727 << dn->first << "/" << dn->last
11728 << dendl;
11729 ceph_assert(snapid >= dn->first && snapid <= dn->last);
11730 }
11731 if (!add_result_cb(dn, in, true)) {
11732 break;
11733 }
11734 }
11735 }
11736 if (before.dn) {
11737 insert_deleted(before);
11738 }
11739 return it == dir->end();
11740}