// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation. See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/ceph_assert.h"  // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/range/adaptor/reversed.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"
#include "MetricsHandler.h"
#include "cephfs_features.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "events/EUpdate.h"
#include "events/EPeerUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"
#include "events/EPurged.h"

#include "include/stringify.h"
#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"

#include <errno.h>

#include <list>
#include <regex>
#include <string_view>
#include <functional>

#include "common/config.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "

using namespace std;

class ServerContext : public MDSContext {
  protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  public:
  explicit ServerContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
};

class Batch_Getattr_Lookup : public BatchOp {
protected:
  Server* server;
  ceph::ref_t<MDRequestImpl> mdr;
  std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
  int res = 0;
public:
  Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
    : server(s), mdr(r) {
    if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP)
      mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
    else
      mdr->batch_op_map = &mdr->in[0]->batch_ops;
  }
  void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
    batch_reqs.push_back(r);
  }
  ceph::ref_t<MDRequestImpl> find_new_head() override {
    while (!batch_reqs.empty()) {
      auto r = std::move(batch_reqs.back());
      batch_reqs.pop_back();
      if (r->killed)
        continue;

      r->batch_op_map = mdr->batch_op_map;
      mdr->batch_op_map = nullptr;
      mdr = r;
      return mdr;
    }
    return nullptr;
  }
  void _forward(mds_rank_t t) override {
    MDCache* mdcache = server->mdcache;
    mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
    mdr->set_mds_stamp(ceph_clock_now());
    for (auto& m : batch_reqs) {
      if (!m->killed)
        mdcache->request_forward(m, t);
    }
    batch_reqs.clear();
  }
  void _respond(int r) override {
    mdr->set_mds_stamp(ceph_clock_now());
    for (auto& m : batch_reqs) {
      if (!m->killed) {
        m->tracei = mdr->tracei;
        m->tracedn = mdr->tracedn;
        server->respond_to_request(m, r);
      }
    }
    batch_reqs.clear();
    server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
  }
  void print(std::ostream& o) {
    o << "[batch front=" << *mdr << "]";
  }
};

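// The batching above coalesces concurrent getattr/lookup requests on the same
// inode or dentry: the first request becomes the batch head and the rest are
// recorded as followers that reuse its trace. A minimal stand-alone sketch of
// the head-promotion logic, using a hypothetical plain Req type instead of
// MDRequestImpl (illustrative only, not part of the build):
#if 0
#include <memory>
#include <utility>
#include <vector>

struct Req {
  bool killed = false;
};

struct Batch {
  std::shared_ptr<Req> head;
  std::vector<std::shared_ptr<Req>> followers;

  // Promote the next surviving follower to head, discarding killed ones,
  // mirroring Batch_Getattr_Lookup::find_new_head().
  std::shared_ptr<Req> find_new_head() {
    while (!followers.empty()) {
      auto r = std::move(followers.back());
      followers.pop_back();
      if (r->killed)
        continue;
      head = std::move(r);
      return head;
    }
    return nullptr; // every follower was killed; the batch is finished
  }
};
#endif
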
class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    ceph_assert(server != NULL);
  }
};

void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);

  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_peer_request, "handle_peer_request",
                      "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_client_session,
                      "handle_client_session", "Client session messages", "hcs",
                      PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
                      "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
                      "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
                      PerfCountersBuilder::PRIO_INTERESTING);

  // fop latencies are useful
  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
                   "Request type lookup hash of inode latency");
  plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
                   "Request type lookup inode latency");
  plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
                   "Request type lookup parent latency");
  plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
                   "Request type lookup name latency");
  plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
                   "Request type lookup latency");
  plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
                   "Request type lookup snapshot latency");
  plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
                   "Request type get attribute latency");
  plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
                   "Request type set attribute latency");
  plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
                   "Request type set file layout latency");
  plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
                   "Request type set directory layout latency");
  plb.add_time_avg(l_mdss_req_getvxattr_latency, "req_getvxattr_latency",
                   "Request type get virtual extended attribute latency");
  plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
                   "Request type set extended attribute latency");
  plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
                   "Request type remove extended attribute latency");
  plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
                   "Request type read directory latency");
  plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
                   "Request type set file lock latency");
  plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
                   "Request type get file lock latency");
  plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
                   "Request type create latency");
  plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
                   "Request type open latency");
  plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
                   "Request type make node latency");
  plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
                   "Request type link latency");
  plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
                   "Request type unlink latency");
  plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
                   "Request type remove directory latency");
  plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
                   "Request type rename latency");
  plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
                   "Request type make directory latency");
  plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
                   "Request type symbolic link latency");
  plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
                   "Request type list snapshot latency");
  plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
                   "Request type make snapshot latency");
  plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
                   "Request type remove snapshot latency");
  plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
                   "Request type rename snapshot latency");

  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
                      "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_peer_request, "dispatch_server_request",
                      "Server requests dispatched");

  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}

Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
  metrics_handler(metrics_handler)
{
  forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
  replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
  cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
  max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
  delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
  max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
  max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
  caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
  dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
  bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
  supported_metric_spec = feature_bitset_t(CEPHFS_METRIC_FEATURES_ALL);
}

void Server::dispatch(const cref_t<Message> &m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(ref_cast<MClientReconnect>(m));
    return;
  }

/*
 * In the reconnect phase, clients may have sent unsafe requests to the mds
 * before their reconnect msg. Setting sessionclosed_isok handles scenarios
 * like this:
 *
 * 1. In the reconnect phase, a client sent unsafe requests to the mds.
 * 2. The reconnect timeout was reached. All sessions that did not send a
 *    reconnect msg in time, some of which may have sent unsafe requests,
 *    are marked as closed. (Another situation is #31668, which denies all
 *    client reconnect msgs to speed up reboot.)
 * 3. So these unsafe requests, from sessions that did not send a reconnect
 *    msg in time or that were denied, can still be handled in the
 *    clientreplay phase.
 */
  bool sessionclosed_isok = replay_unsafe_with_closed_session;
  // active?
  // handle_peer_request()/handle_client_session() will wait if necessary
  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
    const auto &req = ref_cast<MClientRequest>(m);
    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
      Session *session = mds->get_session(req);
      if (!session || (!session->is_open() && !sessionclosed_isok)) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        return;
      }
      bool queue_replay = false;
      if (req->is_replay() || req->is_async()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
        if (req->head.ino &&
            !session->have_completed_request(req->get_reqid().tid, nullptr)) {
          inodeno_t ino(req->head.ino);
          mdcache->add_replay_ino_alloc(ino);
          if (replay_unsafe_with_closed_session &&
              session->free_prealloc_inos.contains(ino)) {
            // don't purge inodes that will be created by later replay
            session->free_prealloc_inos.erase(ino);
            session->delegated_inos.insert(ino);
          }
        }
      } else if (req->get_retry_attempt()) {
        // Process completed requests in the clientreplay stage. A completed
        // request might have created new files/directories. This guarantees
        // that the MDS sends a reply to the client before another request
        // modifies the new files/directories.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (mds->is_stopping()) {
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      if (req->is_queued_for_replay()) {
        wait_for_active = false;
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(ref_cast<MClientRequest>(m));
    return;
  case CEPH_MSG_CLIENT_RECLAIM:
    handle_client_reclaim(ref_cast<MClientReclaim>(m));
    return;
  case MSG_MDS_PEER_REQUEST:
    handle_peer_request(ref_cast<MMDSPeerRequest>(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    ceph_abort_msg("server unknown message");
  }
}


// ----------------------------------------------------------
// SESSION management

class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos_to_free;
  version_t inotablev;
  interval_set<inodeno_t> inos_to_purge;
  LogSegment *ls = nullptr;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = nullptr) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
                       const interval_set<inodeno_t>& to_free, version_t iv,
                       const interval_set<inodeno_t>& to_purge, LogSegment *_ls, Context *fin_ = nullptr) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv),
    inos_to_free(to_free), inotablev(iv), inos_to_purge(to_purge), ls(_ls), fin(fin_) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos_to_free, inotablev, inos_to_purge, ls);
    if (fin) {
      fin->complete(r);
    }
  }
};

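// C_MDS_session_finish applies the session state change only after the
// ESession journal entry is safe. A minimal sketch of that journal-then-apply
// pattern, with std::function standing in for MDLog/Context and
// apply_session_close() as a hypothetical placeholder (illustrative only,
// not part of the build):
#if 0
#include <cassert>
#include <functional>
#include <queue>

struct MiniLog {
  std::queue<std::function<void(int)>> completions;

  // journal an entry; on_safe runs once the entry is durable
  void submit_entry(std::function<void(int)> on_safe) {
    completions.push(std::move(on_safe));
  }
  // pretend the journaled entries just became safe
  void flush() {
    while (!completions.empty()) {
      completions.front()(0); // r == 0: journaling succeeded
      completions.pop();
    }
  }
};

// usage: mutate shared session state only inside the completion, e.g.
//   MiniLog log;
//   log.submit_entry([&](int r) { assert(r == 0); apply_session_close(); });
//   log.flush();
#endif
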
Session* Server::find_session_by_uuid(std::string_view uuid)
{
  Session* session = nullptr;
  for (auto& it : mds->sessionmap.get_sessions()) {
    auto& metadata = it.second->info.client_metadata;

    auto p = metadata.find("uuid");
    if (p == metadata.end() || p->second != uuid)
      continue;

    if (!session) {
      session = it.second;
    } else if (!session->reclaiming_from) {
      ceph_assert(it.second->reclaiming_from == session);
      session = it.second;
    } else {
      ceph_assert(session->reclaiming_from == it.second);
    }
  }
  return session;
}

void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
{
  if (!session->is_open() && !session->is_stale()) {
    dout(10) << "session not open, dropping this req" << dendl;
    return;
  }

  auto reply = make_message<MClientReclaimReply>(0);
  if (m->get_uuid().empty()) {
    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
    reply->set_result(-CEPHFS_EINVAL);
    mds->send_message_client(reply, session);
    return;
  }

  unsigned flags = m->get_flags();
  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
    dout(10) << __func__ << " unsupported flags" << dendl;
    reply->set_result(-CEPHFS_EOPNOTSUPP);
    mds->send_message_client(reply, session);
    return;
  }

  Session* target = find_session_by_uuid(m->get_uuid());
  if (target) {
    if (session->info.auth_name != target->info.auth_name) {
      dout(10) << __func__ << " session auth_name " << session->info.auth_name
               << " != target auth_name " << target->info.auth_name << dendl;
      reply->set_result(-CEPHFS_EPERM);
      mds->send_message_client(reply, session);
    }

    ceph_assert(!target->reclaiming_from);
    ceph_assert(!session->reclaiming_from);
    session->reclaiming_from = target;
    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
  }

  if (flags & CEPH_RECLAIM_RESET) {
    finish_reclaim_session(session, reply);
    return;
  }

  ceph_abort();
}

void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
{
  Session *target = session->reclaiming_from;
  if (target) {
    session->reclaiming_from = nullptr;

    Context *send_reply;
    if (reply) {
      int64_t session_id = session->get_client().v;
      send_reply = new LambdaContext([this, session_id, reply](int r) {
        ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
        Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
        if (!session) {
          return;
        }
        auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
        reply->set_epoch(epoch);
        mds->send_message_client(reply, session);
      });
    } else {
      send_reply = nullptr;
    }

    bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
      return map.is_blocklisted(target->info.inst.addr);
    });

    if (blocklisted || !g_conf()->mds_session_blocklist_on_evict) {
      kill_session(target, send_reply);
    } else {
      CachedStackStringStream css;
      mds->evict_client(target->get_client().v, false, true, *css, send_reply);
    }
  } else if (reply) {
    mds->send_message_client(reply, session);
  }
}

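// finish_reclaim_session() and the handlers above consult the OSDMap only
// through objecter->with_osdmap(), i.e. by passing a closure that runs while
// the map is valid. A generic sketch of that access pattern (illustrative
// only, not part of the build; the real Objecter interface differs):
#if 0
#include <mutex>

template <typename Map>
struct Guarded {
  // run fn against the map under the lock and hand back its result,
  // so callers never hold a reference to the map itself
  template <typename Fn>
  auto with_map(Fn&& fn) const {
    std::lock_guard<std::mutex> l(lock);
    return fn(map);
  }

  Map map;
  mutable std::mutex lock;
};
#endif
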
void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
{
  Session *session = mds->get_session(m);
  dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    return;
  }

  std::string_view fs_name = mds->mdsmap->get_fs_name();
  if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
    dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
    return;
  }

  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
    finish_reclaim_session(session);
  } else {
    reclaim_session(session, m);
  }
}

void Server::handle_client_session(const cref_t<MClientSession> &m)
{
  version_t pv;
  Session *session = mds->get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
    reply->metadata["error_string"] = "sessionless";
    mds->send_message(reply, m->get_connection());
    return;
  }

  std::string_view fs_name = mds->mdsmap->get_fs_name();
  if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
    dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
    reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
                                      std::string(fs_name) + "\"";
    mds->send_message(std::move(reply), m->get_connection());
    return;
  }

  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
    // close requests need to be handled when mds is active
    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  } else {
    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing() ||
        terminating_sessions) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      return;
    }
    ceph_assert(session->is_closed() || session->is_closing());

    if (mds->is_stopping()) {
      dout(10) << "mds is stopping, dropping open req" << dendl;
      return;
    }

    {
      auto& addr = session->info.inst.addr;
      session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
      auto& client_metadata = session->info.client_metadata;

      auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
        auto now = ceph_clock_now();
        auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
        auto elapsed = now - m->get_recv_stamp();
        CachedStackStringStream css;
        *css << "New client session:"
             << " addr=\"" << session->info.inst.addr << "\""
             << ",elapsed=" << elapsed
             << ",throttled=" << throttle_elapsed
             << ",status=\"" << status << "\"";
        if (!err.empty()) {
          *css << ",error=\"" << err << "\"";
        }
        const auto& metadata = session->info.client_metadata;
        if (auto it = metadata.find("root"); it != metadata.end()) {
          *css << ",root=\"" << it->second << "\"";
        }
        dout(2) << css->strv() << dendl;
      };

      auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str, unsigned flags=0) {
        auto m = make_message<MClientSession>(CEPH_SESSION_REJECT, 0, flags);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          m->metadata["error_string"] = err_str;
        mds->send_message_client(m, session);
        log_session_status("REJECTED", err_str);
      };

      bool blocklisted = mds->objecter->with_osdmap(
          [&addr](const OSDMap &osd_map) -> bool {
            return osd_map.is_blocklisted(addr);
          });

      if (blocklisted) {
        dout(10) << "rejecting blocklisted client " << addr << dendl;
        // This goes on the wire and the "blacklisted" substring is
        // depended upon by the kernel client for detecting whether it
        // has been blocklisted. If mounted with recover_session=clean
        // (since 5.4), it tries to automatically recover itself from
        // blocklisting.
        unsigned flags = 0;
        flags |= MClientSession::SESSION_BLOCKLISTED;
        send_reject_message("blocklisted (blacklisted)", flags);
        session->clear();
        break;
      }

      if (client_metadata.features.empty())
        infer_supported_features(session, client_metadata);

      dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
      dout(20) << "  features: '" << client_metadata.features << "'" << dendl;
      dout(20) << "  metric specification: [" << client_metadata.metric_spec << "]" << dendl;
      for (const auto& p : client_metadata) {
        dout(20) << "  " << p.first << ": " << p.second << dendl;
      }

      feature_bitset_t missing_features = required_client_features;
      missing_features -= client_metadata.features;
      if (!missing_features.empty()) {
        CachedStackStringStream css;
        *css << "missing required features '" << missing_features << "'";
        send_reject_message(css->strv());
        mds->clog->warn() << "client session (" << session->info.inst
                          << ") lacks required features " << missing_features
                          << "; client supports " << client_metadata.features;
        session->clear();
        break;
      }

      // Special case for the 'root' metadata path; validate that the claimed
      // root is actually within the caps of the session
      if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
        auto claimed_root = it->second;
        CachedStackStringStream css;
        bool denied = false;
        // claimed_root has a leading "/" which we strip before passing
        // into caps check
        if (claimed_root.empty() || claimed_root[0] != '/') {
          denied = true;
          *css << "invalid root '" << claimed_root << "'";
        } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
          denied = true;
          *css << "non-allowable root '" << claimed_root << "'";
        }

        if (denied) {
          // Tell the client we're rejecting their open
          send_reject_message(css->strv());
          mds->clog->warn() << "client session with " << css->strv()
                            << " denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
        if (find_session_by_uuid(it->second)) {
          send_reject_message("duplicated session uuid");
          mds->clog->warn() << "client session with duplicated session uuid '"
                            << it->second << "' denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (session->is_closed()) {
        mds->sessionmap.add_session(session);
      }

      pv = mds->sessionmap.mark_projected(session);
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      mds->sessionmap.touch_session(session);
      auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
        ceph_assert(r == 0);
        log_session_status("ACCEPTED", "");
      });
      mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
                                new C_MDS_session_finish(this, session, sseq, true, pv, fin));
      mdlog->flush();
    }
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() || session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
      mds->send_message_client(reply, session);
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        return;
      }
      ceph_assert(session->is_open() ||
                  session->is_stale() ||
                  session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle the same as any other seqn error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    if (mds->is_active())
      mdlog->flush();
    break;

  default:
    ceph_abort();
  }
}

void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
  if (!session->is_open() ||
      !session->get_connection() ||
      !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
    return;
  }

  version_t seq = session->wait_for_flush(gather.new_sub());
  mds->send_message_client(
    make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
}

void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
  for (const auto& client : client_set) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
    ceph_assert(session);
    flush_session(session, gather);
  }
}

void Server::finish_flush_session(Session *session, version_t seq)
{
  MDSContext::vec finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}
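
// flush_session() takes one sub-context per session from an MDSGatherBuilder;
// the gather's finisher fires once every sub has completed. A simplified
// stand-alone sketch of that gather/barrier idea (illustrative only, not part
// of the build; unlike the real MDSGatherBuilder there is no separate
// activate() step, so subs must not complete before all are created):
#if 0
#include <functional>
#include <memory>

struct Gather {
  explicit Gather(std::function<void()> fin)
    : state(std::make_shared<State>(std::move(fin))) {}

  // hand out one completion per pending operation
  std::function<void()> new_sub() {
    ++state->pending;
    auto s = state;
    return [s] {
      if (--s->pending == 0)
        s->fin(); // last sub completed: run the finisher
    };
  }

private:
  struct State {
    explicit State(std::function<void()> f) : fin(std::move(f)) {}
    std::function<void()> fin;
    int pending = 0;
  };
  std::shared_ptr<State> state;
};
#endif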

void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             const interval_set<inodeno_t>& inos_to_free, version_t piv,
                             const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
{
  dout(10) << "_session_logged " << session->info.inst
           << " state_seq " << state_seq
           << " " << (open ? "open":"close") << " " << pv
           << " inos_to_free " << inos_to_free << " inotablev " << piv
           << " inos_to_purge " << inos_to_purge << dendl;

  if (!open) {
    if (inos_to_purge.size()) {
      ceph_assert(ls);
      session->info.prealloc_inos.subtract(inos_to_purge);
      ls->purging_inodes.insert(inos_to_purge);
      if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())
        mdcache->purge_inodes(inos_to_purge, ls);
    }

    if (inos_to_free.size()) {
      ceph_assert(piv);
      ceph_assert(session->is_closing() || session->is_killing() ||
                  session->is_opening()); // re-open closing session
      session->info.prealloc_inos.subtract(inos_to_free);
      mds->inotable->apply_release_ids(inos_to_free);
      ceph_assert(mds->inotable->get_version() == piv);
    }
    session->free_prealloc_inos = session->info.prealloc_inos;
    session->delegated_inos.clear();
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    ceph_assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    metrics_handler->add_session(session);
    ceph_assert(session->get_connection());
    auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
      reply->supported_features = supported_features;
      reply->metric_spec = supported_metric_spec;
    }
    mds->send_message_client(reply, session);
    if (mdcache->is_readonly()) {
      auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
      mds->send_message_client(m, session);
    }
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    bool killing = session->is_killing();
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, cap, killing);
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }
    if (client_reclaim_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reclaim set" << dendl;
      if (client_reclaim_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
        mds->maybe_clientreplay_done();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->get_connection()) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->get_connection()->mark_disposable();
      }

      // reset session
      mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      metrics_handler->remove_session(session);
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->get_connection()) {
        session->get_connection()->mark_down();
        mds->sessionmap.set_state(session, Session::STATE_CLOSED);
        session->set_connection(nullptr);
      }
      metrics_handler->remove_session(session);
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t,client_metadata_t>& cmm,
                                              map<client_t, pair<Session*,uint64_t> >& smap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;

  mds->objecter->with_osdmap(
      [this, &cm, &cmm](const OSDMap &osd_map) {
        for (auto p = cm.begin(); p != cm.end(); ) {
          if (osd_map.is_blocklisted(p->second.addr)) {
            dout(10) << " ignoring blocklisted client." << p->first
                     << " (" << p->second.addr << ")" << dendl;
            cmm.erase(p->first);
            cm.erase(p++);
          } else {
            ++p;
          }
        }
      });

  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    uint64_t sseq;
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing()) {
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      auto q = cmm.find(p->first);
      if (q != cmm.end())
        session->info.client_metadata.merge(q->second);
    } else {
      ceph_assert(session->is_open() ||
                  session->is_opening() ||
                  session->is_stale());
      sseq = 0;
    }
    smap[p->first] = make_pair(session, sseq);
    session->inc_importing();
  }
  return pv;
}

void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  for (auto &it : smap) {
    Session *session = it.second.first;
    uint64_t sseq = it.second.second;
    if (sseq > 0) {
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);
        metrics_handler->add_session(session);

        auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
          reply->supported_features = supported_features;
          reply->metric_spec = supported_metric_spec;
        }
        mds->send_message_client(reply, session);

        if (mdcache->is_readonly())
          mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      ceph_assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
  public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(5) << "terminating all sessions..." << dendl;

  terminating_sessions = true;

  // kill them off. clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}


void Server::find_idle_sessions()
{
  auto now = clock::now();
  auto last_cleared_laggy = mds->last_cleared_laggy();

  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;

  // timeout/stale
  // (caps go stale, leases die)
  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();

  // don't kick clients if we've been laggy
  if (last_cleared_laggy < cutoff) {
    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
             << "), not marking any client stale" << dendl;
    return;
  }

  std::vector<Session*> to_evict;

  bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
    std::vector<Session*> new_stale;

    for (auto session : *(sessions_p1->second)) {
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "laggiest active session is " << session->info.inst
                 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
        break;
      }

      if (session->last_seen > session->last_cap_renew) {
        last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
        if (last_cap_renew_span < cutoff) {
          dout(20) << "laggiest active session is " << session->info.inst
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }
      }

      if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
        dout(20) << "evicting session " << session->info.inst << " since autoclose "
                    "has arrived" << dendl;
        // evict session without marking it stale
        to_evict.push_back(session);
        continue;
      }

      if (defer_session_stale &&
          !session->is_any_flush_waiter() &&
          !mds->locker->is_revoking_any_caps_from(session->get_client())) {
        dout(20) << "deferring marking session " << session->info.inst << " stale "
                    "since it holds no caps" << dendl;
        continue;
      }

      auto it = session->info.client_metadata.find("timeout");
      if (it != session->info.client_metadata.end()) {
        unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
        if (timeout == 0) {
          dout(10) << "skipping session " << session->info.inst
                   << ", infinite timeout specified" << dendl;
          continue;
        }
        double cutoff = queue_max_age + timeout;
        if (last_cap_renew_span < cutoff) {
          dout(10) << "skipping session " << session->info.inst
                   << ", timeout (" << timeout << ") specified"
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }

        // do not go through stale, evict it directly.
        to_evict.push_back(session);
      } else {
        dout(10) << "new stale session " << session->info.inst
                 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
        new_stale.push_back(session);
      }
    }

    for (auto session : new_stale) {
      mds->sessionmap.set_state(session, Session::STATE_STALE);
      if (mds->locker->revoke_stale_caps(session)) {
        mds->locker->remove_stale_leases(session);
        finish_flush_session(session, session->get_push_seq());
        auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
        mds->send_message_client(m, session);
      } else {
        to_evict.push_back(session);
      }
    }
  }

  // autoclose
  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();

  // Collect a list of sessions exceeding the autoclose threshold
  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
    for (auto session : *(sessions_p2->second)) {
      ceph_assert(session->is_stale());
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "oldest stale session is " << session->info.inst
                 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
        break;
      }
      to_evict.push_back(session);
    }
  }

  for (auto session: to_evict) {
    if (session->is_importing()) {
      dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
      continue;
    }

    auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << last_cap_renew_span << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst
             << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;

    if (g_conf()->mds_session_blocklist_on_timeout) {
      CachedStackStringStream css;
      mds->evict_client(session->get_client().v, false, true, *css, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}
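
// Worked example of the cutoff arithmetic in find_idle_sessions(), with
// assumed values (mds_session_timeout = 60s, mds_session_autoclose = 300s,
// dispatch queue age = 5s): a session idle for 70s exceeds the stale cutoff
// (5 + 60 = 65s) and is marked stale, while one idle for 310s exceeds the
// autoclose cutoff (5 + 300 = 305s) and is evicted outright. Illustrative
// only, not part of the build:
#if 0
double queue_max_age = 5.0;        // assumed dispatch queue age
double session_timeout = 60.0;     // assumed mds_session_timeout
double session_autoclose = 300.0;  // assumed mds_session_autoclose

double stale_cutoff = queue_max_age + session_timeout;        // 65.0
double autoclose_cutoff = queue_max_age + session_autoclose;  // 305.0
#endif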

void Server::evict_cap_revoke_non_responders() {
  if (!cap_revoke_eviction_timeout) {
    return;
  }

  auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);

  for (auto const &client: to_evict) {
    mds->clog->warn() << "client id " << client << " has not responded to"
                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
                      << " seconds, evicting";
    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
            << client << dendl;

    CachedStackStringStream css;
    bool evicted = mds->evict_client(client.v, false,
                                     g_conf()->mds_session_blocklist_on_evict,
                                     *css, nullptr);
    if (evicted && logger) {
      logger->inc(l_mdss_cap_revoke_eviction);
    }
  }
}

void Server::handle_conf_change(const std::set<std::string>& changed) {
  if (changed.count("mds_forward_all_requests_to_auth")) {
    forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
  }
  if (changed.count("mds_cap_revoke_eviction_timeout")) {
    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
    dout(20) << __func__ << " cap revoke eviction timeout changed to "
             << cap_revoke_eviction_timeout << dendl;
  }
  if (changed.count("mds_recall_max_decay_rate")) {
    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
  }
  if (changed.count("mds_max_snaps_per_dir")) {
    max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
    dout(20) << __func__ << " max snapshots per directory changed to "
             << max_snaps_per_dir << dendl;
  }
  if (changed.count("mds_client_delegate_inos_pct")) {
    delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
  }
  if (changed.count("mds_max_caps_per_client")) {
    max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  }
  if (changed.count("mds_session_cap_acquisition_throttle")) {
    cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
  }
  if (changed.count("mds_session_max_caps_throttle_ratio")) {
    max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
  }
  if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
    caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
  }
  if (changed.count("mds_alternate_name_max")) {
    alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
  }
  if (changed.count("mds_dir_max_entries")) {
    dir_max_entries = g_conf().get_val<uint64_t>("mds_dir_max_entries");
    dout(20) << __func__ << " max entries per directory changed to "
             << dir_max_entries << dendl;
  }
  if (changed.count("mds_bal_fragment_size_max")) {
    bal_fragment_size_max = g_conf().get_val<int64_t>("mds_bal_fragment_size_max");
    dout(20) << __func__ << " max fragment size changed to "
             << bal_fragment_size_max << dendl;
  }
}

/*
 * XXX bump in the interface here, not using an MDSContext here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe)
{
  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    if (session->is_closing() ||
        session->is_killing()) {
      if (on_safe)
        mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
    } else {
      ceph_assert(session->is_closed() ||
                  session->is_importing());
      if (on_safe)
        on_safe->complete(0);
    }
  }
}

size_t Server::apply_blocklist()
{
  std::vector<Session*> victims;
  const auto& sessions = mds->sessionmap.get_sessions();
  mds->objecter->with_osdmap(
    [&](const OSDMap& o) {
      for (const auto& p : sessions) {
        if (!p.first.is_client()) {
          // Do not apply OSDMap blocklist to MDS daemons, we find out
          // about their death via MDSMap.
          continue;
        }
        if (o.is_blocklisted(p.second->info.inst.addr)) {
          victims.push_back(p.second);
        }
      }
    });

  for (const auto& s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blocklist: killed " << victims.size() << dendl;

  return victims.size();
}

void Server::journal_close_session(Session *session, int state, Context *on_safe)
{
  dout(10) << __func__ << " : "
           << session->info.inst
           << " pending_prealloc_inos " << session->pending_prealloc_inos
           << " free_prealloc_inos " << session->free_prealloc_inos
           << " delegated_inos " << session->delegated_inos << dendl;

  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> inos_to_free;
  inos_to_free.insert(session->pending_prealloc_inos);
  inos_to_free.insert(session->free_prealloc_inos);
  if (inos_to_free.size()) {
    mds->inotable->project_release_ids(inos_to_free);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
  auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
                                      session->delegated_inos, mdlog->get_current_segment(), on_safe);
  mdlog->start_submit_entry(le, fin);
  mdlog->flush();

  // clean up requests, too
  while (!session->requests.empty()) {
    auto mdr = MDRequestRef(*session->requests.begin());
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}

void Server::reconnect_clients(MDSContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;

  auto now = clock::now();
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto session : sessions) {
    if (session->is_open()) {
      client_reconnect_gather.insert(session->get_client());
      session->set_reconnecting(true);
      session->last_cap_renew = now;
    }
  }

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = now;
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

9f95a23c 1403void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
7c673cae 1404{
11fdf7f2
TL
1405 dout(7) << "handle_client_reconnect " << m->get_source()
1406 << (m->has_more() ? " (more)" : "") << dendl;
7c673cae 1407 client_t from = m->get_source().num();
94b18763 1408 Session *session = mds->get_session(m);
92f5a8d4
TL
1409 if (!session) {
1410 dout(0) << " ignoring sessionless msg " << *m << dendl;
9f95a23c 1411 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
92f5a8d4
TL
1412 reply->metadata["error_string"] = "sessionless";
1413 mds->send_message(reply, m->get_connection());
81eedcae 1414 return;
92f5a8d4
TL
1415 }
1416
1417 if (!session->is_open()) {
1418     dout(0) << " ignoring msg from not-open session " << *m << dendl;
9f95a23c 1419 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
92f5a8d4
TL
1420 mds->send_message(reply, m->get_connection());
1421 return;
1422 }
7c673cae 1423
f67539c2
TL
1424 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1425
7c673cae
FG
1426 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1427 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1428 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1429 return;
1430 }
1431
f64942e4 1432 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
7c673cae
FG
1433 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1434
1435 bool deny = false;
f67539c2 1436 if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
7c673cae 1437 // XXX maybe in the future we can do better than this?
f67539c2
TL
1438 if (reconnect_all_deny) {
1439 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
1440 } else {
1441 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1442 }
7c673cae
FG
1443 mds->clog->info() << "denied reconnect attempt (mds is "
1444 << ceph_mds_state_name(mds->get_state())
1445 << ") from " << m->get_source_inst()
11fdf7f2 1446 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
7c673cae 1447 deny = true;
11fdf7f2
TL
1448 } else {
1449 std::string error_str;
1450 if (!session->is_open()) {
1451 error_str = "session is closed";
1452 } else if (mdcache->is_readonly()) {
1453 error_str = "mds is readonly";
1454 } else {
1455 if (session->info.client_metadata.features.empty())
1456 infer_supported_features(session, session->info.client_metadata);
1457
1458 feature_bitset_t missing_features = required_client_features;
1459 missing_features -= session->info.client_metadata.features;
1460 if (!missing_features.empty()) {
f67539c2
TL
1461 CachedStackStringStream css;
1462 *css << "missing required features '" << missing_features << "'";
1463 error_str = css->strv();
11fdf7f2
TL
1464 }
1465 }
1466
1467 if (!error_str.empty()) {
1468 deny = true;
1469 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1470 mds->clog->info() << "denied reconnect attempt from "
1471 << m->get_source_inst() << " (" << error_str << ")";
1472 }
7c673cae
FG
1473 }
1474
1475 if (deny) {
9f95a23c 1476 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
11fdf7f2 1477 mds->send_message_client(r, session);
f67539c2
TL
1478 if (session->is_open()) {
1479 client_reconnect_denied.insert(session->get_client());
1480 }
7c673cae
FG
1481 return;
1482 }
1483
11fdf7f2 1484 if (!m->has_more()) {
f67539c2 1485 metrics_handler->add_session(session);
11fdf7f2 1486 // notify client of success with an OPEN
9f95a23c 1487 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
33c7a0ef 1488 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC)) {
11fdf7f2 1489 reply->supported_features = supported_features;
33c7a0ef
TL
1490 reply->metric_spec = supported_metric_spec;
1491 }
11fdf7f2
TL
1492 mds->send_message_client(reply, session);
1493 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1494 }
1495
91327a77 1496 session->last_cap_renew = clock::now();
7c673cae
FG
1497
1498 // snaprealms
11fdf7f2
TL
1499 for (const auto &r : m->realms) {
1500 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
7c673cae
FG
1501 if (in && in->state_test(CInode::STATE_PURGING))
1502 continue;
1503 if (in) {
11fdf7f2
TL
1504 if (in->snaprealm) {
1505 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
7c673cae 1506 } else {
11fdf7f2
TL
1507 // this can happen if we are non-auth or we rollback snaprealm
1508 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
7c673cae 1509 }
11fdf7f2 1510 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
7c673cae 1511 } else {
11fdf7f2
TL
1512 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1513 << " seq " << r.realm.seq << dendl;
1514 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
7c673cae
FG
1515 }
1516 }
1517
1518 // caps
11fdf7f2 1519 for (const auto &p : m->caps) {
7c673cae 1520 // make sure our last_cap_id is MAX over all issued caps
11fdf7f2
TL
1521 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1522 mdcache->last_cap_id = p.second.capinfo.cap_id;
7c673cae 1523
11fdf7f2 1524 CInode *in = mdcache->get_inode(p.first);
7c673cae
FG
1525 if (in && in->state_test(CInode::STATE_PURGING))
1526 continue;
1527 if (in && in->is_auth()) {
1528 // we recovered it, and it's ours. take note.
11fdf7f2 1529 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
7c673cae 1530 << " on " << *in << dendl;
11fdf7f2
TL
1531 in->reconnect_cap(from, p.second, session);
1532 mdcache->add_reconnected_cap(from, p.first, p.second);
1533 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
7c673cae
FG
1534 continue;
1535 }
1536
1537 if (in && !in->is_auth()) {
1538 // not mine.
1539 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1540 // add to cap export list.
11fdf7f2
TL
1541 mdcache->rejoin_export_caps(p.first, from, p.second,
1542 in->authority().first, true);
7c673cae
FG
1543 } else {
1544 // don't know if the inode is mine
11fdf7f2
TL
1545 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1546 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
7c673cae
FG
1547 }
1548 }
1549
f64942e4
AA
1550 reconnect_last_seen = clock::now();
1551
11fdf7f2
TL
1552 if (!m->has_more()) {
1553 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1554
1555 // remove from gather set
1556 client_reconnect_gather.erase(from);
92f5a8d4 1557 session->set_reconnecting(false);
11fdf7f2
TL
1558 if (client_reconnect_gather.empty())
1559 reconnect_gather_finish();
1560 }
1561}
1562
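/*
 * Bookkeeping note for the deny path above: denied-but-still-open clients
 * are remembered in client_reconnect_denied. reconnect_tick() compares that
 * set against client_reconnect_gather, so with mds_deny_all_reconnect set
 * the reconnect phase can end as soon as every remaining client has been
 * denied rather than waiting out mds_reconnect_timeout.
 */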
1563void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1564{
1565 int supported = -1;
1566 auto it = client_metadata.find("ceph_version");
1567 if (it != client_metadata.end()) {
1568 // user space client
1569 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1570 supported = CEPHFS_FEATURE_LUMINOUS;
1571 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1572 supported = CEPHFS_FEATURE_KRAKEN;
1573 } else {
1574 it = client_metadata.find("kernel_version");
1575 if (it != client_metadata.end()) {
1576 // kernel client
1577 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1578 supported = CEPHFS_FEATURE_LUMINOUS;
1579 }
1580 }
1581 if (supported == -1 &&
1582 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1583 supported = CEPHFS_FEATURE_JEWEL;
7c673cae 1584
11fdf7f2
TL
1585 if (supported >= 0) {
1586 unsigned long value = (1UL << (supported + 1)) - 1;
1587 client_metadata.features = feature_bitset_t(value);
1588 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1589 }
7c673cae
FG
1590}
1591
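/*
 * The mask arithmetic above turns a single inferred feature ordinal into
 * "this client has every feature up to and including that one":
 * (1UL << (supported + 1)) - 1 sets bits 0..supported. A worked example
 * with supported == 3:
 *
 *   (1UL << 4) - 1 == 0b1111   // bits 0,1,2,3 set
 */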
11fdf7f2
TL
1592void Server::update_required_client_features()
1593{
f67539c2 1594 required_client_features = mds->mdsmap->get_required_client_features();
11fdf7f2
TL
1595 dout(7) << "required_client_features: " << required_client_features << dendl;
1596
1597 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1598 set<Session*> sessions;
1599 mds->sessionmap.get_client_session_set(sessions);
1600 for (auto session : sessions) {
1601 feature_bitset_t missing_features = required_client_features;
1602 missing_features -= session->info.client_metadata.features;
1603 if (!missing_features.empty()) {
f67539c2 1604 bool blocklisted = mds->objecter->with_osdmap(
11fdf7f2 1605 [session](const OSDMap &osd_map) -> bool {
f67539c2 1606 return osd_map.is_blocklisted(session->info.inst.addr);
11fdf7f2 1607 });
f67539c2 1608 if (blocklisted)
11fdf7f2 1609 continue;
7c673cae 1610
11fdf7f2
TL
1611 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1612 << missing_features << "'";
f67539c2 1613 CachedStackStringStream css;
11fdf7f2 1614 mds->evict_client(session->get_client().v, false,
f67539c2 1615 g_conf()->mds_session_blocklist_on_evict, *css);
11fdf7f2
TL
1616 }
1617 }
1618 }
1619}
7c673cae
FG
1620
1621void Server::reconnect_gather_finish()
1622{
1623 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
11fdf7f2
TL
1624 ceph_assert(reconnect_done);
1625
1626 if (!mds->snapclient->is_synced()) {
1627 // make sure snaptable cache is populated. snaprealms will be
1628 // extensively used in rejoin stage.
1629 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1630 mds->snapclient->wait_for_sync(reconnect_done);
1631 } else {
1632 reconnect_done->complete(0);
1633 }
7c673cae
FG
1634 reconnect_done = NULL;
1635}
1636
1637void Server::reconnect_tick()
1638{
f67539c2 1639 bool reject_all_reconnect = false;
31f18b77 1640 if (reconnect_evicting) {
f64942e4 1641 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
31f18b77
FG
1642 return;
1643 }
1644
f67539c2
TL
1645 /*
1646   * Set mds_deny_all_reconnect to reject all reconnect requests, so that
1647   * less metadata is loaded in the rejoin phase. This shortens reboot time,
1648   * and loading less metadata increases the chance that a standby with less memory can fail over.
1649 
1650   * Why not shorten the reconnect period instead?
1651   * Clients may send unsafe or retried requests, which had not
1652   * completed before the old mds stopped, to the new mds. These requests may
1653   * need to be processed during the new mds's clientreplay phase,
1654   * see: https://github.com/ceph/ceph/pull/29059.
1655 */
1656 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
f64942e4
AA
1657 if (client_reconnect_gather.empty())
1658 return;
31f18b77 1659
f67539c2
TL
1660 if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
1661 reject_all_reconnect = true;
1662
f64942e4
AA
1663 auto now = clock::now();
1664 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
f67539c2 1665 if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
f64942e4 1666 return;
31f18b77 1667
f64942e4
AA
1668 vector<Session*> remaining_sessions;
1669 remaining_sessions.reserve(client_reconnect_gather.size());
1670 for (auto c : client_reconnect_gather) {
1671 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1672 ceph_assert(session);
1673 remaining_sessions.push_back(session);
1674 // client re-sends cap flush messages before the reconnect message
1675 if (session->last_seen > reconnect_last_seen)
1676 reconnect_last_seen = session->last_seen;
1677 }
31f18b77 1678
f64942e4 1679 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
f67539c2 1680 if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
f64942e4
AA
1681 dout(7) << "reconnect_tick: last seen " << elapse2
1682 << " seconds ago, extending reconnect interval" << dendl;
1683 return;
1684 }
1685
1686 dout(7) << "reconnect timed out, " << remaining_sessions.size()
f67539c2 1687 << " clients have not reconnected in time" << dendl;
f64942e4 1688
f67539c2 1689 // If we're doing blocklist evictions, use this to wait for them before
f64942e4
AA
1690 // proceeding to reconnect_gather_finish
1691 MDSGatherBuilder gather(g_ceph_context);
1692
1693 for (auto session : remaining_sessions) {
11fdf7f2
TL
1694     // Keep sessions that have specified a timeout. These sessions prevent
1695     // the mds from going active; it goes active only after they have all
1696     // been killed or reclaimed.
1697 if (session->info.client_metadata.find("timeout") !=
1698 session->info.client_metadata.end()) {
1699 dout(1) << "reconnect keeps " << session->info.inst
1700 << ", need to be reclaimed" << dendl;
1701 client_reclaim_gather.insert(session->get_client());
1702 continue;
1703 }
1704
f64942e4 1705 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
31f18b77 1706
f64942e4
AA
1707 mds->clog->warn() << "evicting unresponsive client " << *session
1708 << ", after waiting " << elapse1
1709 << " seconds during MDS startup";
1710
f67539c2
TL
1711 // make _session_logged() purge orphan objects of lost async/unsafe requests
1712 session->delegated_inos.swap(session->free_prealloc_inos);
1713
1714 if (g_conf()->mds_session_blocklist_on_timeout) {
1715 CachedStackStringStream css;
1716 mds->evict_client(session->get_client().v, false, true, *css,
f64942e4 1717 gather.new_sub());
31f18b77 1718 } else {
f67539c2 1719 kill_session(session, NULL);
31f18b77 1720 }
f64942e4
AA
1721
1722 failed_reconnects++;
1723 }
1724 client_reconnect_gather.clear();
f67539c2 1725 client_reconnect_denied.clear();
f64942e4
AA
1726
1727 if (gather.has_subs()) {
1728 dout(1) << "reconnect will complete once clients are evicted" << dendl;
9f95a23c 1729 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
f64942e4
AA
1730 [this](int r){reconnect_gather_finish();})));
1731 gather.activate();
1732 reconnect_evicting = true;
1733 } else {
1734 reconnect_gather_finish();
7c673cae
FG
1735 }
1736}
1737
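/*
 * Timing recap for the checks above, assuming the default
 * mds_reconnect_timeout of 45 seconds:
 *
 *   elapse1 = now - reconnect_start;      // whole reconnect phase
 *   elapse2 = now - reconnect_last_seen;  // quiet time since last client msg
 *
 * Eviction is deferred while elapse1 < 45s, or while a client was heard
 * from within the last 22.5s (elapse2 < 45s/2), unless every remaining
 * client has already been denied (reject_all_reconnect).
 */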
1738void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1739{
1740 if (!locks.length()) return;
1741 int numlocks;
1742 ceph_filelock lock;
11fdf7f2
TL
1743 auto p = locks.cbegin();
1744 decode(numlocks, p);
7c673cae 1745 for (int i = 0; i < numlocks; ++i) {
11fdf7f2 1746 decode(lock, p);
7c673cae
FG
1747 lock.client = client;
1748 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1749 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1750 }
11fdf7f2 1751 decode(numlocks, p);
7c673cae 1752 for (int i = 0; i < numlocks; ++i) {
11fdf7f2 1753 decode(lock, p);
7c673cae
FG
1754 lock.client = client;
1755 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1756 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1757 }
1758}
1759
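/*
 * The bufferlist decoded above is expected to carry two back-to-back
 * sections in this order:
 *
 *   int32 numlocks; ceph_filelock[numlocks]   // fcntl (POSIX) locks
 *   int32 numlocks; ceph_filelock[numlocks]   // flock (BSD) locks
 *
 * Each lock's client field is overwritten with the reconnecting client so
 * that held_locks and client_held_lock_counts rebuild consistently.
 */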
7c673cae
FG
1760/**
1761 * Call this when the MDCache is oversized, to send requests to the clients
1762 * to trim some caps, and consequently unpin some inodes in the MDCache so
1763 * that it can trim too.
1764 */
a8e16298
TL
1765std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1766{
1767 const auto now = clock::now();
92f5a8d4
TL
1768 const bool steady = !!(flags&RecallFlags::STEADY);
1769 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1770 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1771 const bool trim = !!(flags&RecallFlags::TRIM);
a8e16298 1772
11fdf7f2
TL
1773 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1774 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1775 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1776 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1777 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
92f5a8d4 1778 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
a8e16298
TL
1779
1780 dout(7) << __func__ << ":"
1781 << " min=" << min_caps_per_client
1782 << " max=" << max_caps_per_client
1783 << " total=" << Capability::count()
92f5a8d4 1784 << " flags=" << flags
a8e16298 1785 << dendl;
f64942e4 1786
a8e16298
TL
1787 /* trim caps of sessions with the most caps first */
1788 std::multimap<uint64_t, Session*> caps_session;
92f5a8d4 1789 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
a8e16298 1790 auto num_caps = s->caps.size();
92f5a8d4
TL
1791 auto cache_liveness = s->get_session_cache_liveness();
1792 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
a8e16298
TL
1793 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1794 }
1795 };
1796 mds->sessionmap.get_client_sessions(std::move(f));
1797
1798 std::pair<bool, uint64_t> result = {false, 0};
11fdf7f2 1799 auto& [throttled, caps_recalled] = result;
a8e16298 1800 last_recall_state = now;
11fdf7f2 1801 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
7c673cae 1802 if (!session->is_open() ||
11fdf7f2 1803 !session->get_connection() ||
7c673cae
FG
1804 !session->info.inst.name.is_client())
1805 continue;
1806
a8e16298
TL
1807 dout(10) << __func__ << ":"
1808 << " session " << session->info.inst
1809 << " caps " << num_caps
7c673cae
FG
1810 << ", leases " << session->leases.size()
1811 << dendl;
1812
a8e16298
TL
1813 uint64_t newlim;
1814 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1815 newlim = min_caps_per_client;
1816 } else {
1817 newlim = num_caps-recall_max_caps;
1818 }
1819 if (num_caps > newlim) {
1820 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1821 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1822 newlim = num_caps-recall;
1823 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
11fdf7f2
TL
1824 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1825 const uint64_t global_recall_throttle = recall_throttle.get();
a8e16298
TL
1826 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1827 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1828 throttled = true;
1829 continue;
11fdf7f2
TL
1830 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1831 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1832 throttled = true;
1833 continue;
a8e16298
TL
1834 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1835 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1836 throttled = true;
1837 break;
1838 }
1839
1840 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1841 if (steady) {
1842 const auto session_recall = session->get_recall_caps();
1843 const auto session_release = session->get_release_caps();
1844 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1845 /* The session has been unable to keep up with the number of caps
1846 * recalled (by half); additionally, to prevent marking sessions
1847 * we've just begun to recall from, the session_recall counter
1848          * (decayed count of caps recently recalled) is **greater** than half
1849          * the session threshold for the session's cap recall throttle.
1850 */
1851 dout(15) << " 2*session_release < session_recall"
11fdf7f2
TL
1852 " (2*" << session_release << " < " << session_recall << ") &&"
1853 		   " 2*session_recall > recall_max_decay_threshold"
1854 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
a8e16298
TL
1855 " Skipping because we are unlikely to get more released." << dendl;
1856 continue;
1857 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1858         /* The number of caps we are asking to be recalled is less than the
1859          * number we *could* recall (so there isn't much left to recall?) and
1860          * it is less than half the session's recall_caps counter (decayed
1861          * count of caps recently recalled).
1862 */
1863 dout(15) << " 2*recall < session_recall "
1864 " (2*" << recall << " < " << session_recall << ") &&"
1865 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1866 " Skipping because we are unlikely to get more released." << dendl;
1867 continue;
1868 }
1869 }
1870
1871 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1872
9f95a23c 1873 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
3efd9988
FG
1874 m->head.max_caps = newlim;
1875 mds->send_message_client(m, session);
a8e16298 1876 if (gather) {
f91f0fd5 1877 flush_session(session, *gather);
f64942e4 1878 }
a8e16298 1879 caps_recalled += session->notify_recall_sent(newlim);
11fdf7f2 1880 recall_throttle.hit(recall);
7c673cae
FG
1881 }
1882 }
a8e16298
TL
1883
1884 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1885
1886 return result;
7c673cae
FG
1887}
1888
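/*
 * Worked example of the per-session limit math above, assuming defaults of
 * mds_min_caps_per_client = 100 and mds_recall_max_caps = 30000:
 *
 *   num_caps = 50000: newlim = 50000 - 30000 = 20000, recall = 30000
 *   num_caps = 20000: 20000 - 30000 would dip below the minimum, so
 *                     newlim = 100 and recall = min(30000, 19900) = 19900
 *
 * The recall amount is then gated by three decay counters (the session
 * throttle, its second-order counter, and the global throttle) before any
 * CEPH_SESSION_RECALL_STATE message is sent.
 */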
1889void Server::force_clients_readonly()
1890{
1891 dout(10) << "force_clients_readonly" << dendl;
1892 set<Session*> sessions;
1893 mds->sessionmap.get_client_session_set(sessions);
1894 for (set<Session*>::const_iterator p = sessions.begin();
1895 p != sessions.end();
1896 ++p) {
1897 Session *session = *p;
1898 if (!session->info.inst.name.is_client() ||
1899 !(session->is_open() || session->is_stale()))
1900 continue;
9f95a23c 1901 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
7c673cae
FG
1902 }
1903}
1904
1905/*******
1906 * some generic stuff for finishing off requests
1907 */
1908void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1909{
1910 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
11fdf7f2 1911 ceph_assert(!mdr->has_completed);
7c673cae
FG
1912
1913 // note trace items for eventual reply.
1914 mdr->tracei = in;
1915 if (in)
1916 mdr->pin(in);
1917
1918 mdr->tracedn = dn;
1919 if (dn)
1920 mdr->pin(dn);
1921
1922 early_reply(mdr, in, dn);
1923
1924 mdr->committing = true;
1925 submit_mdlog_entry(le, fin, mdr, __func__);
1926
1927 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1928 if (mds->queue_one_replay()) {
1929 dout(10) << " queued next replay op" << dendl;
1930 } else {
11fdf7f2 1931 dout(10) << " journaled last replay op" << dendl;
7c673cae
FG
1932 }
1933 } else if (mdr->did_early_reply)
b32b8144 1934 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
7c673cae
FG
1935 else
1936 mdlog->flush();
1937}
1938
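/*
 * Ordering note: early_reply() may answer the client before the event is
 * durable (such replies are flagged unsafe), after which rdlocks are
 * dropped early so leases can still be issued; the safe reply is sent by
 * the journal-commit finisher. If no early reply went out, the log is
 * flushed here so the client is not left waiting on a lazy flush.
 */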
1939void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
11fdf7f2 1940 std::string_view event)
7c673cae
FG
1941{
1942 if (mdr) {
1943 string event_str("submit entry: ");
1944 event_str += event;
11fdf7f2 1945 mdr->mark_event(event_str);
7c673cae
FG
1946 }
1947 mdlog->submit_entry(le, fin);
1948}
1949
1950/*
1951 * send response built from mdr contents and error code; clean up mdr
1952 */
1953void Server::respond_to_request(MDRequestRef& mdr, int r)
1954{
1955 if (mdr->client_request) {
f91f0fd5
TL
1956 if (mdr->is_batch_head()) {
1957 dout(20) << __func__ << " batch head " << *mdr << dendl;
1958 mdr->release_batch_op()->respond(r);
9f95a23c
TL
1959 } else {
1960 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
1961 }
7c673cae
FG
1962 } else if (mdr->internal_op > -1) {
1963 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1964 if (!mdr->internal_op_finish)
11fdf7f2 1965 ceph_abort_msg("trying to respond to internal op without finisher");
7c673cae
FG
1966 mdr->internal_op_finish->complete(r);
1967 mdcache->request_finish(mdr);
1968 }
1969}
1970
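/*
 * Note: a batch head fans its single result out to every queued follower
 * via BatchOp::respond(), so lookups/getattrs coalesced on one target are
 * answered with a single traversal; internal ops instead complete their
 * registered finisher and are torn down through request_finish().
 */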
91327a77 1971// statistics mds req op number and latency
9f95a23c 1972void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
91327a77
AA
1973{
1974 int code = l_mdss_first;
1975   switch (req->get_op()) {
1976 case CEPH_MDS_OP_LOOKUPHASH:
1977 code = l_mdss_req_lookuphash_latency;
1978 break;
1979 case CEPH_MDS_OP_LOOKUPINO:
1980 code = l_mdss_req_lookupino_latency;
1981 break;
1982 case CEPH_MDS_OP_LOOKUPPARENT:
1983 code = l_mdss_req_lookupparent_latency;
1984 break;
1985 case CEPH_MDS_OP_LOOKUPNAME:
1986 code = l_mdss_req_lookupname_latency;
1987 break;
1988 case CEPH_MDS_OP_LOOKUP:
1989 code = l_mdss_req_lookup_latency;
1990 break;
1991 case CEPH_MDS_OP_LOOKUPSNAP:
1992 code = l_mdss_req_lookupsnap_latency;
1993 break;
1994 case CEPH_MDS_OP_GETATTR:
1995 code = l_mdss_req_getattr_latency;
1996 break;
1997 case CEPH_MDS_OP_SETATTR:
1998 code = l_mdss_req_setattr_latency;
1999 break;
2000 case CEPH_MDS_OP_SETLAYOUT:
2001 code = l_mdss_req_setlayout_latency;
2002 break;
2003 case CEPH_MDS_OP_SETDIRLAYOUT:
2004 code = l_mdss_req_setdirlayout_latency;
2005 break;
1d09f67e
TL
2006 case CEPH_MDS_OP_GETVXATTR:
2007 code = l_mdss_req_getvxattr_latency;
2008 break;
91327a77
AA
2009 case CEPH_MDS_OP_SETXATTR:
2010 code = l_mdss_req_setxattr_latency;
2011 break;
2012 case CEPH_MDS_OP_RMXATTR:
2013 code = l_mdss_req_rmxattr_latency;
2014 break;
2015 case CEPH_MDS_OP_READDIR:
2016 code = l_mdss_req_readdir_latency;
2017 break;
2018 case CEPH_MDS_OP_SETFILELOCK:
2019 code = l_mdss_req_setfilelock_latency;
2020 break;
2021 case CEPH_MDS_OP_GETFILELOCK:
2022 code = l_mdss_req_getfilelock_latency;
2023 break;
2024 case CEPH_MDS_OP_CREATE:
2025 code = l_mdss_req_create_latency;
2026 break;
2027 case CEPH_MDS_OP_OPEN:
2028 code = l_mdss_req_open_latency;
2029 break;
2030 case CEPH_MDS_OP_MKNOD:
2031 code = l_mdss_req_mknod_latency;
2032 break;
2033 case CEPH_MDS_OP_LINK:
2034 code = l_mdss_req_link_latency;
2035 break;
2036 case CEPH_MDS_OP_UNLINK:
2037 code = l_mdss_req_unlink_latency;
2038 break;
2039 case CEPH_MDS_OP_RMDIR:
2040 code = l_mdss_req_rmdir_latency;
2041 break;
2042 case CEPH_MDS_OP_RENAME:
2043 code = l_mdss_req_rename_latency;
2044 break;
2045 case CEPH_MDS_OP_MKDIR:
2046 code = l_mdss_req_mkdir_latency;
2047 break;
2048 case CEPH_MDS_OP_SYMLINK:
2049 code = l_mdss_req_symlink_latency;
2050 break;
2051 case CEPH_MDS_OP_LSSNAP:
2052 code = l_mdss_req_lssnap_latency;
2053 break;
2054 case CEPH_MDS_OP_MKSNAP:
2055 code = l_mdss_req_mksnap_latency;
2056 break;
2057 case CEPH_MDS_OP_RMSNAP:
2058 code = l_mdss_req_rmsnap_latency;
2059 break;
2060 case CEPH_MDS_OP_RENAMESNAP:
2061 code = l_mdss_req_renamesnap_latency;
2062 break;
20effc67
TL
2063 default:
2064     dout(1) << __func__ << ": unknown client op" << dendl;
2065 return;
91327a77
AA
2066 }
2067 logger->tinc(code, lat);
2068}
2069
7c673cae
FG
2070void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
2071{
11fdf7f2 2072 if (!g_conf()->mds_early_reply)
7c673cae
FG
2073 return;
2074
b32b8144
FG
2075 if (mdr->no_early_reply) {
2076 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
2077 return;
2078 }
2079
f67539c2
TL
2080 if (mdr->has_more() && mdr->more()->has_journaled_peers) {
2081 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
7c673cae
FG
2082 return;
2083 }
2084
2085 if (mdr->alloc_ino) {
2086 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2087 return;
2088 }
2089
9f95a23c 2090 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
2091 entity_inst_t client_inst = req->get_source_inst();
2092 if (client_inst.name.is_mds())
2093 return;
2094
2095 if (req->is_replay()) {
2096 dout(10) << " no early reply on replay op" << dendl;
2097 return;
2098 }
2099
2100
9f95a23c 2101 auto reply = make_message<MClientReply>(*req, 0);
7c673cae
FG
2102 reply->set_unsafe();
2103
2104 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2105 //
2106 //_rename_finish() does not send dentry link/unlink message to replicas.
2107 // so do not set xlocks on dentries "done", the xlocks prevent dentries
2108 // that have projected linkages from getting new replica.
2109 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2110
2111 dout(10) << "early_reply " << reply->get_result()
2112 << " (" << cpp_strerror(reply->get_result())
2113 << ") " << *req << dendl;
2114
2115 if (tracei || tracedn) {
2116 if (tracei)
2117 mdr->cap_releases.erase(tracei->vino());
2118 if (tracedn)
2119 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2120
9f95a23c 2121 set_trace_dist(reply, tracei, tracedn, mdr);
7c673cae
FG
2122 }
2123
2124 reply->set_extra_bl(mdr->reply_extra_bl);
11fdf7f2 2125 mds->send_message_client(reply, mdr->session);
7c673cae
FG
2126
2127 mdr->did_early_reply = true;
2128
2129 mds->logger->inc(l_mds_reply);
2130 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2131 mds->logger->tinc(l_mds_reply_latency, lat);
33c7a0ef
TL
2132 if (lat >= g_conf()->mds_op_complaint_time) {
2133 mds->logger->inc(l_mds_slow_reply);
2134 }
91327a77
AA
2135 if (client_inst.name.is_client()) {
2136 mds->sessionmap.hit_session(mdr->session);
2137 }
2138 perf_gather_op_latency(req, lat);
7c673cae
FG
2139 dout(20) << "lat " << lat << dendl;
2140
2141 mdr->mark_event("early_replied");
2142}
2143
2144/*
2145  * send the given reply
2146  * include a trace to tracei
2147  * clean up the mdr
2148 */
9f95a23c 2149void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
7c673cae 2150{
11fdf7f2 2151 ceph_assert(mdr.get());
9f95a23c 2152 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
2153
2154 dout(7) << "reply_client_request " << reply->get_result()
2155 << " (" << cpp_strerror(reply->get_result())
2156 << ") " << *req << dendl;
2157
2158 mdr->mark_event("replying");
2159
2160 Session *session = mdr->session;
2161
2162 // note successful request in session map?
2163 //
2164   // setfilelock requests are special: they only modify state in MDS memory,
2165   // and that state is lost when the MDS fails. If a client re-sends a completed
2166   // setfilelock request, it means the client did not receive the corresponding
2167   // setfilelock reply, so the MDS should re-execute the request.
2168 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2169 reply->get_result() == 0 && session) {
2170 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2171 session->add_completed_request(mdr->reqid.tid, created);
2172 if (mdr->ls) {
2173 mdr->ls->touched_sessions.insert(session->info.inst.name);
2174 }
2175 }
2176
2177 // give any preallocated inos to the session
2178 apply_allocated_inos(mdr, session);
2179
2180 // get tracei/tracedn from mdr?
7c673cae
FG
2181 CInode *tracei = mdr->tracei;
2182 CDentry *tracedn = mdr->tracedn;
2183
2184 bool is_replay = mdr->client_request->is_replay();
2185 bool did_early_reply = mdr->did_early_reply;
2186 entity_inst_t client_inst = req->get_source_inst();
7c673cae
FG
2187
2188 if (!did_early_reply && !is_replay) {
2189
2190 mds->logger->inc(l_mds_reply);
2191 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2192 mds->logger->tinc(l_mds_reply_latency, lat);
33c7a0ef
TL
2193 if (lat >= g_conf()->mds_op_complaint_time) {
2194 mds->logger->inc(l_mds_slow_reply);
2195 }
81eedcae 2196 if (session && client_inst.name.is_client()) {
91327a77
AA
2197 mds->sessionmap.hit_session(session);
2198 }
2199 perf_gather_op_latency(req, lat);
7c673cae
FG
2200 dout(20) << "lat " << lat << dendl;
2201
2202 if (tracei)
2203 mdr->cap_releases.erase(tracei->vino());
2204 if (tracedn)
2205 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2206 }
2207
2208 // drop non-rdlocks before replying, so that we can issue leases
2209 mdcache->request_drop_non_rdlocks(mdr);
2210
2211 // reply at all?
81eedcae 2212 if (session && !client_inst.name.is_mds()) {
7c673cae
FG
2213 // send reply.
2214 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2215 (tracei || tracedn)) {
2216 if (is_replay) {
2217 if (tracei)
2218 mdcache->try_reconnect_cap(tracei, session);
2219 } else {
2220 // include metadata in reply
9f95a23c 2221 set_trace_dist(reply, tracei, tracedn, mdr);
7c673cae
FG
2222 }
2223 }
2224
2225 // We can set the extra bl unconditionally: if it's already been sent in the
2226 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2227 reply->set_extra_bl(mdr->reply_extra_bl);
2228
2229 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
11fdf7f2 2230 mds->send_message_client(reply, session);
7c673cae
FG
2231 }
2232
2233 if (req->is_queued_for_replay() &&
2234 (mdr->has_completed || reply->get_result() < 0)) {
2235 if (reply->get_result() < 0) {
2236 int r = reply->get_result();
2237 derr << "reply_client_request: failed to replay " << *req
2238 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2239 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2240 }
2241 mds->queue_one_replay();
2242 }
2243
2244 // clean up request
2245 mdcache->request_finish(mdr);
2246
2247 // take a closer look at tracei, if it happens to be a remote link
2248 if (tracei &&
2249 tracedn &&
2250 tracedn->get_projected_linkage()->is_remote()) {
2251 mdcache->eval_remote(tracedn);
2252 }
2253}
2254
7c673cae
FG
2255/*
2256 * pass inode OR dentry (not both, or we may get confused)
2257 *
2258 * trace is in reverse order (i.e. root inode comes last)
2259 */
9f95a23c 2260void Server::set_trace_dist(const ref_t<MClientReply> &reply,
7c673cae 2261 CInode *in, CDentry *dn,
7c673cae
FG
2262 MDRequestRef& mdr)
2263{
2264 // skip doing this for debugging purposes?
11fdf7f2 2265 if (g_conf()->mds_inject_traceless_reply_probability &&
7c673cae 2266 mdr->ls && !mdr->o_trunc &&
11fdf7f2 2267 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
7c673cae
FG
2268 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2269 return;
2270 }
2271
2272 // inode, dentry, dir, ..., inode
2273 bufferlist bl;
2274 mds_rank_t whoami = mds->get_nodeid();
9f95a23c
TL
2275 Session *session = mdr->session;
2276 snapid_t snapid = mdr->snapid;
7c673cae
FG
2277 utime_t now = ceph_clock_now();
2278
2279 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2280
7c673cae
FG
2281 // realm
2282 if (snapid == CEPH_NOSNAP) {
2283 SnapRealm *realm;
2284 if (in)
2285 realm = in->find_snaprealm();
2286 else
2287 realm = dn->get_dir()->get_inode()->find_snaprealm();
2288 reply->snapbl = realm->get_snap_trace();
2289 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2290 }
2291
2292 // dir + dentry?
2293 if (dn) {
2294 reply->head.is_dentry = 1;
2295 CDir *dir = dn->get_dir();
2296 CInode *diri = dir->get_inode();
2297
2298 diri->encode_inodestat(bl, session, NULL, snapid);
2299 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2300
2301#ifdef MDS_VERIFY_FRAGSTAT
2302 if (dir->is_complete())
2303 dir->verify_fragstat();
2304#endif
11fdf7f2
TL
2305 DirStat ds;
2306 ds.frag = dir->get_frag();
2307 ds.auth = dir->get_dir_auth().first;
f91f0fd5 2308 if (dir->is_auth() && !forward_all_requests_to_auth)
11fdf7f2
TL
2309 dir->get_dist_spec(ds.dist, whoami);
2310
2311 dir->encode_dirstat(bl, session->info, ds);
7c673cae
FG
2312 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2313
11fdf7f2 2314 encode(dn->get_name(), bl);
2a845540 2315 mds->locker->issue_client_lease(dn, in, mdr, now, bl);
7c673cae
FG
2316 } else
2317 reply->head.is_dentry = 0;
2318
2319 // inode
2320 if (in) {
2321 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2322 dout(20) << "set_trace_dist added in " << *in << dendl;
2323 reply->head.is_target = 1;
2324 } else
2325 reply->head.is_target = 0;
2326
2327 reply->set_trace(bl);
2328}
2329
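/*
 * Encoded layout of the trace bufferlist for a dentry reply:
 *
 *   [diri inodestat][dirstat][dname][dentry lease][target inodestat]
 *
 * The is_dentry/is_target flags in the reply head tell the client which
 * pieces are present; the snap trace travels separately in snapbl for
 * CEPH_NOSNAP requests.
 */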
9f95a23c 2330void Server::handle_client_request(const cref_t<MClientRequest> &req)
7c673cae
FG
2331{
2332 dout(4) << "handle_client_request " << *req << dendl;
2333
2334 if (mds->logger)
2335 mds->logger->inc(l_mds_request);
2336 if (logger)
2337 logger->inc(l_mdss_handle_client_request);
2338
2339 if (!mdcache->is_open()) {
2340 dout(5) << "waiting for root" << dendl;
2341 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2342 return;
2343 }
2344
92f5a8d4 2345 bool sessionclosed_isok = replay_unsafe_with_closed_session;
7c673cae
FG
2346 // active session?
2347 Session *session = 0;
2348 if (req->get_source().is_client()) {
94b18763 2349 session = mds->get_session(req);
7c673cae
FG
2350 if (!session) {
2351 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
92f5a8d4 2352 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
7c673cae
FG
2353 session->is_closing() ||
2354 session->is_killing()) {
2355 dout(5) << "session closed|closing|killing, dropping" << dendl;
2356 session = NULL;
2357 }
2358 if (!session) {
2359 if (req->is_queued_for_replay())
2360 mds->queue_one_replay();
7c673cae
FG
2361 return;
2362 }
2363 }
2364
2365 // old mdsmap?
2366 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2367 // send it? hrm, this isn't ideal; they may get a lot of copies if
2368 // they have a high request rate.
2369 }
2370
2371 // completed request?
2372 bool has_completed = false;
2373 if (req->is_replay() || req->get_retry_attempt()) {
11fdf7f2 2374 ceph_assert(session);
7c673cae
FG
2375 inodeno_t created;
2376 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2377 has_completed = true;
92f5a8d4
TL
2378 if (!session->is_open())
2379 return;
7c673cae
FG
2380       // Don't send a traceless reply if the completed request created a
2381       // new inode. Treat the request as a lookup request instead.
2382 if (req->is_replay() ||
2383 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2384 req->get_op() != CEPH_MDS_OP_OPEN &&
2385 req->get_op() != CEPH_MDS_OP_CREATE)) {
2386 dout(5) << "already completed " << req->get_reqid() << dendl;
9f95a23c 2387 auto reply = make_message<MClientReply>(*req, 0);
7c673cae
FG
2388 if (created != inodeno_t()) {
2389 bufferlist extra;
11fdf7f2 2390 encode(created, extra);
7c673cae
FG
2391 reply->set_extra_bl(extra);
2392 }
11fdf7f2 2393 mds->send_message_client(reply, session);
7c673cae
FG
2394
2395 if (req->is_queued_for_replay())
2396 mds->queue_one_replay();
2397
7c673cae
FG
2398 return;
2399 }
2400 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2401 req->get_op() != CEPH_MDS_OP_CREATE) {
2402 dout(10) << " completed request which created new inode " << created
2403 << ", convert it to lookup request" << dendl;
2404 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2405 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2406 }
2407 }
2408 }
2409
2410 // trim completed_request list
2411 if (req->get_oldest_client_tid() > 0) {
2412 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
11fdf7f2 2413 ceph_assert(session);
7c673cae
FG
2414 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2415       // Session's 'completed_requests' was dirtied; mark it to be
2416       // potentially flushed at segment expiry.
2417 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2418
2419 if (session->get_num_trim_requests_warnings() > 0 &&
11fdf7f2 2420 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
7c673cae
FG
2421 session->reset_num_trim_requests_warnings();
2422 } else {
2423 if (session->get_num_completed_requests() >=
11fdf7f2 2424 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
7c673cae 2425 session->inc_num_trim_requests_warnings();
f67539c2
TL
2426 CachedStackStringStream css;
2427 *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
7c673cae
FG
2428 << req->get_oldest_client_tid() << "), "
2429 << session->get_num_completed_requests()
2430 << " completed requests recorded in session\n";
f67539c2
TL
2431 mds->clog->warn() << css->strv();
2432 dout(20) << __func__ << " " << css->strv() << dendl;
7c673cae
FG
2433 }
2434 }
2435 }
2436
2437 // register + dispatch
2438 MDRequestRef mdr = mdcache->request_start(req);
2439 if (!mdr.get())
2440 return;
2441
2442 if (session) {
2443 mdr->session = session;
2444 session->requests.push_back(&mdr->item_session_request);
2445 }
2446
2447 if (has_completed)
2448 mdr->has_completed = true;
2449
2450 // process embedded cap releases?
2451 // (only if NOT replay!)
2452 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2453 client_t client = req->get_source().num();
11fdf7f2
TL
2454 for (const auto &r : req->releases) {
2455 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2456 }
7c673cae
FG
2457 req->releases.clear();
2458 }
2459
2460 dispatch_client_request(mdr);
2461 return;
2462}
2463
2464void Server::handle_osd_map()
2465{
2466 /* Note that we check the OSDMAP_FULL flag directly rather than
2467 * using osdmap_full_flag(), because we want to know "is the flag set"
2468 * rather than "does the flag apply to us?" */
2469 mds->objecter->with_osdmap([this](const OSDMap& o) {
b3b6e05e 2470 auto pi = o.get_pg_pool(mds->get_metadata_pool());
b32b8144 2471 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
7c673cae
FG
2472 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2473 << o.get_epoch() << dendl;
2474 });
2475}
2476
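/*
 * with_osdmap() runs the lambda against the Objecter's current map and
 * returns its result, so one-shot reads elsewhere in this file follow the
 * same shape; a minimal sketch (addr being some entity_addr_t in scope):
 *
 *   bool blocklisted = mds->objecter->with_osdmap(
 *     [&addr](const OSDMap& o) { return o.is_blocklisted(addr); });
 */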
2477void Server::dispatch_client_request(MDRequestRef& mdr)
2478{
2479 // we shouldn't be waiting on anyone.
f67539c2 2480 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());
7c673cae
FG
2481
2482 if (mdr->killed) {
2483 dout(10) << "request " << *mdr << " was killed" << dendl;
9f95a23c
TL
2484 //if the mdr is a "batch_op" and it has followers, pick a follower as
2485 //the new "head of the batch ops" and go on processing the new one.
f91f0fd5
TL
2486 if (mdr->is_batch_head()) {
2487 int mask = mdr->client_request->head.args.getattr.mask;
2488 auto it = mdr->batch_op_map->find(mask);
2489 auto new_batch_head = it->second->find_new_head();
2490 if (!new_batch_head) {
2491 mdr->batch_op_map->erase(it);
9f95a23c
TL
2492 return;
2493 }
f91f0fd5 2494 mdr = std::move(new_batch_head);
9f95a23c
TL
2495 } else {
2496 return;
2497 }
94b18763
FG
2498 } else if (mdr->aborted) {
2499 mdr->aborted = false;
2500 mdcache->request_kill(mdr);
2501 return;
7c673cae
FG
2502 }
2503
9f95a23c 2504 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
2505
2506 if (logger) logger->inc(l_mdss_dispatch_client_request);
2507
2508 dout(7) << "dispatch_client_request " << *req << dendl;
2509
9f95a23c
TL
2510 if (req->may_write() && mdcache->is_readonly()) {
2511 dout(10) << " read-only FS" << dendl;
f67539c2 2512 respond_to_request(mdr, -CEPHFS_EROFS);
9f95a23c
TL
2513 return;
2514 }
f67539c2
TL
2515 if (mdr->has_more() && mdr->more()->peer_error) {
2516 dout(10) << " got error from peers" << dendl;
2517 respond_to_request(mdr, mdr->more()->peer_error);
9f95a23c 2518 return;
7c673cae
FG
2519 }
2520
2521 if (is_full) {
b3b6e05e
TL
2522 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
2523 if (!cur) {
20effc67 2524 // the request is already responded to
b3b6e05e
TL
2525 return;
2526 }
7c673cae
FG
2527 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2528 	req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2530 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2531 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2532 req->get_op() == CEPH_MDS_OP_CREATE ||
2533 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2534 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2535 ((req->get_op() == CEPH_MDS_OP_LINK ||
2536 req->get_op() == CEPH_MDS_OP_RENAME) &&
f67539c2 2537 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
7c673cae
FG
2538 ) {
2539
b3b6e05e
TL
2540 if (check_access(mdr, cur, MAY_FULL)) {
2541 dout(20) << __func__ << ": full, has FULL caps, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2542 } else {
2543 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2544 respond_to_request(mdr, -CEPHFS_ENOSPC);
2545 return;
2546 }
7c673cae
FG
2547 } else {
2548 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2549 }
2550 }
2551
2552 switch (req->get_op()) {
2553 case CEPH_MDS_OP_LOOKUPHASH:
2554 case CEPH_MDS_OP_LOOKUPINO:
2555 handle_client_lookup_ino(mdr, false, false);
2556 break;
2557 case CEPH_MDS_OP_LOOKUPPARENT:
2558 handle_client_lookup_ino(mdr, true, false);
2559 break;
2560 case CEPH_MDS_OP_LOOKUPNAME:
2561 handle_client_lookup_ino(mdr, false, true);
2562 break;
2563
2564 // inodes ops.
2565 case CEPH_MDS_OP_LOOKUP:
2566 handle_client_getattr(mdr, true);
2567 break;
2568
2569 case CEPH_MDS_OP_LOOKUPSNAP:
2570 // lookupsnap does not reference a CDentry; treat it as a getattr
2571 case CEPH_MDS_OP_GETATTR:
2572 handle_client_getattr(mdr, false);
2573 break;
1d09f67e
TL
2574 case CEPH_MDS_OP_GETVXATTR:
2575 handle_client_getvxattr(mdr);
2576 break;
7c673cae
FG
2577
2578 case CEPH_MDS_OP_SETATTR:
2579 handle_client_setattr(mdr);
2580 break;
2581 case CEPH_MDS_OP_SETLAYOUT:
2582 handle_client_setlayout(mdr);
2583 break;
2584 case CEPH_MDS_OP_SETDIRLAYOUT:
2585 handle_client_setdirlayout(mdr);
2586 break;
2587 case CEPH_MDS_OP_SETXATTR:
2588 handle_client_setxattr(mdr);
2589 break;
2590 case CEPH_MDS_OP_RMXATTR:
2591 handle_client_removexattr(mdr);
2592 break;
2593
2594 case CEPH_MDS_OP_READDIR:
2595 handle_client_readdir(mdr);
2596 break;
2597
2598 case CEPH_MDS_OP_SETFILELOCK:
2599 handle_client_file_setlock(mdr);
2600 break;
2601
2602 case CEPH_MDS_OP_GETFILELOCK:
2603 handle_client_file_readlock(mdr);
2604 break;
2605
2606 // funky.
2607 case CEPH_MDS_OP_CREATE:
2608 if (mdr->has_completed)
2609 handle_client_open(mdr); // already created.. just open
2610 else
2611 handle_client_openc(mdr);
2612 break;
2613
2614 case CEPH_MDS_OP_OPEN:
2615 handle_client_open(mdr);
2616 break;
2617
2618 // namespace.
2619 // no prior locks.
2620 case CEPH_MDS_OP_MKNOD:
2621 handle_client_mknod(mdr);
2622 break;
2623 case CEPH_MDS_OP_LINK:
2624 handle_client_link(mdr);
2625 break;
2626 case CEPH_MDS_OP_UNLINK:
2627 case CEPH_MDS_OP_RMDIR:
2628 handle_client_unlink(mdr);
2629 break;
2630 case CEPH_MDS_OP_RENAME:
2631 handle_client_rename(mdr);
2632 break;
2633 case CEPH_MDS_OP_MKDIR:
2634 handle_client_mkdir(mdr);
2635 break;
2636 case CEPH_MDS_OP_SYMLINK:
2637 handle_client_symlink(mdr);
2638 break;
2639
2640
2641 // snaps
2642 case CEPH_MDS_OP_LSSNAP:
2643 handle_client_lssnap(mdr);
2644 break;
2645 case CEPH_MDS_OP_MKSNAP:
2646 handle_client_mksnap(mdr);
2647 break;
2648 case CEPH_MDS_OP_RMSNAP:
2649 handle_client_rmsnap(mdr);
2650 break;
2651 case CEPH_MDS_OP_RENAMESNAP:
2652 handle_client_renamesnap(mdr);
2653 break;
2654
2655 default:
2656 dout(1) << " unknown client op " << req->get_op() << dendl;
f67539c2 2657 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
7c673cae
FG
2658 }
2659}
2660
2661
2662// ---------------------------------------
f67539c2 2663// PEER REQUESTS
7c673cae 2664
f67539c2 2665void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
7c673cae 2666{
f67539c2 2667 dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
7c673cae
FG
2668 mds_rank_t from = mds_rank_t(m->get_source().num());
2669
f67539c2 2670 if (logger) logger->inc(l_mdss_handle_peer_request);
7c673cae
FG
2671
2672 // reply?
2673 if (m->is_reply())
f67539c2 2674 return handle_peer_request_reply(m);
7c673cae
FG
2675
2676   // the purpose of rename notify is to enforce causal message ordering, i.e. to make sure
2677   // bystanders have received all messages from the rename srcdn's auth MDS.
f67539c2
TL
2678 if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
2679 auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
7c673cae 2680 mds->send_message(reply, m->get_connection());
7c673cae
FG
2681 return;
2682 }
2683
2684 CDentry *straydn = NULL;
11fdf7f2 2685 if (m->straybl.length() > 0) {
33c7a0ef 2686 mdcache->decode_replica_stray(straydn, nullptr, m->straybl, from);
11fdf7f2
TL
2687 ceph_assert(straydn);
2688 m->straybl.clear();
7c673cae
FG
2689 }
2690
9f95a23c
TL
2691 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2692 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2693 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2694 return;
2695 }
2696
f67539c2 2697 // am i a new peer?
7c673cae
FG
2698 MDRequestRef mdr;
2699 if (mdcache->have_request(m->get_reqid())) {
2700 // existing?
2701 mdr = mdcache->request_get(m->get_reqid());
2702
2703 // is my request newer?
2704 if (mdr->attempt > m->get_attempt()) {
2705 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2706 << ", dropping " << *m << dendl;
7c673cae
FG
2707 return;
2708 }
2709
7c673cae
FG
2710 if (mdr->attempt < m->get_attempt()) {
2711 // mine is old, close it out
2712 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2713 << ", closing out" << dendl;
2714 mdcache->request_finish(mdr);
2715 mdr.reset();
f67539c2
TL
2716 } else if (mdr->peer_to_mds != from) {
2717 dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
7c673cae
FG
2718 return;
2719 }
2720
f67539c2
TL
2721 // may get these while mdr->peer_request is non-null
2722 if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
9f95a23c
TL
2723 mds->locker->drop_locks(mdr.get());
2724 return;
2725 }
f67539c2 2726 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
9f95a23c
TL
2727 if (m->is_abort()) {
2728 mdr->aborted = true;
f67539c2 2729 if (mdr->peer_request) {
9f95a23c 2730 // only abort on-going xlock, wrlock and auth pin
f67539c2 2731 ceph_assert(!mdr->peer_did_prepare());
9f95a23c
TL
2732 } else {
2733 mdcache->request_finish(mdr);
2734 }
7c673cae 2735 } else {
9f95a23c
TL
2736 if (m->inode_export.length() > 0)
2737 mdr->more()->inode_import = m->inode_export;
2738 // finish off request.
7c673cae
FG
2739 mdcache->request_finish(mdr);
2740 }
2741 return;
2742 }
2743 }
2744 if (!mdr.get()) {
2745 // new?
f67539c2
TL
2746 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2747 dout(10) << "missing peer request for " << m->get_reqid()
7c673cae 2748 << " OP_FINISH, must have lost race with a forward" << dendl;
7c673cae
FG
2749 return;
2750 }
f67539c2 2751 mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
7c673cae
FG
2752 mdr->set_op_stamp(m->op_stamp);
2753 }
f67539c2 2754 ceph_assert(mdr->peer_request == 0); // only one at a time, please!
7c673cae
FG
2755
2756 if (straydn) {
2757 mdr->pin(straydn);
2758 mdr->straydn = straydn;
2759 }
2760
9f95a23c
TL
2761 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2762 mdr->locks.empty()) {
7c673cae
FG
2763 dout(3) << "not active yet, waiting" << dendl;
2764 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2765 return;
2766 }
2767
f67539c2 2768 mdr->reset_peer_request(m);
7c673cae 2769
f67539c2 2770 dispatch_peer_request(mdr);
7c673cae
FG
2771}
2772
f67539c2 2773void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
7c673cae
FG
2774{
2775 mds_rank_t from = mds_rank_t(m->get_source().num());
2776
2777 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2778 metareqid_t r = m->get_reqid();
f67539c2
TL
2779 if (!mdcache->have_uncommitted_leader(r, from)) {
2780 dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
7c673cae 2781 << from << " reqid " << r << dendl;
7c673cae
FG
2782 return;
2783 }
2784 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2785 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2786 return;
2787 }
2788
f67539c2 2789 if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
7c673cae 2790 metareqid_t r = m->get_reqid();
f67539c2 2791 mdcache->committed_leader_peer(r, from);
7c673cae
FG
2792 return;
2793 }
2794
2795 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2796 if (m->get_attempt() != mdr->attempt) {
f67539c2 2797 dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
7c673cae 2798 << m->get_attempt() << dendl;
7c673cae
FG
2799 return;
2800 }
2801
2802 switch (m->get_op()) {
f67539c2 2803 case MMDSPeerRequest::OP_XLOCKACK:
7c673cae 2804 {
f67539c2 2805 // identify lock, leader request
7c673cae
FG
2806 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2807 m->get_object_info());
f67539c2 2808 mdr->more()->peers.insert(from);
11fdf7f2 2809 lock->decode_locked_state(m->get_lock_data());
7c673cae 2810 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
9f95a23c 2811 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
2812 mdr->finish_locking(lock);
2813 lock->get_xlock(mdr, mdr->get_client());
2814
f67539c2
TL
2815 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2816 mdr->more()->waiting_on_peer.erase(from);
2817 ceph_assert(mdr->more()->waiting_on_peer.empty());
7c673cae
FG
2818 mdcache->dispatch_request(mdr);
2819 }
2820 break;
2821
f67539c2 2822 case MMDSPeerRequest::OP_WRLOCKACK:
7c673cae 2823 {
f67539c2 2824 // identify lock, leader request
7c673cae
FG
2825 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2826 m->get_object_info());
f67539c2 2827 mdr->more()->peers.insert(from);
7c673cae 2828 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
9f95a23c 2829 auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
11fdf7f2
TL
2830 ceph_assert(it->is_remote_wrlock());
2831 ceph_assert(it->wrlock_target == from);
2832
7c673cae
FG
2833 mdr->finish_locking(lock);
2834
f67539c2
TL
2835 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2836 mdr->more()->waiting_on_peer.erase(from);
2837 ceph_assert(mdr->more()->waiting_on_peer.empty());
7c673cae
FG
2838 mdcache->dispatch_request(mdr);
2839 }
2840 break;
2841
f67539c2
TL
2842 case MMDSPeerRequest::OP_AUTHPINACK:
2843 handle_peer_auth_pin_ack(mdr, m);
7c673cae
FG
2844 break;
2845
f67539c2
TL
2846 case MMDSPeerRequest::OP_LINKPREPACK:
2847 handle_peer_link_prep_ack(mdr, m);
7c673cae
FG
2848 break;
2849
f67539c2
TL
2850 case MMDSPeerRequest::OP_RMDIRPREPACK:
2851 handle_peer_rmdir_prep_ack(mdr, m);
7c673cae
FG
2852 break;
2853
f67539c2
TL
2854 case MMDSPeerRequest::OP_RENAMEPREPACK:
2855 handle_peer_rename_prep_ack(mdr, m);
7c673cae
FG
2856 break;
2857
f67539c2
TL
2858 case MMDSPeerRequest::OP_RENAMENOTIFYACK:
2859 handle_peer_rename_notify_ack(mdr, m);
7c673cae
FG
2860 break;
2861
2862 default:
2863 ceph_abort();
2864 }
7c673cae
FG
2865}
2866
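/*
 * Lock-ack flow recap for the handler above: on XLOCKACK/WRLOCKACK the
 * leader records the peer in more()->peers, mirrors the granted lock into
 * its own lock list (XLOCK, or REMOTE_WRLOCK with the peer as
 * wrlock_target), erases the peer from waiting_on_peer, and re-dispatches
 * the request; each ack is expected to clear the last outstanding peer,
 * hence the empty() assertions.
 */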
f67539c2 2867void Server::dispatch_peer_request(MDRequestRef& mdr)
7c673cae 2868{
f67539c2 2869 dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;
7c673cae
FG
2870
2871 if (mdr->aborted) {
2872 dout(7) << " abort flag set, finishing" << dendl;
2873 mdcache->request_finish(mdr);
2874 return;
2875 }
2876
f67539c2 2877 if (logger) logger->inc(l_mdss_dispatch_peer_request);
7c673cae 2878
f67539c2 2879 int op = mdr->peer_request->get_op();
7c673cae 2880 switch (op) {
f67539c2
TL
2881 case MMDSPeerRequest::OP_XLOCK:
2882 case MMDSPeerRequest::OP_WRLOCK:
7c673cae
FG
2883 {
2884 // identify object
f67539c2
TL
2885 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2886 mdr->peer_request->get_object_info());
7c673cae
FG
2887
2888 if (!lock) {
2889 dout(10) << "don't have object, dropping" << dendl;
2890 	ceph_abort(); // can this happen if we auth pinned properly?
2891 }
f67539c2 2892 if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
7c673cae
FG
2893 dout(10) << "not auth for remote xlock attempt, dropping on "
2894 << *lock << " on " << *lock->get_parent() << dendl;
2895 } else {
2896 // use acquire_locks so that we get auth_pinning.
11fdf7f2
TL
2897 MutationImpl::LockOpVec lov;
2898 for (const auto& p : mdr->locks) {
2899 if (p.is_xlock())
2900 lov.add_xlock(p.lock);
2901 else if (p.is_wrlock())
2902 lov.add_wrlock(p.lock);
2903 }
7c673cae
FG
2904
2905 int replycode = 0;
2906 switch (op) {
f67539c2 2907 case MMDSPeerRequest::OP_XLOCK:
11fdf7f2 2908 lov.add_xlock(lock);
f67539c2 2909 replycode = MMDSPeerRequest::OP_XLOCKACK;
7c673cae 2910 break;
f67539c2 2911 case MMDSPeerRequest::OP_WRLOCK:
11fdf7f2 2912 lov.add_wrlock(lock);
f67539c2 2913 replycode = MMDSPeerRequest::OP_WRLOCKACK;
7c673cae
FG
2914 break;
2915 }
2916
11fdf7f2 2917 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
2918 return;
2919
2920 // ack
f67539c2 2921 auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
7c673cae
FG
2922 r->set_lock_type(lock->get_type());
2923 lock->get_parent()->set_object_info(r->get_object_info());
f67539c2 2924 if (replycode == MMDSPeerRequest::OP_XLOCKACK)
11fdf7f2 2925 lock->encode_locked_state(r->get_lock_data());
f67539c2 2926 mds->send_message(r, mdr->peer_request->get_connection());
7c673cae
FG
2927 }
2928
2929 // done.
f67539c2 2930 mdr->reset_peer_request();
7c673cae
FG
2931 }
2932 break;
2933
f67539c2
TL
2934 case MMDSPeerRequest::OP_UNXLOCK:
2935 case MMDSPeerRequest::OP_UNWRLOCK:
7c673cae 2936 {
f67539c2
TL
2937 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2938 mdr->peer_request->get_object_info());
11fdf7f2
TL
2939 ceph_assert(lock);
2940 auto it = mdr->locks.find(lock);
2941 ceph_assert(it != mdr->locks.end());
7c673cae
FG
2942 bool need_issue = false;
2943 switch (op) {
f67539c2 2944 case MMDSPeerRequest::OP_UNXLOCK:
11fdf7f2 2945 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
7c673cae 2946 break;
f67539c2 2947 case MMDSPeerRequest::OP_UNWRLOCK:
11fdf7f2 2948 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
7c673cae
FG
2949 break;
2950 }
2951 if (need_issue)
2952 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2953
2954 // done. no ack necessary.
f67539c2 2955 mdr->reset_peer_request();
7c673cae
FG
2956 }
2957 break;
2958
f67539c2
TL
2959 case MMDSPeerRequest::OP_AUTHPIN:
2960 handle_peer_auth_pin(mdr);
7c673cae
FG
2961 break;
2962
f67539c2
TL
2963 case MMDSPeerRequest::OP_LINKPREP:
2964 case MMDSPeerRequest::OP_UNLINKPREP:
2965 handle_peer_link_prep(mdr);
7c673cae
FG
2966 break;
2967
f67539c2
TL
2968 case MMDSPeerRequest::OP_RMDIRPREP:
2969 handle_peer_rmdir_prep(mdr);
7c673cae
FG
2970 break;
2971
f67539c2
TL
2972 case MMDSPeerRequest::OP_RENAMEPREP:
2973 handle_peer_rename_prep(mdr);
7c673cae
FG
2974 break;
2975
7c673cae
FG
2976 default:
2977 ceph_abort();
2978 }
2979}
2980
f67539c2 2981void Server::handle_peer_auth_pin(MDRequestRef& mdr)
7c673cae 2982{
f67539c2 2983 dout(10) << "handle_peer_auth_pin " << *mdr << dendl;
7c673cae
FG
2984
2985 // build list of objects
2986 list<MDSCacheObject*> objects;
2987 CInode *auth_pin_freeze = NULL;
f67539c2 2988 bool nonblocking = mdr->peer_request->is_nonblocking();
7c673cae 2989 bool fail = false, wouldblock = false, readonly = false;
f67539c2 2990 ref_t<MMDSPeerRequest> reply;
7c673cae
FG
2991
2992 if (mdcache->is_readonly()) {
2993 dout(10) << " read-only FS" << dendl;
2994 readonly = true;
2995 fail = true;
2996 }
2997
2998 if (!fail) {
f67539c2 2999 for (const auto &oi : mdr->peer_request->get_authpins()) {
11fdf7f2 3000 MDSCacheObject *object = mdcache->get_object(oi);
7c673cae 3001 if (!object) {
11fdf7f2 3002 dout(10) << " don't have " << oi << dendl;
7c673cae
FG
3003 fail = true;
3004 break;
3005 }
3006
3007 objects.push_back(object);
f67539c2 3008 if (oi == mdr->peer_request->get_authpin_freeze())
7c673cae
FG
3009 auth_pin_freeze = static_cast<CInode*>(object);
3010 }
3011 }
3012
3013 // can we auth pin them?
3014 if (!fail) {
9f95a23c
TL
3015 for (const auto& obj : objects) {
3016 if (!obj->is_auth()) {
3017 dout(10) << " not auth for " << *obj << dendl;
7c673cae
FG
3018 fail = true;
3019 break;
3020 }
9f95a23c 3021 if (mdr->is_auth_pinned(obj))
7c673cae 3022 continue;
9f95a23c
TL
3023 if (!mdr->can_auth_pin(obj)) {
3024 if (nonblocking) {
3025 dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
7c673cae
FG
3026 fail = true;
3027 wouldblock = true;
3028 break;
3029 }
3030 // wait
9f95a23c
TL
3031 dout(10) << " waiting for authpinnable on " << *obj << dendl;
3032 obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
7c673cae
FG
3033 mdr->drop_local_auth_pins();
3034
9f95a23c
TL
3035 mds->locker->notify_freeze_waiter(obj);
3036 goto blocked;
7c673cae
FG
3037 }
3038 }
3039 }
3040
9f95a23c 3041 if (!fail) {
7c673cae
FG
3042 /* if we previously froze an auth pin on the wrong inode, unfreeze it */
3043 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
3044 mdr->more()->rename_inode != auth_pin_freeze)
3045 mdr->unfreeze_auth_pin(true);
3046
f67539c2 3047 /* handle_peer_rename_prep() calls freeze_inode() to wait for all other operations
7c673cae
FG
3048 * on the source inode to complete. This happens after all locks for the rename
3049 * operation are acquired. But to acquire locks, we need to auth pin the locks'
3050 * parent objects first. So there is an ABBA deadlock if someone auth pins the source inode
f67539c2 3051 * after locks are acquired and before Server::handle_peer_rename_prep() is called.
7c673cae
FG
3052 * The solution is to freeze the inode and prevent other MDRequests from getting
3053 * new auth pins.
3054 */
3055 if (auth_pin_freeze) {
3056 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
3057 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
3058 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
3059 mds->mdlog->flush();
9f95a23c 3060 goto blocked;
7c673cae
FG
3061 }
3062 }
7c673cae
FG
3063 }
3064
f67539c2 3065 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
7c673cae 3066
9f95a23c
TL
3067 if (fail) {
3068 mdr->drop_local_auth_pins(); // just in case
3069 if (readonly)
3070 reply->mark_error_rofs();
3071 if (wouldblock)
3072 reply->mark_error_wouldblock();
3073 } else {
3074 // auth pin!
3075 for (const auto& obj : objects) {
3076 dout(10) << "auth_pinning " << *obj << dendl;
3077 mdr->auth_pin(obj);
3078 }
3079 // return list of my auth_pins (if any)
3080 for (const auto &p : mdr->object_states) {
3081 if (!p.second.auth_pinned)
3082 continue;
3083 MDSCacheObjectInfo info;
3084 p.first->set_object_info(info);
3085 reply->get_authpins().push_back(info);
3086 if (p.first == (MDSCacheObject*)auth_pin_freeze)
3087 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
3088 }
3089 }
7c673cae 3090
f67539c2 3091 mds->send_message_mds(reply, mdr->peer_to_mds);
7c673cae
FG
3092
3093 // clean up this request
f67539c2 3094 mdr->reset_peer_request();
7c673cae 3095 return;
9f95a23c
TL
3096
3097blocked:
f67539c2
TL
3098 if (mdr->peer_request->should_notify_blocking()) {
3099 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
9f95a23c 3100 reply->mark_req_blocked();
f67539c2
TL
3101 mds->send_message_mds(reply, mdr->peer_to_mds);
3102 mdr->peer_request->clear_notify_blocking();
9f95a23c
TL
3103 }
3104 return;
7c673cae
FG
3105}
3106
f67539c2 3107void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7c673cae 3108{
f67539c2 3109 dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
7c673cae
FG
3110 mds_rank_t from = mds_rank_t(ack->get_source().num());
3111
9f95a23c
TL
3112 if (ack->is_req_blocked()) {
3113 mdr->disable_lock_cache();
f67539c2 3114 // peer auth pin is blocked, drop locks to avoid deadlock
9f95a23c
TL
3115 mds->locker->drop_locks(mdr.get(), nullptr);
3116 return;
3117 }
3118
7c673cae
FG
3119 // added auth pins?
3120 set<MDSCacheObject*> pinned;
11fdf7f2
TL
3121 for (const auto &oi : ack->get_authpins()) {
3122 MDSCacheObject *object = mdcache->get_object(oi);
3123 ceph_assert(object); // we pinned it
7c673cae 3124 dout(10) << " remote has pinned " << *object << dendl;
9f95a23c 3125 mdr->set_remote_auth_pinned(object, from);
11fdf7f2 3126 if (oi == ack->get_authpin_freeze())
7c673cae
FG
3127 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
3128 pinned.insert(object);
3129 }
3130
3131 // removed frozen auth pin ?
3132 if (mdr->more()->is_remote_frozen_authpin &&
3133 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
9f95a23c
TL
3134 auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
3135 ceph_assert(stat_p);
3136 if (stat_p->remote_auth_pinned == from) {
7c673cae
FG
3137 mdr->more()->is_remote_frozen_authpin = false;
3138 }
3139 }
3140
3141 // removed auth pins?
9f95a23c
TL
3142 for (auto& p : mdr->object_states) {
3143 if (p.second.remote_auth_pinned == MDS_RANK_NONE)
3144 continue;
3145 MDSCacheObject* object = p.first;
3146 if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
7c673cae 3147 dout(10) << " remote has unpinned " << *object << dendl;
9f95a23c 3148 mdr->_clear_remote_auth_pinned(p.second);
7c673cae
FG
3149 }
3150 }
3151
f67539c2
TL
3152 // note peer
3153 mdr->more()->peers.insert(from);
9f95a23c
TL
3154
3155 // clear from waiting list
f67539c2 3156 auto ret = mdr->more()->waiting_on_peer.erase(from);
9f95a23c
TL
3157 ceph_assert(ret);
3158
7c673cae 3159 if (ack->is_error_rofs()) {
f67539c2 3160 mdr->more()->peer_error = -CEPHFS_EROFS;
7c673cae 3161 } else if (ack->is_error_wouldblock()) {
f67539c2 3162 mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
7c673cae 3163 }
7c673cae
FG
3164
3165 // go again?
f67539c2 3166 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
3167 mdcache->dispatch_request(mdr);
3168 else
f67539c2 3169 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
3170}
3171
3172
3173// ---------------------------------------
3174// HELPERS
3175
3176
3177/**
3178 * check whether we are permitted to complete a request
3179 *
3180 * Check whether we have permission to perform the operation specified
3181 * by mask on the given inode, based on the capability in the mdr's
3182 * session.
3183 */
3184bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
3185{
3186 if (mdr->session) {
3187 int r = mdr->session->check_access(
3188 in, mask,
3189 mdr->client_request->get_caller_uid(),
3190 mdr->client_request->get_caller_gid(),
3191 &mdr->client_request->get_caller_gid_list(),
3192 mdr->client_request->head.args.setattr.uid,
3193 mdr->client_request->head.args.setattr.gid);
3194 if (r < 0) {
3195 respond_to_request(mdr, r);
3196 return false;
3197 }
3198 }
3199 return true;
3200}
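/*
 * A minimal caller sketch (hypothetical; `mdr` and `cur` assumed to come
 * from the surrounding request context). check_access() replies to the
 * client itself on failure, so callers simply bail out on false:
 *
 *   if (!check_access(mdr, cur, MAY_READ))
 *     return;   // respond_to_request() already sent the error
 */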
3201
3202/**
3203 * check whether fragment has reached maximum size
3204 *
3205 */
20effc67 3206bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
7c673cae 3207{
20effc67
TL
3208 const auto size = dir->get_frag_size();
3209 const auto max = bal_fragment_size_max;
3210 if (size >= max) {
3211 dout(10) << "fragment " << *dir << " size exceeds " << max << " (CEPHFS_ENOSPC)" << dendl;
f67539c2 3212 respond_to_request(mdr, -CEPHFS_ENOSPC);
7c673cae 3213 return false;
20effc67
TL
3214 } else {
3215 dout(20) << "fragment " << *dir << " size " << size << " < " << max << dendl;
7c673cae
FG
3216 }
3217
3218 return true;
3219}
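/*
 * A minimal usage sketch (hedged; mirrors the create paths later in this
 * file). A false return has already sent -CEPHFS_ENOSPC to the client, so
 * handlers gate dentry creation on this check and just return:
 *
 *   if (!check_fragment_space(mdr, dir))
 *     return;
 */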
3220
20effc67
TL
3221/**
3222 * check whether the number of entries in a dir has reached the maximum
3223 *
3224 */
3225bool Server::check_dir_max_entries(MDRequestRef &mdr, CDir *in)
3226{
3227 const uint64_t size = in->inode->get_projected_inode()->dirstat.nfiles +
3228 in->inode->get_projected_inode()->dirstat.nsubdirs;
3229 if (dir_max_entries && size >= dir_max_entries) {
3230 dout(10) << "entries per dir " << *in << " size exceeds " << dir_max_entries << " (CEPHFS_ENOSPC)" << dendl;
3231 respond_to_request(mdr, -CEPHFS_ENOSPC);
3232 return false;
3233 }
3234 return true;
3235}
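/*
 * This is the companion limit to check_fragment_space(): that one bounds a
 * single dirfrag, while this bounds the whole directory via the projected
 * dirstat. A hedged usage sketch, as in handle_client_openc() below:
 *
 *   if (!check_fragment_space(mdr, dir))
 *     return;
 *   if (!check_dir_max_entries(mdr, dir))
 *     return;
 */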
3236
3237
7c673cae
FG
3238CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3239{
f67539c2
TL
3240 string straydname;
3241 in->name_stray_dentry(straydname);
3242
7c673cae
FG
3243 CDentry *straydn = mdr->straydn;
3244 if (straydn) {
9f95a23c
TL
3245 ceph_assert(straydn->get_name() == straydname);
3246 return straydn;
7c673cae 3247 }
7c673cae
FG
3248 CDir *straydir = mdcache->get_stray_dir(in);
3249
3250 if (!mdr->client_request->is_replay() &&
3251 !check_fragment_space(mdr, straydir))
f67539c2
TL
3252 return nullptr;
3253
3254 straydn = straydir->lookup(straydname);
3255 if (!straydn) {
3256 if (straydir->is_frozen_dir()) {
3257 dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
3258 mds->locker->drop_locks(mdr.get());
3259 mdr->drop_local_auth_pins();
3260 straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3261 return nullptr;
3262 }
3263 straydn = straydir->add_null_dentry(straydname);
3264 straydn->mark_new();
3265 } else {
3266 ceph_assert(straydn->get_projected_linkage()->is_null());
3267 }
7c673cae 3268
f67539c2 3269 straydn->state_set(CDentry::STATE_STRAY);
7c673cae
FG
3270 mdr->straydn = straydn;
3271 mdr->pin(straydn);
f67539c2 3272
7c673cae
FG
3273 return straydn;
3274}
3275
3276/** prepare_new_inode
3277 *
3278 * create a new inode. set c/m/atime. hit dir pop.
3279 */
3280CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
f67539c2 3281 const file_layout_t *layout)
7c673cae
FG
3282{
3283 CInode *in = new CInode(mdcache);
f67539c2 3284 auto _inode = in->_get_inode();
7c673cae
FG
3285
3286 // Server::prepare_force_open_sessions() can re-open session in closing
3287 // state. In that corner case, session's prealloc_inos are being freed.
3288 // To simplify the code, we disallow using/refilling session's prealloc_ino
3289 // while session is opening.
92f5a8d4 3290 bool allow_prealloc_inos = mdr->session->is_open();
7c673cae
FG
3291
3292 // assign ino
f67539c2 3293 if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(useino))) {
7c673cae 3294 mds->sessionmap.mark_projected(mdr->session);
7c673cae 3295 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
f67539c2 3296 << " (" << mdr->session->info.prealloc_inos.size() << " left)"
7c673cae
FG
3297 << dendl;
3298 } else {
3299 mdr->alloc_ino =
f67539c2 3300 _inode->ino = mds->inotable->project_alloc_id(useino);
7c673cae
FG
3301 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3302 }
3303
f67539c2
TL
3304 if (useino && useino != _inode->ino) {
3305 dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
7c673cae
FG
3306 mds->clog->error() << mdr->client_request->get_source()
3307 << " specified ino " << useino
f67539c2 3308 << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
7c673cae
FG
3309 //ceph_abort(); // just for now.
3310 }
3311
3312 if (allow_prealloc_inos &&
11fdf7f2
TL
3313 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3314 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
7c673cae 3315 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
11fdf7f2 3316 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
7c673cae
FG
3317 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3318 mds->sessionmap.mark_projected(mdr->session);
3319 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3320 }
3321
f67539c2
TL
3322 _inode->version = 1;
3323 _inode->xattr_version = 1;
3324 _inode->nlink = 1; // FIXME
7c673cae 3325
f67539c2 3326 _inode->mode = mode;
7c673cae 3327
92f5a8d4 3328 // FIPS zeroization audit 20191117: this memset is not security related.
f67539c2
TL
3329 memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
3330 if (_inode->is_dir()) {
3331 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
7c673cae 3332 } else if (layout) {
f67539c2 3333 _inode->layout = *layout;
7c673cae 3334 } else {
f67539c2 3335 _inode->layout = mdcache->default_file_layout;
7c673cae
FG
3336 }
3337
f67539c2
TL
3338 _inode->truncate_size = -1ull; // not truncated, yet!
3339 _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
7c673cae
FG
3340
3341 CInode *diri = dir->get_inode();
2a845540 3342 auto pip = diri->get_projected_inode();
7c673cae 3343
2a845540 3344 dout(10) << oct << " dir mode 0" << pip->mode << " new mode 0" << mode << dec << dendl;
7c673cae 3345
2a845540 3346 if (pip->mode & S_ISGID) {
7c673cae 3347 dout(10) << " dir is sticky" << dendl;
2a845540 3348 _inode->gid = pip->gid;
7c673cae 3349 if (S_ISDIR(mode)) {
2a845540 3350 dout(10) << " new dir also sticky" << dendl;
f67539c2 3351 _inode->mode |= S_ISGID;
7c673cae 3352 }
2a845540 3353 } else {
f67539c2 3354 _inode->gid = mdr->client_request->get_caller_gid();
2a845540 3355 }
7c673cae 3356
f67539c2 3357 _inode->uid = mdr->client_request->get_caller_uid();
7c673cae 3358
f67539c2 3359 _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
7c673cae
FG
3360 mdr->get_op_stamp();
3361
f67539c2 3362 _inode->change_attr = 0;
7c673cae 3363
9f95a23c 3364 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 3365 if (req->get_data().length()) {
11fdf7f2 3366 auto p = req->get_data().cbegin();
7c673cae
FG
3367
3368 // xattrs on new inode?
f67539c2
TL
3369 auto _xattrs = CInode::allocate_xattr_map();
3370 decode_noshare(*_xattrs, p);
3371 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
20effc67
TL
3372 if (_xattrs->count("encryption.ctx")) {
3373 _inode->fscrypt = true;
3374 }
f67539c2 3375 in->reset_xattrs(std::move(_xattrs));
7c673cae
FG
3376 }
3377
3378 if (!mds->mdsmap->get_inline_data_enabled() ||
11fdf7f2 3379 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
f67539c2 3380 _inode->inline_data.version = CEPH_INLINE_NONE;
7c673cae
FG
3381
3382 mdcache->add_inode(in); // add
3383 dout(10) << "prepare_new_inode " << *in << dendl;
3384 return in;
3385}
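/*
 * A condensed create-path sketch (hedged; cf. handle_client_openc() below):
 * the new inode is paired with a projected null dentry and a pre-dirtied
 * version before the update is journaled:
 *
 *   CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
 *                                    req->head.args.open.mode | S_IFREG, &layout);
 *   dn->push_projected_linkage(newi);
 *   newi->_get_inode()->version = dn->pre_dirty();
 */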
3386
3387void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3388{
3389 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3390 << " inotablev " << mds->inotable->get_projected_version()
3391 << dendl;
3392 blob->set_ino_alloc(mdr->alloc_ino,
3393 mdr->used_prealloc_ino,
3394 mdr->prealloc_inos,
3395 mdr->client_request->get_source(),
3396 mds->sessionmap.get_projected(),
3397 mds->inotable->get_projected_version());
3398}
3399
3400void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3401{
3402 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3403 << " / " << mdr->prealloc_inos
3404 << " / " << mdr->used_prealloc_ino << dendl;
3405
3406 if (mdr->alloc_ino) {
3407 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3408 }
3409 if (mdr->prealloc_inos.size()) {
11fdf7f2 3410 ceph_assert(session);
7c673cae 3411 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
f67539c2 3412 session->free_prealloc_inos.insert(mdr->prealloc_inos);
7c673cae 3413 session->info.prealloc_inos.insert(mdr->prealloc_inos);
81eedcae 3414 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
7c673cae
FG
3415 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3416 }
3417 if (mdr->used_prealloc_ino) {
11fdf7f2 3418 ceph_assert(session);
f67539c2 3419 session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
7c673cae
FG
3420 mds->sessionmap.mark_dirty(session);
3421 }
3422}
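/*
 * Lifecycle sketch for the two helpers above (hedged; `le` is a journal
 * update event assumed from the usual commit paths): allocations are
 * journaled while still projected, and applied to the InoTable/SessionMap
 * only once the log entry commits:
 *
 *   journal_allocated_inos(mdr, &le->metablob);   // at submit time
 *   ...
 *   apply_allocated_inos(mdr, session);           // in the commit callback
 */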
3423
2a845540
TL
3424struct C_MDS_TryOpenInode : public ServerContext {
3425 MDRequestRef mdr;
3426 inodeno_t ino;
3427 C_MDS_TryOpenInode(Server *s, MDRequestRef& r, inodeno_t i) :
3428 ServerContext(s), mdr(r), ino(i) {}
3429 void finish(int r) override {
3430 server->_try_open_ino(mdr, r, ino);
3431 }
3432};
3433
3434void Server::_try_open_ino(MDRequestRef& mdr, int r, inodeno_t ino)
3435{
3436 dout(10) << "_try_open_ino " << mdr.get() << " ino " << ino << " r=" << r << dendl;
3437
3438 // `r` is a rank if >=0, else an error code
3439 if (r >= 0) {
3440 mds_rank_t dest_rank(r);
3441 if (dest_rank == mds->get_nodeid())
3442 dispatch_client_request(mdr);
3443 else
3444 mdcache->request_forward(mdr, dest_rank);
3445 return;
3446 }
3447
3448 // give up
3449 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
3450 r = -CEPHFS_ESTALE;
3451 respond_to_request(mdr, r);
3452}
3453
7c673cae
FG
3454class C_MDS_TryFindInode : public ServerContext {
3455 MDRequestRef mdr;
2a845540
TL
3456 MDCache *mdcache;
3457 inodeno_t ino;
7c673cae 3458public:
2a845540
TL
3459 C_MDS_TryFindInode(Server *s, MDRequestRef& r, MDCache *m, inodeno_t i) :
3460 ServerContext(s), mdr(r), mdcache(m), ino(i) {}
7c673cae 3461 void finish(int r) override {
2a845540
TL
3462 if (r == -CEPHFS_ESTALE) { // :( find_ino_peers failed
3463 /*
3464 * There is one case where, if the MDS crashes and the
3465 * openfiletable journal couldn't be flushed, the replacing
3466 * MDS may not load some already opened CInodes into the
3467 * MDCache. If clients then retry requests after reconnecting,
3468 * the MDS will return -ESTALE after failing to find the ino
3469 * in all active peers.
3470 *
3471 * As a workaround, users can run `ls -R ${mountpoint}` to
3472 * list all the sub-files and sub-directories under the
3473 * mountpoint.
3474 *
3475 * Here we instead try to open the ino and then retry the
3476 * request.
3477 */
3478 CInode *in = mdcache->get_inode(ino);
3479 if (in && in->state_test(CInode::STATE_PURGING))
3480 server->respond_to_request(mdr, r);
3481 else
3482 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_TryOpenInode(server, mdr, ino));
3483 } else {
7c673cae 3484 server->dispatch_client_request(mdr);
2a845540 3485 }
7c673cae
FG
3486 }
3487};
3488
7c673cae
FG
3489/* If this returns null, the request has been handled
3490 * as appropriate: forwarded on, or the client's been replied to */
9f95a23c 3491CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
7c673cae 3492 bool want_auth,
9f95a23c 3493 bool no_want_auth)
7c673cae 3494{
9f95a23c 3495 const filepath& refpath = mdr->get_filepath();
7c673cae
FG
3496 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3497
9f95a23c
TL
3498 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3499 return mdr->in[0];
7c673cae
FG
3500
3501 // traverse
f67539c2 3502 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
9f95a23c
TL
3503 int flags = 0;
3504 if (refpath.is_last_snap()) {
3505 if (!no_want_auth)
3506 want_auth = true;
3507 } else {
f91f0fd5
TL
3508 if (!no_want_auth && forward_all_requests_to_auth)
3509 want_auth = true;
9f95a23c
TL
3510 flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
3511 }
3512 if (want_auth)
3513 flags |= MDS_TRAVERSE_WANT_AUTH;
3514 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
7c673cae 3515 if (r > 0)
9f95a23c 3516 return nullptr; // delayed
7c673cae 3517 if (r < 0) { // error
f67539c2 3518 if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
9f95a23c
TL
3519 if (mdr->client_request &&
3520 mdr->client_request->get_dentry_wanted())
3521 mdr->tracedn = mdr->dn[0].back();
7c673cae 3522 respond_to_request(mdr, r);
f67539c2
TL
3523 } else if (r == -CEPHFS_ESTALE) {
3524 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
2a845540
TL
3525 inodeno_t ino = refpath.get_ino();
3526 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
7c673cae
FG
3527 } else {
3528 dout(10) << "FAIL on error " << r << dendl;
3529 respond_to_request(mdr, r);
3530 }
9f95a23c 3531 return nullptr;
7c673cae 3532 }
9f95a23c 3533 CInode *ref = mdr->in[0];
7c673cae
FG
3534 dout(10) << "ref is " << *ref << dendl;
3535
7c673cae 3536 if (want_auth) {
7c673cae
FG
3537 // auth_pin?
3538 // do NOT proceed if freezing, as cap release may defer in that case, and
3539 // we could deadlock when we try to lock @ref.
3540 // if we're already auth_pinned, continue; the release has already been processed.
3541 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3542 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3543 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
9f95a23c
TL
3544 ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
3545 if (mdr->is_any_remote_auth_pin())
224ce89b 3546 mds->locker->notify_freeze_waiter(ref);
7c673cae
FG
3547 return nullptr;
3548 }
7c673cae
FG
3549 mdr->auth_pin(ref);
3550 }
3551
7c673cae
FG
3552 // set and pin ref
3553 mdr->pin(ref);
3554 return ref;
3555}
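/*
 * A minimal caller sketch (hedged; cf. handle_client_open() below). A null
 * return means the request was forwarded, queued for retry, or already
 * answered, so the handler must not touch it further on that path:
 *
 *   CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
 *   if (!cur)
 *     return;
 */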
3556
3557
3558/** rdlock_path_xlock_dentry
3559 * traverse path to the directory that could/would contain dentry.
3560 * make sure i am auth for that dentry, forward as necessary.
3561 * create null dentry in place (or use existing if okexist).
3562 * get rdlocks on traversed dentries, xlock on new dentry.
3563 */
9f95a23c
TL
3564CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
3565 bool create, bool okexist, bool want_layout)
7c673cae 3566{
9f95a23c 3567 const filepath& refpath = mdr->get_filepath();
7c673cae
FG
3568 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3569
9f95a23c
TL
3570 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3571 return mdr->dn[0].back();
3572
3573 // figure parent dir vs dname
3574 if (refpath.depth() == 0) {
3575 dout(7) << "invalid path (zero length)" << dendl;
f67539c2 3576 respond_to_request(mdr, -CEPHFS_EINVAL);
9f95a23c
TL
3577 return nullptr;
3578 }
3579
3580 if (refpath.is_last_snap()) {
f67539c2 3581 respond_to_request(mdr, -CEPHFS_EROFS);
9f95a23c
TL
3582 return nullptr;
3583 }
7c673cae 3584
9f95a23c
TL
3585 if (refpath.is_last_dot_or_dotdot()) {
3586 dout(7) << "invalid path (last dot or dot_dot)" << dendl;
3587 if (create)
f67539c2 3588 respond_to_request(mdr, -CEPHFS_EEXIST);
9f95a23c 3589 else
f67539c2 3590 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
9f95a23c
TL
3591 return nullptr;
3592 }
7c673cae 3593
9f95a23c 3594 // traverse to parent dir
f67539c2 3595 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
9f95a23c
TL
3596 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
3597 MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
3598 MDS_TRAVERSE_WANT_AUTH;
3599 if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
3600 flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
3601 if (create)
3602 flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
3603 if (want_layout)
3604 flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
3605 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3606 if (r > 0)
3607 return nullptr; // delayed
3608 if (r < 0) {
f67539c2
TL
3609 if (r == -CEPHFS_ESTALE) {
3610 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
2a845540
TL
3611 inodeno_t ino = refpath.get_ino();
3612 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
9f95a23c
TL
3613 return nullptr;
3614 }
3615 respond_to_request(mdr, r);
3616 return nullptr;
3617 }
7c673cae 3618
9f95a23c
TL
3619 CDentry *dn = mdr->dn[0].back();
3620 CDir *dir = dn->get_dir();
7c673cae 3621 CInode *diri = dir->get_inode();
9f95a23c 3622
7c673cae
FG
3623 if (!mdr->reqid.name.is_mds()) {
3624 if (diri->is_system() && !diri->is_root()) {
f67539c2 3625 respond_to_request(mdr, -CEPHFS_EROFS);
9f95a23c 3626 return nullptr;
7c673cae
FG
3627 }
3628 }
9f95a23c 3629
7c673cae 3630 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
f67539c2 3631 respond_to_request(mdr, -CEPHFS_ENOENT);
9f95a23c 3632 return nullptr;
7c673cae
FG
3633 }
3634
9f95a23c
TL
3635 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3636 if (dnl->is_null()) {
3637 if (!create && okexist) {
f67539c2 3638 respond_to_request(mdr, -CEPHFS_ENOENT);
9f95a23c 3639 return nullptr;
7c673cae
FG
3640 }
3641
9f95a23c
TL
3642 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3643 dn->first = std::max(dn->first, next_snap);
7c673cae 3644 } else {
9f95a23c 3645 if (!okexist) {
f67539c2 3646 respond_to_request(mdr, -CEPHFS_EEXIST);
9f95a23c
TL
3647 return nullptr;
3648 }
3649 mdr->in[0] = dnl->get_inode();
7c673cae
FG
3650 }
3651
7c673cae
FG
3652 return dn;
3653}
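/*
 * A create-style caller sketch (hedged; cf. handle_client_openc() below):
 * the returned dentry is xlocked, and its projected linkage tells the
 * handler whether it is on the create or the already-exists path:
 *
 *   CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
 *   if (!dn)
 *     return;
 *   if (dn->get_projected_linkage()->is_null()) {
 *     ... // create path
 *   }
 */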
3654
9f95a23c
TL
3655/** rdlock_two_paths_xlock_destdn
3656 * traverse two paths and lock the two paths in proper order.
3657 * The order of taking locks is:
3658 * 1. Lock directory inodes or dentries according to which trees they
3659 * are under. Lock objects under fs root before objects under mdsdir.
3660 * 2. Lock directory inodes or dentries according to their depth, in
3661 * ascending order.
3662 * 3. Lock directory inodes or dentries according to inode numbers or
3663 * dentries' parent inode numbers, in ascending order.
3664 * 4. Lock dentries in the same directory in order of their keys.
3665 * 5. Lock non-directory inodes according to inode numbers, in ascending
3666 * order.
3667 */
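/*
 * A worked example of the rules above (hedged): for a rename where srcdir
 * and destdir differ but sit under the same tree, the directory whose path
 * compares lower -- or, when the paths tie, whose ino is lower -- has its
 * filelock/nestlock wrlocked first (see lock_destdir_first below), the
 * destination dentry is xlocked alongside its directory per rule 4, and
 * any remaining plain inode locks follow in ino order per rule 5.
 */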
3668std::pair<CDentry*, CDentry*>
3669Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
3670{
7c673cae 3671
9f95a23c
TL
3672 const filepath& refpath = mdr->get_filepath();
3673 const filepath& refpath2 = mdr->get_filepath2();
7c673cae 3674
9f95a23c 3675 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
7c673cae 3676
9f95a23c
TL
3677 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3678 return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
7c673cae 3679
9f95a23c 3680 if (refpath.depth() != 1 || refpath2.depth() != 1) {
f67539c2 3681 respond_to_request(mdr, -CEPHFS_EINVAL);
9f95a23c
TL
3682 return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
3683 }
3684
3685 if (refpath.is_last_snap() || refpath2.is_last_snap()) {
f67539c2 3686 respond_to_request(mdr, -CEPHFS_EROFS);
9f95a23c
TL
3687 return std::make_pair(nullptr, nullptr);
3688 }
3689
3690 // traverse to parent dir
f67539c2 3691 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
9f95a23c
TL
3692 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
3693 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3694 if (r != 0) {
f67539c2
TL
3695 if (r == -CEPHFS_ESTALE) {
3696 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
2a845540
TL
3697 inodeno_t ino = refpath.get_ino();
3698 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
9f95a23c
TL
3699 } else if (r < 0) {
3700 respond_to_request(mdr, r);
3701 }
3702 return std::make_pair(nullptr, nullptr);
3703 }
3704
3705 flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
3706 r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
3707 if (r != 0) {
f67539c2
TL
3708 if (r == -CEPHFS_ESTALE) {
3709 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
2a845540
TL
3710 inodeno_t ino = refpath2.get_ino();
3711 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
9f95a23c
TL
3712 } else if (r < 0) {
3713 respond_to_request(mdr, r);
3714 }
3715 return std::make_pair(nullptr, nullptr);
3716 }
3717
3718 CDentry *srcdn = mdr->dn[1].back();
3719 CDir *srcdir = srcdn->get_dir();
3720 CDentry *destdn = mdr->dn[0].back();
3721 CDir *destdir = destdn->get_dir();
3722
3723 if (!mdr->reqid.name.is_mds()) {
3724 if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
3725 (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
f67539c2 3726 respond_to_request(mdr, -CEPHFS_EROFS);
9f95a23c
TL
3727 return std::make_pair(nullptr, nullptr);
3728 }
3729 }
3730
3731 if (!destdir->get_inode()->is_base() &&
3732 destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
f67539c2 3733 respond_to_request(mdr, -CEPHFS_ENOENT);
9f95a23c
TL
3734 return std::make_pair(nullptr, nullptr);
3735 }
3736
3737 MutationImpl::LockOpVec lov;
3738 if (srcdir->get_inode() == destdir->get_inode()) {
3739 lov.add_wrlock(&destdir->inode->filelock);
3740 lov.add_wrlock(&destdir->inode->nestlock);
3741 if (xlock_srcdn && srcdir != destdir) {
3742 mds_rank_t srcdir_auth = srcdir->authority().first;
3743 if (srcdir_auth != mds->get_nodeid()) {
3744 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3745 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3746 }
3747 }
3748
3749 if (srcdn->get_name() > destdn->get_name())
3750 lov.add_xlock(&destdn->lock);
3751
3752 if (xlock_srcdn)
3753 lov.add_xlock(&srcdn->lock);
3754 else
3755 lov.add_rdlock(&srcdn->lock);
3756
3757 if (srcdn->get_name() < destdn->get_name())
3758 lov.add_xlock(&destdn->lock);
3759 } else {
3760 int cmp = mdr->compare_paths();
3761 bool lock_destdir_first =
3762 (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
3763
3764 if (lock_destdir_first) {
3765 lov.add_wrlock(&destdir->inode->filelock);
3766 lov.add_wrlock(&destdir->inode->nestlock);
3767 lov.add_xlock(&destdn->lock);
3768 }
3769
3770 if (xlock_srcdn) {
3771 mds_rank_t srcdir_auth = srcdir->authority().first;
3772 if (srcdir_auth == mds->get_nodeid()) {
3773 lov.add_wrlock(&srcdir->inode->filelock);
3774 lov.add_wrlock(&srcdir->inode->nestlock);
3775 } else {
3776 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3777 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3778 }
3779 lov.add_xlock(&srcdn->lock);
3780 } else {
3781 lov.add_rdlock(&srcdn->lock);
3782 }
3783
3784 if (!lock_destdir_first) {
3785 lov.add_wrlock(&destdir->inode->filelock);
3786 lov.add_wrlock(&destdir->inode->nestlock);
3787 lov.add_xlock(&destdn->lock);
3788 }
3789 }
3790
3791 CInode *auth_pin_freeze = nullptr;
3792 // XXX any better way to do this?
3793 if (xlock_srcdn && !srcdn->is_auth()) {
3794 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
3795 auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
3796 }
3797 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
3798 return std::make_pair(nullptr, nullptr);
3799
3800 if (srcdn->get_projected_linkage()->is_null()) {
f67539c2 3801 respond_to_request(mdr, -CEPHFS_ENOENT);
9f95a23c
TL
3802 return std::make_pair(nullptr, nullptr);
3803 }
3804
3805 if (destdn->get_projected_linkage()->is_null()) {
3806 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3807 destdn->first = std::max(destdn->first, next_snap);
3808 }
3809
3810 mdr->locking_state |= MutationImpl::PATH_LOCKED;
3811
3812 return std::make_pair(destdn, srcdn);
3813}
3814
3815/**
3816 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3817 *
3818 * @param diri base inode
3819 * @param fg the exact frag we want
7c673cae
FG
3820 * @param mdr request
3821 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3822 */
3823CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3824{
3825 CDir *dir = diri->get_dirfrag(fg);
3826
9f95a23c
TL
3827 if (dir) {
3828 // am i auth for the dirfrag?
3829 if (!dir->is_auth()) {
3830 mds_rank_t auth = dir->authority().first;
3831 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3832 << ", fw to mds." << auth << dendl;
3833 mdcache->request_forward(mdr, auth);
3834 return nullptr;
3835 }
3836 } else {
3837 // not open and inode not mine?
3838 if (!diri->is_auth()) {
3839 mds_rank_t inauth = diri->authority().first;
3840 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3841 mdcache->request_forward(mdr, inauth);
3842 return nullptr;
3843 }
7c673cae 3844
9f95a23c
TL
3845 // not open and inode frozen?
3846 if (diri->is_frozen()) {
3847 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3848 ceph_assert(diri->get_parent_dir());
3849 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3850 return nullptr;
3851 }
7c673cae 3852
9f95a23c 3853 // invent?
7c673cae 3854 dir = diri->get_or_open_dirfrag(mdcache, fg);
7c673cae
FG
3855 }
3856
3857 return dir;
3858}
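/*
 * A minimal caller sketch (hedged; mirrors _lookup_snap_ino() below):
 * callers pick the exact frag from the fragtree and bail out on null,
 * since forwarding or waiting has already been arranged:
 *
 *   frag_t fg = diri->dirfragtree[hash];
 *   CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
 *   if (!dir)
 *     return;
 */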
3859
3860
3861// ===============================================================================
3862// STAT
3863
3864void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3865{
9f95a23c 3866 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
3867
3868 if (req->get_filepath().depth() == 0 && is_lookup) {
3869 // refpath can't be empty for lookup but it can for
3870 // getattr (we do getattr with empty refpath for mount of '/')
f67539c2 3871 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
3872 return;
3873 }
3874
28e407b8
AA
3875 bool want_auth = false;
3876 int mask = req->head.args.getattr.mask;
3877 if (mask & CEPH_STAT_RSTAT)
3878 want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
3879
f91f0fd5 3880 if (!mdr->is_batch_head() && mdr->can_batch()) {
f67539c2 3881 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
f91f0fd5
TL
3882 int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
3883 (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
3884 &mdr->dn[0], &mdr->in[0]);
3885 if (r > 0)
3886 return; // delayed
9f95a23c 3887
f91f0fd5
TL
3888 if (r < 0) {
3889 // fall-thru. let rdlock_path_pin_ref() check again.
3890 } else if (is_lookup) {
3891 CDentry* dn = mdr->dn[0].back();
3892 mdr->pin(dn);
3893 auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
9f95a23c 3894 if (em.second) {
f91f0fd5 3895 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
9f95a23c 3896 } else {
f91f0fd5 3897 dout(20) << __func__ << ": LOOKUP op, waiting for earlier getattr ops with the same mask to respond. " << *mdr << dendl;
9f95a23c
TL
3898 em.first->second->add_request(mdr);
3899 return;
3900 }
3901 } else {
f91f0fd5
TL
3902 CInode *in = mdr->in[0];
3903 mdr->pin(in);
3904 auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
9f95a23c 3905 if (em.second) {
f91f0fd5 3906 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
9f95a23c 3907 } else {
f91f0fd5 3908 dout(20) << __func__ << ": GETATTR op, waiting for earlier getattr ops with the same mask to respond. " << *mdr << dendl;
9f95a23c
TL
3909 em.first->second->add_request(mdr);
3910 return;
3911 }
3912 }
9f95a23c 3913 }
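/*
 * Batching note (hedged reading of the emplace() calls above): the first
 * request for a given getattr mask becomes the batch head (em.second is
 * true) and keeps executing; later requests with the same mask are parked
 * via add_request() and answered when the head responds -- see
 * Batch_Getattr_Lookup at the top of this file.
 */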
7c673cae 3914
f91f0fd5
TL
3915 CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
3916 if (!ref)
3917 return;
3918
3919 mdr->getattr_caps = mask;
3920
7c673cae
FG
3921 /*
3922 * if client currently holds the EXCL cap on a field, do not rdlock
3923 * it; client's stat() will result in valid info if _either_ EXCL
3924 * cap is held or MDS rdlocks and reads the value here.
3925 *
3926 * handling this case here is easier than weakening rdlock
3927 * semantics... that would cause problems elsewhere.
3928 */
3929 client_t client = mdr->get_client();
3930 int issued = 0;
3931 Capability *cap = ref->get_client_cap(client);
3932 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3933 mdr->snapid <= cap->client_follows))
3934 issued = cap->issued();
3935
9f95a23c
TL
3936 // FIXME
3937 MutationImpl::LockOpVec lov;
94b18763 3938 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
11fdf7f2 3939 lov.add_rdlock(&ref->linklock);
94b18763 3940 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
11fdf7f2 3941 lov.add_rdlock(&ref->authlock);
94b18763 3942 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
11fdf7f2 3943 lov.add_rdlock(&ref->xattrlock);
94b18763
FG
3944 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3945 // Don't wait on unstable filelock if client is allowed to read file size.
3946 // This can reduce the response time of getattr in the case that multiple
3947 // clients do stat(2) and there are writers.
3948 // The downside of this optimization is that mds may not issue Fs caps along
3949 // with getattr reply. Client may need to send more getattr requests.
11fdf7f2
TL
3950 if (mdr->is_rdlocked(&ref->filelock)) {
3951 lov.add_rdlock(&ref->filelock);
94b18763
FG
3952 } else if (ref->filelock.is_stable() ||
3953 ref->filelock.get_num_wrlocks() > 0 ||
3954 !ref->filelock.can_read(mdr->get_client())) {
11fdf7f2 3955 lov.add_rdlock(&ref->filelock);
9f95a23c 3956 mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
94b18763
FG
3957 }
3958 }
7c673cae 3959
11fdf7f2 3960 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
3961 return;
3962
3963 if (!check_access(mdr, ref, MAY_READ))
3964 return;
3965
28e407b8
AA
3966 utime_t now = ceph_clock_now();
3967 mdr->set_mds_stamp(now);
3968
7c673cae
FG
3969 // note which caps are requested, so we return at least a snapshot
3970 // value for them. (currently this matters for xattrs and inline data)
3971 mdr->getattr_caps = mask;
3972
11fdf7f2 3973 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
7c673cae
FG
3974
3975 // reply
3976 dout(10) << "reply to stat on " << *req << dendl;
3977 mdr->tracei = ref;
3978 if (is_lookup)
3979 mdr->tracedn = mdr->dn[0].back();
3980 respond_to_request(mdr, 0);
3981}
3982
3983struct C_MDS_LookupIno2 : public ServerContext {
3984 MDRequestRef mdr;
3985 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3986 void finish(int r) override {
3987 server->_lookup_ino_2(mdr, r);
3988 }
3989};
3990
7c673cae
FG
3991/*
3992 * filepath: ino
3993 */
3994void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3995 bool want_parent, bool want_dentry)
3996{
9f95a23c 3997 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2
TL
3998
3999 if ((uint64_t)req->head.args.lookupino.snapid > 0)
4000 return _lookup_snap_ino(mdr);
7c673cae
FG
4001
4002 inodeno_t ino = req->get_filepath().get_ino();
b3b6e05e
TL
4003 auto _ino = ino.val;
4004
4005 /* It's been observed [1] that a client may look up a private ~mdsdir inode.
4006 * I do not have an explanation for how that happened organically, but this
4007 * check ensures that the client can no longer do that.
4008 *
4009 * [1] https://tracker.ceph.com/issues/49922
4010 */
4011 if (MDS_IS_PRIVATE_INO(_ino)) {
4012 respond_to_request(mdr, -CEPHFS_ESTALE);
4013 return;
4014 }
4015
7c673cae
FG
4016 CInode *in = mdcache->get_inode(ino);
4017 if (in && in->state_test(CInode::STATE_PURGING)) {
f67539c2 4018 respond_to_request(mdr, -CEPHFS_ESTALE);
7c673cae
FG
4019 return;
4020 }
4021 if (!in) {
4022 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
4023 return;
4024 }
4025
7c673cae
FG
4026 // check for nothing (not read or write); this still applies the
4027 // path check.
4028 if (!check_access(mdr, in, 0))
4029 return;
4030
4031 CDentry *dn = in->get_projected_parent_dn();
4032 CInode *diri = dn ? dn->get_dir()->inode : NULL;
4033
11fdf7f2 4034 MutationImpl::LockOpVec lov;
7c673cae
FG
4035 if (dn && (want_parent || want_dentry)) {
4036 mdr->pin(dn);
11fdf7f2 4037 lov.add_rdlock(&dn->lock);
7c673cae
FG
4038 }
4039
11fdf7f2 4040 unsigned mask = req->head.args.lookupino.mask;
7c673cae
FG
4041 if (mask) {
4042 Capability *cap = in->get_client_cap(mdr->get_client());
4043 int issued = 0;
4044 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4045 issued = cap->issued();
9f95a23c 4046 // FIXME
7c673cae
FG
4047 // permission bits, ACL/security xattrs
4048 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
11fdf7f2 4049 lov.add_rdlock(&in->authlock);
7c673cae 4050 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
11fdf7f2 4051 lov.add_rdlock(&in->xattrlock);
7c673cae
FG
4052
4053 mdr->getattr_caps = mask;
4054 }
4055
11fdf7f2
TL
4056 if (!lov.empty()) {
4057 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
4058 return;
4059
d2e6a577
FG
4060 if (diri != NULL) {
4061 // need read access to directory inode
4062 if (!check_access(mdr, diri, MAY_READ))
4063 return;
4064 }
7c673cae
FG
4065 }
4066
4067 if (want_parent) {
4068 if (in->is_base()) {
f67539c2 4069 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4070 return;
4071 }
4072 if (!diri || diri->is_stray()) {
f67539c2 4073 respond_to_request(mdr, -CEPHFS_ESTALE);
7c673cae
FG
4074 return;
4075 }
4076 dout(10) << "reply to lookup_parent " << *in << dendl;
4077 mdr->tracei = diri;
4078 respond_to_request(mdr, 0);
4079 } else {
4080 if (want_dentry) {
4081 inodeno_t dirino = req->get_filepath2().get_ino();
4082 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
f67539c2 4083 respond_to_request(mdr, -CEPHFS_ENOENT);
7c673cae
FG
4084 return;
4085 }
4086 dout(10) << "reply to lookup_name " << *in << dendl;
4087 } else
4088 dout(10) << "reply to lookup_ino " << *in << dendl;
4089
4090 mdr->tracei = in;
4091 if (want_dentry)
4092 mdr->tracedn = dn;
4093 respond_to_request(mdr, 0);
4094 }
4095}
4096
11fdf7f2
TL
4097void Server::_lookup_snap_ino(MDRequestRef& mdr)
4098{
9f95a23c 4099 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2
TL
4100
4101 vinodeno_t vino;
4102 vino.ino = req->get_filepath().get_ino();
4103 vino.snapid = (__u64)req->head.args.lookupino.snapid;
4104 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
4105 __u32 hash = req->head.args.lookupino.hash;
4106
4107 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
4108
4109 CInode *in = mdcache->lookup_snap_inode(vino);
4110 if (!in) {
4111 in = mdcache->get_inode(vino.ino);
4112 if (in) {
4113 if (in->state_test(CInode::STATE_PURGING) ||
4114 !in->has_snap_data(vino.snapid)) {
4115 if (in->is_dir() || !parent_ino) {
f67539c2 4116 respond_to_request(mdr, -CEPHFS_ESTALE);
11fdf7f2
TL
4117 return;
4118 }
4119 in = NULL;
4120 }
4121 }
4122 }
4123
4124 if (in) {
4125 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
4126 mdr->snapid = vino.snapid;
4127 mdr->tracei = in;
4128 respond_to_request(mdr, 0);
4129 return;
4130 }
4131
4132 CInode *diri = NULL;
4133 if (parent_ino) {
4134 diri = mdcache->get_inode(parent_ino);
4135 if (!diri) {
b3b6e05e 4136 mdcache->open_ino(parent_ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
11fdf7f2
TL
4137 return;
4138 }
4139
4140 if (!diri->is_dir()) {
f67539c2 4141 respond_to_request(mdr, -CEPHFS_EINVAL);
11fdf7f2
TL
4142 return;
4143 }
4144
4145 MutationImpl::LockOpVec lov;
4146 lov.add_rdlock(&diri->dirfragtreelock);
4147 if (!mds->locker->acquire_locks(mdr, lov))
4148 return;
4149
4150 frag_t frag = diri->dirfragtree[hash];
4151 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
4152 if (!dir)
4153 return;
4154
4155 if (!dir->is_complete()) {
4156 if (dir->is_frozen()) {
4157 mds->locker->drop_locks(mdr.get());
4158 mdr->drop_local_auth_pins();
4159 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4160 return;
4161 }
4162 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4163 return;
4164 }
4165
f67539c2 4166 respond_to_request(mdr, -CEPHFS_ESTALE);
11fdf7f2 4167 } else {
b3b6e05e 4168 mdcache->open_ino(vino.ino, mds->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
11fdf7f2
TL
4169 }
4170}
4171
7c673cae
FG
4172void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4173{
4174 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4175 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4176
4177 // `r` is a rank if >=0, else an error code
4178 if (r >= 0) {
4179 mds_rank_t dest_rank(r);
4180 if (dest_rank == mds->get_nodeid())
4181 dispatch_client_request(mdr);
4182 else
4183 mdcache->request_forward(mdr, dest_rank);
4184 return;
4185 }
4186
4187 // give up
f67539c2
TL
4188 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
4189 r = -CEPHFS_ESTALE;
7c673cae
FG
4190 respond_to_request(mdr, r);
4191}
4192
4193
4194/* This function takes responsibility for the passed mdr*/
4195void Server::handle_client_open(MDRequestRef& mdr)
4196{
9f95a23c 4197 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
4198 dout(7) << "open on " << req->get_filepath() << dendl;
4199
4200 int flags = req->head.args.open.flags;
4201 int cmode = ceph_flags_to_mode(flags);
4202 if (cmode < 0) {
f67539c2 4203 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4204 return;
4205 }
4206
181888fb
FG
4207 bool need_auth = !file_mode_is_readonly(cmode) ||
4208 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
7c673cae
FG
4209
4210 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4211 dout(7) << "read-only FS" << dendl;
f67539c2 4212 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
4213 return;
4214 }
4215
9f95a23c 4216 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
7c673cae
FG
4217 if (!cur)
4218 return;
4219
4220 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
11fdf7f2 4221 ceph_assert(!need_auth);
9f95a23c
TL
4222 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4223 cur = rdlock_path_pin_ref(mdr, true); // reuse the outer cur rather than shadowing it
7c673cae
FG
4224 if (!cur)
4225 return;
4226 }
4227
f67539c2 4228 if (!cur->is_file()) {
7c673cae
FG
4229 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4230 cmode = CEPH_FILE_MODE_PIN;
4231 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
f67539c2 4232 if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
7c673cae
FG
4233 flags &= ~CEPH_O_TRUNC;
4234 }
4235
4236 dout(10) << "open flags = " << flags
4237 << ", filemode = " << cmode
4238 << ", need_auth = " << need_auth
4239 << dendl;
4240
4241 // regular file?
4242 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4243 dout(7) << "not a file or dir " << *cur << dendl;
f67539c2 4244 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
7c673cae
FG
4245 return;
4246 }*/
f67539c2 4247 if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
7c673cae 4248 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
f67539c2 4249 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4250 return;
4251 }
4252
f67539c2 4253 if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
7c673cae 4254 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
f67539c2
TL
4255 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4256 respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
7c673cae
FG
4257 return;
4258 }
4259
f67539c2 4260 if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
11fdf7f2 4261 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
7c673cae 4262 dout(7) << "old client cannot open inline data file " << *cur << dendl;
f67539c2 4263 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
4264 return;
4265 }
4266
4267 // snapped data is read only
4268 if (mdr->snapid != CEPH_NOSNAP &&
4269 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4270 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
f67539c2 4271 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
4272 return;
4273 }
4274
9f95a23c
TL
4275 MutationImpl::LockOpVec lov;
4276
7c673cae
FG
4277 unsigned mask = req->head.args.open.mask;
4278 if (mask) {
4279 Capability *cap = cur->get_client_cap(mdr->get_client());
4280 int issued = 0;
4281 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4282 issued = cap->issued();
4283 // permission bits, ACL/security xattrs
4284 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
11fdf7f2 4285 lov.add_rdlock(&cur->authlock);
7c673cae 4286 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
11fdf7f2 4287 lov.add_rdlock(&cur->xattrlock);
7c673cae
FG
4288
4289 mdr->getattr_caps = mask;
4290 }
4291
4292 // O_TRUNC
4293 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
11fdf7f2 4294 ceph_assert(cur->is_auth());
7c673cae 4295
11fdf7f2
TL
4296 lov.add_xlock(&cur->filelock);
4297 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
4298 return;
4299
4300 if (!check_access(mdr, cur, MAY_WRITE))
4301 return;
4302
4303 // wait for pending truncate?
f67539c2 4304 const auto& pi = cur->get_projected_inode();
7c673cae
FG
4305 if (pi->is_truncating()) {
4306 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4307 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4308 mds->locker->drop_locks(mdr.get());
4309 mdr->drop_local_auth_pins();
4310 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4311 return;
4312 }
4313
4314 do_open_truncate(mdr, cmode);
4315 return;
4316 }
4317
4318 // sync filelock if snapped.
4319 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4320 // and that data itself is flushed so that we can read the snapped data off disk.
4321 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
11fdf7f2 4322 lov.add_rdlock(&cur->filelock);
7c673cae
FG
4323 }
4324
11fdf7f2 4325 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
4326 return;
4327
4328 mask = MAY_READ;
4329 if (cmode & CEPH_FILE_MODE_WR)
4330 mask |= MAY_WRITE;
4331 if (!check_access(mdr, cur, mask))
4332 return;
4333
28e407b8
AA
4334 utime_t now = ceph_clock_now();
4335 mdr->set_mds_stamp(now);
4336
7c673cae
FG
4337 if (cur->is_file() || cur->is_dir()) {
4338 if (mdr->snapid == CEPH_NOSNAP) {
4339 // register new cap
9f95a23c 4340 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
7c673cae
FG
4341 if (cap)
4342 dout(12) << "open issued caps " << ccap_string(cap->pending())
4343 << " for " << req->get_source()
4344 << " on " << *cur << dendl;
4345 } else {
4346 int caps = ceph_caps_for_mode(cmode);
4347 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4348 << " for " << req->get_source()
4349 << " snapid " << mdr->snapid
4350 << " on " << *cur << dendl;
4351 mdr->snap_caps = caps;
4352 }
4353 }
4354
4355 // increase max_size?
4356 if (cmode & CEPH_FILE_MODE_WR)
4357 mds->locker->check_inode_max_size(cur);
4358
4359 // make sure this inode gets into the journal
4360 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
11fdf7f2 4361 mdcache->open_file_table.should_log_open(cur)) {
7c673cae
FG
4362 EOpen *le = new EOpen(mds->mdlog);
4363 mdlog->start_entry(le);
4364 le->add_clean_inode(cur);
7c673cae
FG
4365 mdlog->submit_entry(le);
4366 }
4367
4368 // hit pop
4369 if (cmode & CEPH_FILE_MODE_WR)
11fdf7f2 4370 mds->balancer->hit_inode(cur, META_POP_IWR);
7c673cae 4371 else
11fdf7f2 4372 mds->balancer->hit_inode(cur, META_POP_IRD,
7c673cae
FG
4373 mdr->client_request->get_source().num());
4374
4375 CDentry *dn = 0;
4376 if (req->get_dentry_wanted()) {
11fdf7f2 4377 ceph_assert(mdr->dn[0].size());
7c673cae
FG
4378 dn = mdr->dn[0].back();
4379 }
4380
4381 mdr->tracei = cur;
4382 mdr->tracedn = dn;
4383 respond_to_request(mdr, 0);
4384}
4385
4386class C_MDS_openc_finish : public ServerLogContext {
4387 CDentry *dn;
4388 CInode *newi;
7c673cae 4389public:
11fdf7f2
TL
4390 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4391 ServerLogContext(s, r), dn(d), newi(ni) {}
7c673cae 4392 void finish(int r) override {
11fdf7f2 4393 ceph_assert(r == 0);
7c673cae
FG
4394
4395 dn->pop_projected_linkage();
4396
4397 // dirty inode, dn, dir
f67539c2 4398 newi->mark_dirty(mdr->ls);
28e407b8 4399 newi->mark_dirty_parent(mdr->ls, true);
7c673cae
FG
4400
4401 mdr->apply();
4402
4403 get_mds()->locker->share_inode_max_size(newi);
4404
4405 MDRequestRef null_ref;
4406 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4407
11fdf7f2 4408 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
7c673cae
FG
4409
4410 server->respond_to_request(mdr, 0);
4411
11fdf7f2 4412 ceph_assert(g_conf()->mds_kill_openc_at != 1);
7c673cae
FG
4413 }
4414};
4415
4416/* This function takes responsibility for the passed mdr*/
4417void Server::handle_client_openc(MDRequestRef& mdr)
4418{
9f95a23c 4419 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
4420 client_t client = mdr->get_client();
4421
4422 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4423
4424 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4425 if (cmode < 0) {
f67539c2 4426 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4427 return;
4428 }
4429
c07f9fc5 4430 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
9f95a23c
TL
4431 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
4432 if (!dn)
4433 return;
c07f9fc5 4434
9f95a23c
TL
4435 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4436 if (!excl && !dnl->is_null()) {
4437 // it existed.
4438 mds->locker->xlock_downgrade(&dn->lock, mdr.get());
4439
4440 MutationImpl::LockOpVec lov;
4441 lov.add_rdlock(&dnl->get_inode()->snaplock);
4442 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae 4443 return;
7c673cae 4444
9f95a23c 4445 handle_client_open(mdr);
7c673cae
FG
4446 return;
4447 }
9f95a23c
TL
4448
4449 ceph_assert(dnl->is_null());
4450
f67539c2
TL
4451 if (req->get_alternate_name().size() > alternate_name_max) {
4452 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
4453 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
4454 return;
4455 }
4456 dn->set_alternate_name(req->get_alternate_name());
4457
7c673cae
FG
4458 // set layout
4459 file_layout_t layout;
9f95a23c
TL
4460 if (mdr->dir_layout != file_layout_t())
4461 layout = mdr->dir_layout;
7c673cae
FG
4462 else
4463 layout = mdcache->default_file_layout;
4464
4465 // What kind of client caps are required to complete this operation
4466 uint64_t access = MAY_WRITE;
4467
4468 const auto default_layout = layout;
4469
4470 // fill in any special params from client
4471 if (req->head.args.open.stripe_unit)
4472 layout.stripe_unit = req->head.args.open.stripe_unit;
4473 if (req->head.args.open.stripe_count)
4474 layout.stripe_count = req->head.args.open.stripe_count;
4475 if (req->head.args.open.object_size)
4476 layout.object_size = req->head.args.open.object_size;
4477 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4478 (__s32)req->head.args.open.pool >= 0) {
4479 layout.pool_id = req->head.args.open.pool;
4480
4481 // make sure we have as new a map as the client
4482 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4483 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4484 return;
4485 }
4486 }
4487
4488 // If client doesn't have capability to modify layout pools, then
4489 // only permit this request if the requested pool matches what the
4490 // file would have inherited anyway from its parent.
4491 if (default_layout != layout) {
4492 access |= MAY_SET_VXATTR;
4493 }
4494
4495 if (!layout.is_valid()) {
4496 dout(10) << " invalid initial file layout" << dendl;
f67539c2 4497 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4498 return;
4499 }
4500 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4501 dout(10) << " invalid data pool " << layout.pool_id << dendl;
f67539c2 4502 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
4503 return;
4504 }
4505
c07f9fc5 4506 // created null dn.
7c673cae
FG
4507 CDir *dir = dn->get_dir();
4508 CInode *diri = dir->get_inode();
7c673cae
FG
4509 if (!check_access(mdr, diri, access))
4510 return;
7c673cae
FG
4511 if (!check_fragment_space(mdr, dir))
4512 return;
20effc67
TL
4513 if (!check_dir_max_entries(mdr, dir))
4514 return;
7c673cae 4515
9f95a23c
TL
4516 if (mdr->dn[0].size() == 1)
4517 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
7c673cae 4518
7c673cae 4519 // create inode.
f67539c2
TL
4520 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4521 req->head.args.open.mode | S_IFREG, &layout);
4522 ceph_assert(newi);
7c673cae
FG
4523
4524 // it's a file.
f67539c2 4525 dn->push_projected_linkage(newi);
7c673cae 4526
f67539c2
TL
4527 auto _inode = newi->_get_inode();
4528 _inode->version = dn->pre_dirty();
7c673cae 4529 if (layout.pool_id != mdcache->default_file_layout.pool_id)
f67539c2
TL
4530 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
4531 _inode->update_backtrace();
4532 _inode->rstat.rfiles = 1;
4533 _inode->accounted_rstat = _inode->rstat;
a8e16298
TL
4534
4535 SnapRealm *realm = diri->find_snaprealm();
11fdf7f2
TL
4536 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4537 ceph_assert(follows >= realm->get_newest_seq());
a8e16298
TL
4538
4539 ceph_assert(dn->first == follows+1);
f67539c2 4540 newi->first = dn->first;
a8e16298
TL
4541
4542 // do the open
f67539c2
TL
4543 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
4544 newi->authlock.set_state(LOCK_EXCL);
4545 newi->xattrlock.set_state(LOCK_EXCL);
a8e16298
TL
4546
4547 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
f67539c2
TL
4548 _inode->client_ranges[client].range.first = 0;
4549 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
4550 _inode->client_ranges[client].follows = follows;
4551 newi->mark_clientwriteable();
a8e16298 4552 cap->mark_clientwriteable();
7c673cae 4553 }
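// The initial writable range covers exactly one stripe unit; the Locker
// grows max_size later (check_inode_max_size / share_inode_max_size) as the
// client's writes approach the current limit.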
7c673cae
FG
4554
4555 // prepare finisher
4556 mdr->ls = mdlog->get_current_segment();
4557 EUpdate *le = new EUpdate(mdlog, "openc");
4558 mdlog->start_entry(le);
4559 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4560 journal_allocated_inos(mdr, &le->metablob);
f67539c2
TL
4561 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4562 le->metablob.add_primary_dentry(dn, newi, true, true, true);
7c673cae 4563
7c673cae 4564 // make sure this inode gets into the journal
f67539c2 4565 le->metablob.add_opened_ino(newi->ino());
7c673cae 4566
f67539c2 4567 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
7c673cae 4568
9f95a23c
TL
4569 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4570 openc_response_t ocresp;
4571
4572 dout(10) << "adding created_ino and delegated_inos" << dendl;
f67539c2 4573 ocresp.created_ino = _inode->ino;
9f95a23c
TL
4574
4575 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4576 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4577 unsigned frac = 100 / delegate_inos_pct;
4578 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4579 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4580 }
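// Worked example (values hypothetical): with delegate_inos_pct = 10 and
// mds_client_prealloc_inos = 1000, frac = 100/10 = 10, so once the session
// holds fewer than 1000/10/2 = 50 delegated inos it is topped up with
// another 1000/10 = 100.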
4581
4582 encode(ocresp, mdr->reply_extra_bl);
4583 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
7c673cae
FG
4584 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4585 // add the file created flag onto the reply if create_flags features is supported
f67539c2 4586 encode(newi->ino(), mdr->reply_extra_bl);
7c673cae
FG
4587 }
4588
f67539c2 4589 journal_and_reply(mdr, newi, dn, le, fin);
7c673cae
FG
4590
4591 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4592 // have overshot the split size (multiple opencs in flight), so here is
4593 // an early chance to split the dir if this openc makes it oversized.
4594 mds->balancer->maybe_fragment(dir, false);
4595}
4596
4597
4598
4599void Server::handle_client_readdir(MDRequestRef& mdr)
4600{
9f95a23c 4601 const cref_t<MClientRequest> &req = mdr->client_request;
adb31ebb 4602 Session *session = mds->get_session(req);
7c673cae 4603 client_t client = req->get_source().num();
11fdf7f2 4604 MutationImpl::LockOpVec lov;
9f95a23c 4605 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
7c673cae
FG
4606 if (!diri) return;
4607
4608 // it's a directory, right?
4609 if (!diri->is_dir()) {
4610 // not a dir
f67539c2
TL
4611 dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
4612 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
4613 return;
4614 }
4615
adb31ebb
TL
4616 auto num_caps = session->get_num_caps();
4617 auto session_cap_acquisition = session->get_cap_acquisition();
4618
4619 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
4620 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
4621 << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
4622 if (logger)
4623 logger->inc(l_mdss_cap_acquisition_throttle);
4624
4625 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
4626 return;
4627 }
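// Illustration (numbers hypothetical): with max_caps_per_client = 1048576
// and max_caps_throttle_ratio = 0.8, a session already holding more than
// ~838860 caps whose recent acquisition count has reached
// cap_acquisition_throttle is asked to retry after
// caps_throttle_retry_request_timeout rather than being served immediately.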
4628
11fdf7f2
TL
4629 lov.add_rdlock(&diri->filelock);
4630 lov.add_rdlock(&diri->dirfragtreelock);
7c673cae 4631
11fdf7f2 4632 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
4633 return;
4634
4635 if (!check_access(mdr, diri, MAY_READ))
4636 return;
4637
4638 // which frag?
4639 frag_t fg = (__u32)req->head.args.readdir.frag;
4640 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4641 string offset_str = req->get_path2();
4642
4643 __u32 offset_hash = 0;
4644 if (!offset_str.empty())
4645 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4646 else
4647 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4648
4649 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4650 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4651
4652 // does the frag exist?
4653 if (diri->dirfragtree[fg.value()] != fg) {
4654 frag_t newfg;
4655 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4656 if (fg.contains((unsigned)offset_hash)) {
4657 newfg = diri->dirfragtree[offset_hash];
4658 } else {
4659 // client actually wants next frag
4660 newfg = diri->dirfragtree[fg.value()];
4661 }
4662 } else {
4663 offset_str.clear();
4664 newfg = diri->dirfragtree[fg.value()];
4665 }
4666 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4667 fg = newfg;
4668 }
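// Example: the client cached frag 10* before it was split into 100* and
// 101*. With BITFLAGS we resume in the leaf frag that now contains
// offset_hash; without BITFLAGS we clear the offset and restart from the
// leaf containing the old frag's value bits.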
4669
4670 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4671 if (!dir) return;
4672
4673 // ok!
4674 dout(10) << "handle_client_readdir on " << *dir << dendl;
11fdf7f2 4675 ceph_assert(dir->is_auth());
7c673cae
FG
4676
4677 if (!dir->is_complete()) {
4678 if (dir->is_frozen()) {
4679 dout(7) << "dir is frozen " << *dir << dendl;
4680 mds->locker->drop_locks(mdr.get());
4681 mdr->drop_local_auth_pins();
4682 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4683 return;
4684 }
4685 // fetch
4686 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4687 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4688 return;
4689 }
4690
4691#ifdef MDS_VERIFY_FRAGSTAT
4692 dir->verify_fragstat();
4693#endif
4694
4695 utime_t now = ceph_clock_now();
4696 mdr->set_mds_stamp(now);
4697
4698 snapid_t snapid = mdr->snapid;
4699 dout(10) << "snapid " << snapid << dendl;
4700
4701 SnapRealm *realm = diri->find_snaprealm();
4702
4703 unsigned max = req->head.args.readdir.max_entries;
4704 if (!max)
4705 max = dir->get_num_any(); // whatever, something big.
4706 unsigned max_bytes = req->head.args.readdir.max_bytes;
4707 if (!max_bytes)
4708 // make sure at least one item can be encoded
11fdf7f2 4709 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
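// (512 << 10) is a 512 KiB floor; adding mds_max_xattr_pairs_size leaves
// room for one inode carrying a maximal xattr blob.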
7c673cae
FG
4710
4711 // start final blob
4712 bufferlist dirbl;
11fdf7f2
TL
4713 DirStat ds;
4714 ds.frag = dir->get_frag();
4715 ds.auth = dir->get_dir_auth().first;
f91f0fd5 4716 if (dir->is_auth() && !forward_all_requests_to_auth)
11fdf7f2
TL
4717 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4718
4719 dir->encode_dirstat(dirbl, mdr->session->info, ds);
7c673cae
FG
4720
4721 // count bytes available.
4722 // this isn't perfect, but we should capture the main variable/unbounded size items!
4723 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4724 int bytes_left = max_bytes - front_bytes;
4725 bytes_left -= realm->get_snap_trace().length();
4726
4727 // build dir contents
4728 bufferlist dnbl;
4729 __u32 numfiles = 0;
4730 bool start = !offset_hash && offset_str.empty();
7c673cae
FG
4731 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4732 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
181888fb
FG
4733 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4734 bool end = (it == dir->end());
4735 for (; !end && numfiles < max; end = (it == dir->end())) {
7c673cae
FG
4736 CDentry *dn = it->second;
4737 ++it;
4738
4739 if (dn->state_test(CDentry::STATE_PURGING))
4740 continue;
4741
4742 bool dnp = dn->use_projected(client, mdr);
4743 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4744
4745 if (dnl->is_null())
4746 continue;
4747
4748 if (dn->last < snapid || dn->first > snapid) {
4749 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4750 continue;
4751 }
4752
4753 if (!start) {
4754 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4755 if (!(offset_key < dn->key()))
4756 continue;
4757 }
4758
4759 CInode *in = dnl->get_inode();
4760
4761 if (in && in->ino() == CEPH_INO_CEPH)
4762 continue;
4763
4764 // remote link?
4765 // better for the MDS to do the work, if we think the client will stat any of these files.
4766 if (dnl->is_remote() && !in) {
4767 in = mdcache->get_inode(dnl->get_remote_ino());
4768 if (in) {
4769 dn->link_remote(dnl, in);
4770 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4771 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4772 continue;
4773 } else {
4774 // touch everything i _do_ have
94b18763
FG
4775 for (auto &p : *dir) {
4776 if (!p.second->get_linkage()->is_null())
4777 mdcache->lru.lru_touch(p.second);
4778 }
7c673cae
FG
4779
4780 // already issued caps and leases, reply immediately.
4781 if (dnbl.length() > 0) {
4782 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4783 dout(10) << " open remote dentry after caps were issued, stopping at "
4784 << dnbl.length() << " < " << bytes_left << dendl;
4785 break;
4786 }
4787
4788 mds->locker->drop_locks(mdr.get());
4789 mdr->drop_local_auth_pins();
4790 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4791 return;
4792 }
4793 }
11fdf7f2 4794 ceph_assert(in);
7c673cae 4795
94b18763 4796 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
7c673cae
FG
4797 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4798 break;
4799 }
4800
4801 unsigned start_len = dnbl.length();
4802
4803 // dentry
4804 dout(12) << "including dn " << *dn << dendl;
11fdf7f2 4805 encode(dn->get_name(), dnbl);
2a845540 4806 mds->locker->issue_client_lease(dn, in, mdr, now, dnbl);
7c673cae
FG
4807
4808 // inode
4809 dout(12) << "including inode " << *in << dendl;
4810 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4811 if (r < 0) {
4812 // chop off dn->name, lease
4813 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4814 bufferlist keep;
4815 keep.substr_of(dnbl, 0, start_len);
4816 dnbl.swap(keep);
4817 break;
4818 }
11fdf7f2 4819 ceph_assert(r >= 0);
7c673cae
FG
4820 numfiles++;
4821
4822 // touch dn
4823 mdcache->lru.lru_touch(dn);
4824 }
4825
adb31ebb
TL
4826 session->touch_readdir_cap(numfiles);
4827
7c673cae
FG
4828 __u16 flags = 0;
4829 if (end) {
4830 flags = CEPH_READDIR_FRAG_END;
4831 if (start)
4832 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4833 }
4834 // clients without CEPH_READDIR_REPLY_BITFLAGS only understand the END and COMPLETE flags
4835 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4836 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4837 }
4838
4839 // finish final blob
11fdf7f2
TL
4840 encode(numfiles, dirbl);
4841 encode(flags, dirbl);
7c673cae
FG
4842 dirbl.claim_append(dnbl);
4843
4844 // yay, reply
4845 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4846 << " bytes=" << dirbl.length()
4847 << " start=" << (int)start
4848 << " end=" << (int)end
4849 << dendl;
4850 mdr->reply_extra_bl = dirbl;
4851
4852 // bump popularity. NOTE: this doesn't quite capture it.
522d829b 4853 mds->balancer->hit_dir(dir, META_POP_READDIR, -1, numfiles);
7c673cae
FG
4854
4855 // reply
4856 mdr->tracei = diri;
4857 respond_to_request(mdr, 0);
4858}
4859
4860
4861
4862// ===============================================================================
4863// INODE UPDATES
4864
4865
4866/*
4867 * finisher for basic inode updates
4868 */
4869class C_MDS_inode_update_finish : public ServerLogContext {
4870 CInode *in;
adb31ebb 4871 bool truncating_smaller, changed_ranges, adjust_realm;
7c673cae
FG
4872public:
4873 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
adb31ebb 4874 bool sm=false, bool cr=false, bool ar=false) :
11fdf7f2 4875 ServerLogContext(s, r), in(i),
adb31ebb 4876 truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
7c673cae 4877 void finish(int r) override {
11fdf7f2 4878 ceph_assert(r == 0);
7c673cae 4879
adb31ebb
TL
4880 int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);
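// UPDATE if the inode already has its own snaprealm; SPLIT if this change
// (e.g. enabling a quota) is carving a new realm out of the parent's.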
4881
7c673cae 4882 // apply
7c673cae
FG
4883 mdr->apply();
4884
11fdf7f2
TL
4885 MDSRank *mds = get_mds();
4886
7c673cae 4887 // notify any clients
f67539c2 4888 if (truncating_smaller && in->get_inode()->is_truncating()) {
11fdf7f2
TL
4889 mds->locker->issue_truncate(in);
4890 mds->mdcache->truncate_inode(in, mdr->ls);
4891 }
4892
adb31ebb
TL
4893 if (adjust_realm) {
4894 mds->mdcache->send_snap_update(in, 0, snap_op);
4895 mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
7c673cae
FG
4896 }
4897
11fdf7f2 4898 get_mds()->balancer->hit_inode(in, META_POP_IWR);
7c673cae
FG
4899
4900 server->respond_to_request(mdr, 0);
4901
4902 if (changed_ranges)
4903 get_mds()->locker->share_inode_max_size(in);
4904 }
4905};
4906
4907void Server::handle_client_file_setlock(MDRequestRef& mdr)
4908{
9f95a23c 4909 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 4910 MutationImpl::LockOpVec lov;
7c673cae
FG
4911
4912 // get the inode to operate on, and set up any locks needed for that
9f95a23c 4913 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4914 if (!cur)
4915 return;
4916
11fdf7f2 4917 lov.add_xlock(&cur->flocklock);
7c673cae
FG
4918 /* acquire_locks will return true if it gets the locks. If it fails,
4919 it will requeue this request for retry later, so just return here.
4920 */
11fdf7f2 4921 if (!mds->locker->acquire_locks(mdr, lov)) {
7c673cae
FG
4922 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4923 return;
4924 }
4925
4926 // copy the lock change into a ceph_filelock so we can store/apply it
4927 ceph_filelock set_lock;
4928 set_lock.start = req->head.args.filelock_change.start;
4929 set_lock.length = req->head.args.filelock_change.length;
4930 set_lock.client = req->get_orig_source().num();
4931 set_lock.owner = req->head.args.filelock_change.owner;
4932 set_lock.pid = req->head.args.filelock_change.pid;
4933 set_lock.type = req->head.args.filelock_change.type;
4934 bool will_wait = req->head.args.filelock_change.wait;
4935
4936 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4937
4938 ceph_lock_state_t *lock_state = NULL;
4939 bool interrupt = false;
4940
4941 // get the appropriate lock state
4942 switch (req->head.args.filelock_change.rule) {
4943 case CEPH_LOCK_FLOCK_INTR:
4944 interrupt = true;
4945 // fall-thru
4946 case CEPH_LOCK_FLOCK:
4947 lock_state = cur->get_flock_lock_state();
4948 break;
4949
4950 case CEPH_LOCK_FCNTL_INTR:
4951 interrupt = true;
4952 // fall-thru
4953 case CEPH_LOCK_FCNTL:
4954 lock_state = cur->get_fcntl_lock_state();
4955 break;
4956
4957 default:
4958 dout(10) << "got unknown lock type " << set_lock.type
4959 << ", dropping request!" << dendl;
f67539c2 4960 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
7c673cae
FG
4961 return;
4962 }
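// Rule summary: CEPH_LOCK_FLOCK covers BSD flock(2)-style whole-file locks,
// CEPH_LOCK_FCNTL covers POSIX fcntl(2)/F_SETLK byte-range locks, and the
// *_INTR variants cancel a blocked attempt (e.g. the client was signalled
// while sleeping on a blocking lock).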
4963
4964 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4965 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4966 list<ceph_filelock> activated_locks;
11fdf7f2 4967 MDSContext::vec waiters;
7c673cae
FG
4968 if (lock_state->is_waiting(set_lock)) {
4969 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4970 lock_state->remove_waiting(set_lock);
4971 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4972 } else if (!interrupt) {
4973 dout(10) << " unlock attempt on " << set_lock << dendl;
4974 lock_state->remove_lock(set_lock, activated_locks);
4975 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4976 }
4977 mds->queue_waiters(waiters);
4978
4979 respond_to_request(mdr, 0);
4980 } else {
4981 dout(10) << " lock attempt on " << set_lock << dendl;
4982 bool deadlock = false;
4983 if (mdr->more()->flock_was_waiting &&
4984 !lock_state->is_waiting(set_lock)) {
4985 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
f67539c2 4986 respond_to_request(mdr, -CEPHFS_EINTR);
7c673cae
FG
4987 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4988 dout(10) << " it failed on this attempt" << dendl;
4989 // couldn't set lock right now
4990 if (deadlock) {
f67539c2 4991 respond_to_request(mdr, -CEPHFS_EDEADLK);
7c673cae 4992 } else if (!will_wait) {
f67539c2 4993 respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
7c673cae
FG
4994 } else {
4995 dout(10) << " added to waiting list" << dendl;
11fdf7f2 4996 ceph_assert(lock_state->is_waiting(set_lock));
7c673cae
FG
4997 mdr->more()->flock_was_waiting = true;
4998 mds->locker->drop_locks(mdr.get());
4999 mdr->drop_local_auth_pins();
1adf2230
AA
5000 mdr->mark_event("failed to add lock, waiting");
5001 mdr->mark_nowarn();
7c673cae
FG
5002 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
5003 }
5004 } else
5005 respond_to_request(mdr, 0);
5006 }
5007 dout(10) << " state after lock change: " << *lock_state << dendl;
5008}
5009
5010void Server::handle_client_file_readlock(MDRequestRef& mdr)
5011{
9f95a23c 5012 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 5013 MutationImpl::LockOpVec lov;
7c673cae
FG
5014
5015 // get the inode to operate on, and set up any locks needed for that
9f95a23c 5016 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
5017 if (!cur)
5018 return;
5019
5020 /* acquire_locks will return true if it gets the locks. If it fails,
5021 it will requeue this request for retry later, so just return here.
5022 */
11fdf7f2
TL
5023 lov.add_rdlock(&cur->flocklock);
5024 if (!mds->locker->acquire_locks(mdr, lov)) {
7c673cae
FG
5025 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
5026 return;
5027 }
5028
5029 // copy the lock change into a ceph_filelock so we can store/apply it
5030 ceph_filelock checking_lock;
5031 checking_lock.start = req->head.args.filelock_change.start;
5032 checking_lock.length = req->head.args.filelock_change.length;
5033 checking_lock.client = req->get_orig_source().num();
5034 checking_lock.owner = req->head.args.filelock_change.owner;
5035 checking_lock.pid = req->head.args.filelock_change.pid;
5036 checking_lock.type = req->head.args.filelock_change.type;
5037
5038 // get the appropriate lock state
5039 ceph_lock_state_t *lock_state = NULL;
5040 switch (req->head.args.filelock_change.rule) {
5041 case CEPH_LOCK_FLOCK:
5042 lock_state = cur->get_flock_lock_state();
5043 break;
5044
5045 case CEPH_LOCK_FCNTL:
5046 lock_state = cur->get_fcntl_lock_state();
5047 break;
5048
5049 default:
5050 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
f67539c2 5051 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5052 return;
5053 }
5054 lock_state->look_for_lock(checking_lock);
5055
5056 bufferlist lock_bl;
11fdf7f2 5057 encode(checking_lock, lock_bl);
7c673cae
FG
5058
5059 mdr->reply_extra_bl = lock_bl;
5060 respond_to_request(mdr, 0);
5061}
5062
5063void Server::handle_client_setattr(MDRequestRef& mdr)
5064{
9f95a23c 5065 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 5066 MutationImpl::LockOpVec lov;
9f95a23c 5067 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
5068 if (!cur) return;
5069
5070 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 5071 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
5072 return;
5073 }
5074 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
f67539c2 5075 respond_to_request(mdr, -CEPHFS_EPERM);
7c673cae
FG
5076 return;
5077 }
5078
5079 __u32 mask = req->head.args.setattr.mask;
5080 __u32 access_mask = MAY_WRITE;
5081
5082 // xlock inode
5083 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
11fdf7f2 5084 lov.add_xlock(&cur->authlock);
7c673cae 5085 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
11fdf7f2 5086 lov.add_xlock(&cur->filelock);
7c673cae 5087 if (mask & CEPH_SETATTR_CTIME)
11fdf7f2 5088 lov.add_wrlock(&cur->versionlock);
7c673cae 5089
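// Ownership/mode changes xlock the authlock, size/time changes the filelock,
// and a bare ctime update only wrlocks the versionlock; taking the narrowest
// set keeps unrelated setattrs from serializing on each other.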
11fdf7f2 5090 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5091 return;
5092
f67539c2 5093 if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
7c673cae
FG
5094 access_mask |= MAY_CHOWN;
5095
f67539c2 5096 if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
7c673cae
FG
5097 access_mask |= MAY_CHGRP;
5098
5099 if (!check_access(mdr, cur, access_mask))
5100 return;
5101
5102 // trunc from bigger -> smaller?
f67539c2 5103 const auto& pip = cur->get_projected_inode();
7c673cae 5104
94b18763 5105 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
7c673cae 5106
f67539c2 5107 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
7c673cae 5108 if (is_full && req->head.args.setattr.size > old_size) {
f67539c2
TL
5109 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
5110 respond_to_request(mdr, -CEPHFS_ENOSPC);
7c673cae
FG
5111 return;
5112 }
5113
5114 bool truncating_smaller = false;
5115 if (mask & CEPH_SETATTR_SIZE) {
5116 truncating_smaller = req->head.args.setattr.size < old_size;
94b18763
FG
5117 if (truncating_smaller && pip->is_truncating()) {
5118 dout(10) << " waiting for pending truncate from " << pip->truncate_from
5119 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
7c673cae
FG
5120 mds->locker->drop_locks(mdr.get());
5121 mdr->drop_local_auth_pins();
5122 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
5123 return;
5124 }
5125 }
5126
5127 bool changed_ranges = false;
5128
5129 // project update
5130 mdr->ls = mdlog->get_current_segment();
5131 EUpdate *le = new EUpdate(mdlog, "setattr");
5132 mdlog->start_entry(le);
5133
f67539c2 5134 auto pi = cur->project_inode(mdr);
7c673cae
FG
5135
5136 if (mask & CEPH_SETATTR_UID)
f67539c2 5137 pi.inode->uid = req->head.args.setattr.uid;
7c673cae 5138 if (mask & CEPH_SETATTR_GID)
f67539c2 5139 pi.inode->gid = req->head.args.setattr.gid;
7c673cae
FG
5140
5141 if (mask & CEPH_SETATTR_MODE)
f67539c2 5142 pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
7c673cae 5143 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
f67539c2
TL
5144 S_ISREG(pi.inode->mode) &&
5145 (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
5146 pi.inode->mode &= ~(S_ISUID|S_ISGID);
7c673cae
FG
5147 }
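// Mirrors local-filesystem chown semantics: changing the ownership of an
// executable regular file clears S_ISUID/S_ISGID. Note that only the 07777
// permission bits come from the client; the S_IFMT type bits are preserved.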
5148
5149 if (mask & CEPH_SETATTR_MTIME)
f67539c2 5150 pi.inode->mtime = req->head.args.setattr.mtime;
7c673cae 5151 if (mask & CEPH_SETATTR_ATIME)
f67539c2 5152 pi.inode->atime = req->head.args.setattr.atime;
7c673cae 5153 if (mask & CEPH_SETATTR_BTIME)
f67539c2 5154 pi.inode->btime = req->head.args.setattr.btime;
7c673cae 5155 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
f67539c2 5156 pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
7c673cae
FG
5157 if (mask & CEPH_SETATTR_SIZE) {
5158 if (truncating_smaller) {
f67539c2 5159 pi.inode->truncate(old_size, req->head.args.setattr.size);
7c673cae
FG
5160 le->metablob.add_truncate_start(cur->ino());
5161 } else {
f67539c2
TL
5162 pi.inode->size = req->head.args.setattr.size;
5163 pi.inode->rstat.rbytes = pi.inode->size;
7c673cae 5164 }
f67539c2 5165 pi.inode->mtime = mdr->get_op_stamp();
7c673cae
FG
5166
5167 // adjust client's max_size?
f67539c2 5168 if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
f91f0fd5 5169 dout(10) << " client_ranges " << cur->get_previous_projected_inode()->client_ranges
f67539c2 5170 << " -> " << pi.inode->client_ranges << dendl;
7c673cae
FG
5171 changed_ranges = true;
5172 }
5173 }
5174
f67539c2
TL
5175 pi.inode->version = cur->pre_dirty();
5176 pi.inode->ctime = mdr->get_op_stamp();
5177 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5178 pi.inode->rstat.rctime = mdr->get_op_stamp();
5179 pi.inode->change_attr++;
7c673cae
FG
5180
5181 // log + wait
5182 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5183 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5184 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5185
5186 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5187 truncating_smaller, changed_ranges));
5188
5189 // flush immediately if there are readers/writers waiting
11fdf7f2 5190 if (mdr->is_xlocked(&cur->filelock) &&
7c673cae
FG
5191 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5192 mds->mdlog->flush();
5193}
5194
5195/* Takes responsibility for mdr */
5196void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5197{
5198 CInode *in = mdr->in[0];
5199 client_t client = mdr->get_client();
11fdf7f2 5200 ceph_assert(in);
7c673cae
FG
5201
5202 dout(10) << "do_open_truncate " << *in << dendl;
5203
5204 SnapRealm *realm = in->find_snaprealm();
9f95a23c 5205 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
7c673cae
FG
5206
5207 mdr->ls = mdlog->get_current_segment();
5208 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5209 mdlog->start_entry(le);
5210
5211 // prepare
f67539c2
TL
5212 auto pi = in->project_inode(mdr);
5213 pi.inode->version = in->pre_dirty();
5214 pi.inode->mtime = pi.inode->ctime = mdr->get_op_stamp();
5215 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5216 pi.inode->rstat.rctime = mdr->get_op_stamp();
5217 pi.inode->change_attr++;
5218
5219 uint64_t old_size = std::max<uint64_t>(pi.inode->size, mdr->client_request->head.args.open.old_size);
7c673cae 5220 if (old_size > 0) {
f67539c2 5221 pi.inode->truncate(old_size, 0);
7c673cae
FG
5222 le->metablob.add_truncate_start(in->ino());
5223 }
5224
5225 bool changed_ranges = false;
a8e16298 5226 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
f67539c2
TL
5227 pi.inode->client_ranges[client].range.first = 0;
5228 pi.inode->client_ranges[client].range.last = pi.inode->get_layout_size_increment();
5229 pi.inode->client_ranges[client].follows = realm->get_newest_seq();
7c673cae 5230 changed_ranges = true;
f91f0fd5 5231 in->mark_clientwriteable();
a8e16298 5232 cap->mark_clientwriteable();
7c673cae
FG
5233 }
5234
5235 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5236
5237 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5238 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5239
5240 // make sure ino gets into the journal
5241 le->metablob.add_opened_ino(in->ino());
7c673cae
FG
5242
5243 mdr->o_trunc = true;
5244
5245 CDentry *dn = 0;
5246 if (mdr->client_request->get_dentry_wanted()) {
11fdf7f2 5247 ceph_assert(mdr->dn[0].size());
7c673cae
FG
5248 dn = mdr->dn[0].back();
5249 }
5250
5251 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5252 changed_ranges));
5253 // Although the `open` part can give an early reply, the truncation won't
5254 // happen until our EUpdate is persistent; to give the client a prompt
5255 // response we must also flush that event.
5256 mdlog->flush();
5257}
5258
5259
5260/* This function cleans up the passed mdr */
5261void Server::handle_client_setlayout(MDRequestRef& mdr)
5262{
9f95a23c
TL
5263 const cref_t<MClientRequest> &req = mdr->client_request;
5264 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
5265 if (!cur) return;
5266
5267 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 5268 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
5269 return;
5270 }
5271 if (!cur->is_file()) {
f67539c2 5272 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5273 return;
5274 }
5275 if (cur->get_projected_inode()->size ||
5276 cur->get_projected_inode()->truncate_seq > 1) {
f67539c2 5277 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
5278 return;
5279 }
5280
5281 // validate layout
5282 file_layout_t layout = cur->get_projected_inode()->layout;
5283 // save existing layout for later
5284 const auto old_layout = layout;
5285
5286 int access = MAY_WRITE;
5287
5288 if (req->head.args.setlayout.layout.fl_object_size > 0)
5289 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5290 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5291 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5292 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5293 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5294 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5295 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5296
5297 // make sure we have as new a map as the client
5298 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5299 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5300 return;
5301 }
5302 }
5303
5304 // Don't permit layout modifications without 'p' caps
5305 if (layout != old_layout) {
5306 access |= MAY_SET_VXATTR;
5307 }
5308
5309 if (!layout.is_valid()) {
5310 dout(10) << "bad layout" << dendl;
f67539c2 5311 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5312 return;
5313 }
5314 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5315 dout(10) << " invalid data pool " << layout.pool_id << dendl;
f67539c2 5316 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5317 return;
5318 }
5319
9f95a23c 5320 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5321 lov.add_xlock(&cur->filelock);
5322 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5323 return;
5324
5325 if (!check_access(mdr, cur, access))
5326 return;
5327
5328 // project update
f67539c2
TL
5329 auto pi = cur->project_inode(mdr);
5330 pi.inode->layout = layout;
7c673cae 5331 // add the old pool to the inode
f67539c2
TL
5332 pi.inode->add_old_pool(old_layout.pool_id);
5333 pi.inode->version = cur->pre_dirty();
5334 pi.inode->ctime = mdr->get_op_stamp();
5335 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5336 pi.inode->rstat.rctime = mdr->get_op_stamp();
5337 pi.inode->change_attr++;
7c673cae
FG
5338
5339 // log + wait
5340 mdr->ls = mdlog->get_current_segment();
5341 EUpdate *le = new EUpdate(mdlog, "setlayout");
5342 mdlog->start_entry(le);
5343 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5344 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5345 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5346
5347 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5348}
5349
9f95a23c 5350bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
7c673cae 5351{
9f95a23c
TL
5352 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5353 return true;
5354
11fdf7f2 5355 MutationImpl::LockOpVec lov;
9f95a23c
TL
5356 lov.add_xlock(&in->policylock);
5357 if (xlock_snaplock)
5358 lov.add_xlock(&in->snaplock);
5359 else
5360 lov.add_rdlock(&in->snaplock);
5361 if (!mds->locker->acquire_locks(mdr, lov))
5362 return false;
7c673cae 5363
9f95a23c
TL
5364 if (want_layout && in->get_projected_inode()->has_layout()) {
5365 mdr->dir_layout = in->get_projected_inode()->layout;
5366 want_layout = false;
5367 }
5368 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5369 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5370 return false;
7c673cae
FG
5371 }
5372
9f95a23c
TL
5373 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5374 return true;
5375}
5376
5377CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5378{
5379 CInode *in = mdcache->get_inode(ino);
5380 if (!in || in->state_test(CInode::STATE_PURGING)) {
f67539c2 5381 respond_to_request(mdr, -CEPHFS_ESTALE);
9f95a23c
TL
5382 return nullptr;
5383 }
5384 if (!in->is_auth()) {
5385 mdcache->request_forward(mdr, in->authority().first);
5386 return nullptr;
5387 }
5388
5389 return in;
5390}
5391
5392void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5393{
5394 const cref_t<MClientRequest> &req = mdr->client_request;
5395
5396 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5397 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5398 if (!cur)
5399 return;
5400
7c673cae 5401 if (!cur->is_dir()) {
f67539c2 5402 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7c673cae
FG
5403 return;
5404 }
5405
9f95a23c 5406 if (!xlock_policylock(mdr, cur, true))
7c673cae
FG
5407 return;
5408
5409 // validate layout
f67539c2 5410 const auto& old_pi = cur->get_projected_inode();
7c673cae
FG
5411 file_layout_t layout;
5412 if (old_pi->has_layout())
5413 layout = old_pi->layout;
9f95a23c
TL
5414 else if (mdr->dir_layout != file_layout_t())
5415 layout = mdr->dir_layout;
7c673cae
FG
5416 else
5417 layout = mdcache->default_file_layout;
5418
5419 // Level of access required to complete
5420 int access = MAY_WRITE;
5421
5422 const auto old_layout = layout;
5423
5424 if (req->head.args.setlayout.layout.fl_object_size > 0)
5425 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5426 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5427 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5428 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5429 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5430 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5431 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5432 // make sure we have as new a map as the client
5433 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5434 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5435 return;
5436 }
5437 }
5438
5439 if (layout != old_layout) {
5440 access |= MAY_SET_VXATTR;
5441 }
5442
5443 if (!layout.is_valid()) {
5444 dout(10) << "bad layout" << dendl;
f67539c2 5445 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5446 return;
5447 }
5448 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5449 dout(10) << " invalid data pool " << layout.pool_id << dendl;
f67539c2 5450 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5451 return;
5452 }
5453
5454 if (!check_access(mdr, cur, access))
5455 return;
5456
f67539c2
TL
5457 auto pi = cur->project_inode(mdr);
5458 pi.inode->layout = layout;
5459 pi.inode->version = cur->pre_dirty();
7c673cae
FG
5460
5461 // log + wait
5462 mdr->ls = mdlog->get_current_segment();
5463 EUpdate *le = new EUpdate(mdlog, "setlayout");
5464 mdlog->start_entry(le);
5465 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5466 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5467 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5468
b32b8144 5469 mdr->no_early_reply = true;
7c673cae
FG
5470 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5471}
5472
5473// XATTRS
1d09f67e
TL
5474int Server::parse_layout_vxattr_json(
5475 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
5476{
5477 auto parse_pool = [&](std::string pool_name, int64_t pool_id) -> int64_t {
5478 if (pool_name != "") {
5479 int64_t _pool_id = osdmap.lookup_pg_pool_name(pool_name);
5480 if (_pool_id < 0) {
5481 dout(10) << __func__ << ": unknown pool name:" << pool_name << dendl;
5482 return -CEPHFS_EINVAL;
5483 }
5484 return _pool_id;
5485 } else if (pool_id >= 0) {
5486 const auto pools = osdmap.get_pools();
5487 if (pools.find(pool_id) == pools.end()) {
5488 dout(10) << __func__ << ": unknown pool id:" << pool_id << dendl;
5489 return -CEPHFS_EINVAL;
5490 }
5491 return pool_id;
5492 } else {
5493 return -CEPHFS_EINVAL;
5494 }
5495 };
7c673cae 5496
1d09f67e
TL
5497 try {
5498 if (name == "layout.json") {
5499 JSONParser json_parser;
5500 if (json_parser.parse(value.c_str(), value.length()) and json_parser.is_object()) {
5501 std::string field;
5502 try {
5503 field = "object_size";
5504 JSONDecoder::decode_json("object_size", layout->object_size, &json_parser, true);
5505
5506 field = "stripe_unit";
5507 JSONDecoder::decode_json("stripe_unit", layout->stripe_unit, &json_parser, true);
5508
5509 field = "stripe_count";
5510 JSONDecoder::decode_json("stripe_count", layout->stripe_count, &json_parser, true);
5511
5512 field = "pool_namespace";
5513 JSONDecoder::decode_json("pool_namespace", layout->pool_ns, &json_parser, false);
5514
5515 field = "pool_id";
5516 int64_t pool_id = 0;
5517 JSONDecoder::decode_json("pool_id", pool_id, &json_parser, false);
5518
5519 field = "pool_name";
5520 std::string pool_name;
5521 JSONDecoder::decode_json("pool_name", pool_name, &json_parser, false);
5522
5523 pool_id = parse_pool(pool_name, pool_id);
5524 if (pool_id < 0) {
5525 return (int)pool_id;
5526 }
5527 layout->pool_id = pool_id;
5528 } catch (JSONDecoder::err&) {
5529 dout(10) << __func__ << ": json is missing a mandatory field named "
5530 << field << dendl;
5531 return -CEPHFS_EINVAL;
5532 }
5533 } else {
5534 dout(10) << __func__ << ": bad json" << dendl;
5535 return -CEPHFS_EINVAL;
5536 }
5537 } else {
5538 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5539 return -CEPHFS_ENODATA; // no such attribute
5540 }
5541 } catch (boost::bad_lexical_cast const&) {
5542 dout(10) << __func__ << ": bad vxattr value:" << value
5543 << ", unable to parse for xattr:" << name << dendl;
5544 return -CEPHFS_EINVAL;
5545 }
5546 return 0;
5547}
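// Example value for the JSON form (pool name hypothetical):
//   {"object_size": 4194304, "stripe_unit": 1048576,
//    "stripe_count": 4, "pool_name": "cephfs_data"}
// object_size, stripe_unit and stripe_count are mandatory; the pool may be
// given as pool_name or pool_id, with a non-empty pool_name taking
// precedence.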
5548
5549// parse old style layout string
5550int Server::parse_layout_vxattr_string(
5551 string name, string value, const OSDMap& osdmap, file_layout_t *layout)
7c673cae 5552{
7c673cae
FG
5553 try {
5554 if (name == "layout") {
5555 string::iterator begin = value.begin();
5556 string::iterator end = value.end();
5557 keys_and_values<string::iterator> p; // create instance of parser
5558 std::map<string, string> m; // map to receive results
5559 if (!qi::parse(begin, end, p, m)) { // returns true if successful
f67539c2 5560 return -CEPHFS_EINVAL;
7c673cae
FG
5561 }
5562 string left(begin, end);
1d09f67e 5563 dout(10) << __func__ << ": parsed " << m << " left '" << left << "'" << dendl;
7c673cae 5564 if (begin != end)
f67539c2 5565 return -CEPHFS_EINVAL;
7c673cae
FG
5566 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5567 // Skip validation on each attr, we do it once at the end (avoid
5568 // rejecting intermediate states if the overall result is ok)
1d09f67e
TL
5569 int r = parse_layout_vxattr_string(string("layout.") + q->first, q->second,
5570 osdmap, layout);
7c673cae
FG
5571 if (r < 0)
5572 return r;
5573 }
5574 } else if (name == "layout.object_size") {
5575 layout->object_size = boost::lexical_cast<unsigned>(value);
5576 } else if (name == "layout.stripe_unit") {
5577 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5578 } else if (name == "layout.stripe_count") {
5579 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5580 } else if (name == "layout.pool") {
5581 try {
5582 layout->pool_id = boost::lexical_cast<unsigned>(value);
5583 } catch (boost::bad_lexical_cast const&) {
5584 int64_t pool = osdmap.lookup_pg_pool_name(value);
5585 if (pool < 0) {
1d09f67e 5586 dout(10) << __func__ << ": unknown pool " << value << dendl;
f67539c2 5587 return -CEPHFS_ENOENT;
7c673cae
FG
5588 }
5589 layout->pool_id = pool;
5590 }
1d09f67e
TL
5591 } else if (name == "layout.pool_id") {
5592 layout->pool_id = boost::lexical_cast<int64_t>(value);
5593 } else if (name == "layout.pool_name") {
5594 layout->pool_id = osdmap.lookup_pg_pool_name(value);
5595 if (layout->pool_id < 0) {
5596 dout(10) << __func__ << ": unknown pool " << value << dendl;
5597 return -CEPHFS_EINVAL;
5598 }
7c673cae
FG
5599 } else if (name == "layout.pool_namespace") {
5600 layout->pool_ns = value;
5601 } else {
1d09f67e
TL
5602 dout(10) << __func__ << ": unknown layout vxattr " << name << dendl;
5603 return -CEPHFS_ENODATA; // no such attribute
7c673cae
FG
5604 }
5605 } catch (boost::bad_lexical_cast const&) {
1d09f67e
TL
5606 dout(10) << __func__ << ": bad vxattr value, unable to parse int for "
5607 << name << dendl;
f67539c2 5608 return -CEPHFS_EINVAL;
7c673cae 5609 }
1d09f67e
TL
5610 return 0;
5611}
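// Example old-style values (pool name hypothetical), as seen here after the
// "ceph.file." / "ceph.dir." prefix has been stripped:
//   "layout" -> "stripe_unit=1048576 stripe_count=8 object_size=8388608 pool=cephfs_data"
//   "layout.stripe_count" -> "8"
// The composite form is split into key=value pairs and re-parsed with
// "layout." prepended to each key.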
5612
5613int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5614 file_layout_t *layout, bool validate)
5615{
5616 dout(20) << __func__ << ": name:" << name << " value:'" << value << "'" << dendl;
5617
5618 int r;
5619 if (name == "layout.json") {
5620 r = parse_layout_vxattr_json(name, value, osdmap, layout);
5621 } else {
5622 r = parse_layout_vxattr_string(name, value, osdmap, layout);
5623 }
5624 if (r < 0) {
5625 return r;
5626 }
7c673cae
FG
5627
5628 if (validate && !layout->is_valid()) {
1d09f67e 5629 dout(10) << __func__ << ": bad layout" << dendl;
f67539c2 5630 return -CEPHFS_EINVAL;
7c673cae
FG
5631 }
5632 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
1d09f67e 5633 dout(10) << __func__ << ": invalid data pool " << layout->pool_id << dendl;
f67539c2 5634 return -CEPHFS_EINVAL;
7c673cae
FG
5635 }
5636 return 0;
5637}
5638
5639int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5640{
5641 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5642 try {
5643 if (name == "quota") {
5644 string::iterator begin = value.begin();
5645 string::iterator end = value.end();
11fdf7f2
TL
5646 if (begin == end) {
5647 // keep quota unchanged. (for create_quota_realm())
5648 return 0;
5649 }
7c673cae
FG
5650 keys_and_values<string::iterator> p; // create instance of parser
5651 std::map<string, string> m; // map to receive results
5652 if (!qi::parse(begin, end, p, m)) { // returns true if successful
f67539c2 5653 return -CEPHFS_EINVAL;
7c673cae
FG
5654 }
5655 string left(begin, end);
5656 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5657 if (begin != end)
f67539c2 5658 return -CEPHFS_EINVAL;
7c673cae
FG
5659 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5660 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5661 if (r < 0)
5662 return r;
5663 }
5664 } else if (name == "quota.max_bytes") {
5665 int64_t q = boost::lexical_cast<int64_t>(value);
5666 if (q < 0)
f67539c2 5667 return -CEPHFS_EINVAL;
7c673cae
FG
5668 quota->max_bytes = q;
5669 } else if (name == "quota.max_files") {
5670 int64_t q = boost::lexical_cast<int64_t>(value);
5671 if (q < 0)
f67539c2 5672 return -CEPHFS_EINVAL;
7c673cae
FG
5673 quota->max_files = q;
5674 } else {
5675 dout(10) << " unknown quota vxattr " << name << dendl;
f67539c2 5676 return -CEPHFS_EINVAL;
7c673cae
FG
5677 }
5678 } catch (boost::bad_lexical_cast const&) {
5679 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
f67539c2 5680 return -CEPHFS_EINVAL;
7c673cae
FG
5681 }
5682
5683 if (!quota->is_valid()) {
5684 dout(10) << "bad quota" << dendl;
f67539c2 5685 return -CEPHFS_EINVAL;
7c673cae
FG
5686 }
5687 return 0;
5688}
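// Example (seen here as name "quota" after prefix stripping):
//   "max_bytes=107374182400 max_files=10000"
// caps the subtree at 100 GiB and 10000 files; a value of 0 clears that
// limit, and an empty value leaves the quota unchanged (used by
// create_quota_realm() below).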
5689
11fdf7f2
TL
5690void Server::create_quota_realm(CInode *in)
5691{
5692 dout(10) << __func__ << " " << *in << dendl;
5693
9f95a23c 5694 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
11fdf7f2
TL
5695 req->set_filepath(filepath(in->ino()));
5696 req->set_string2("ceph.quota");
5697 // empty vxattr value
5698 req->set_tid(mds->issue_tid());
5699
5700 mds->send_message_mds(req, in->authority().first);
5701}
5702
7c673cae
FG
5703/*
5704 * Verify that the file layout attribute carried by client
5705 * is well-formatted.
5706 * Return 0 on success, otherwise this function takes
5707 * responsibility for the passed mdr.
5708 */
5709int Server::check_layout_vxattr(MDRequestRef& mdr,
5710 string name,
5711 string value,
5712 file_layout_t *layout)
5713{
9f95a23c 5714 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
5715 epoch_t epoch;
5716 int r;
5717
5718 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5719 r = parse_layout_vxattr(name, value, osdmap, layout);
5720 epoch = osdmap.get_epoch();
5721 });
5722
f67539c2 5723 if (r == -CEPHFS_ENOENT) {
7c673cae
FG
5724
5725 // we don't have the specified pool, make sure our map
5726 // is newer than or as new as the client.
5727 epoch_t req_epoch = req->get_osdmap_epoch();
5728
5729 if (req_epoch > epoch) {
5730
5731 // well, our map is older. consult mds.
f67539c2 5732 auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
7c673cae 5733
f67539c2
TL
5734 mds->objecter->wait_for_map(req_epoch, lambdafy(fin));
5735 return r;
7c673cae
FG
5736 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5737
4738 // For compatibility with clients running old code, we still need to get
4739 // the latest map. One day, once COMPACT_VERSION of MClientRequest is >= 3,
4740 // we can remove this code.
5741 mdr->waited_for_osdmap = true;
f67539c2
TL
5742 mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5743 mds, new C_MDS_RetryRequest(mdcache, mdr))));
7c673cae
FG
5744 return r;
5745 }
5746 }
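// Both waits above re-enter this request once a new-enough osdmap arrives,
// so an unknown pool is only reported back to the client when our map is at
// least as new as the one the pool was claimed to exist in.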
5747
5748 if (r < 0) {
5749
f67539c2
TL
5750 if (r == -CEPHFS_ENOENT)
5751 r = -CEPHFS_EINVAL;
7c673cae
FG
5752
5753 respond_to_request(mdr, r);
5754 return r;
5755 }
5756
5757 // all is well
5758 return 0;
5759}
5760
9f95a23c 5761void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
7c673cae 5762{
9f95a23c 5763 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
5764 string name(req->get_path2());
5765 bufferlist bl = req->get_data();
5766 string value (bl.c_str(), bl.length());
5767 dout(10) << "handle_set_vxattr " << name
5768 << " val " << value.length()
5769 << " bytes on " << *cur
5770 << dendl;
5771
94b18763 5772 CInode::mempool_inode *pip = nullptr;
7c673cae
FG
5773 string rest;
5774
5775 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5776 return;
5777 }
5778
adb31ebb 5779 bool adjust_realm = false;
7c673cae
FG
5780 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5781 if (!cur->is_dir()) {
f67539c2 5782 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5783 return;
5784 }
5785
9f95a23c
TL
5786 if (!xlock_policylock(mdr, cur, true))
5787 return;
5788
7c673cae
FG
5789 file_layout_t layout;
5790 if (cur->get_projected_inode()->has_layout())
5791 layout = cur->get_projected_inode()->layout;
9f95a23c
TL
5792 else if (mdr->dir_layout != file_layout_t())
5793 layout = mdr->dir_layout;
7c673cae
FG
5794 else
5795 layout = mdcache->default_file_layout;
5796
5797 rest = name.substr(name.find("layout"));
5798 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5799 return;
5800
f67539c2
TL
5801 auto pi = cur->project_inode(mdr);
5802 pi.inode->layout = layout;
b32b8144 5803 mdr->no_early_reply = true;
f67539c2 5804 pip = pi.inode.get();
7c673cae
FG
5805 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5806 if (!cur->is_file()) {
f67539c2 5807 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5808 return;
5809 }
5810 if (cur->get_projected_inode()->size ||
5811 cur->get_projected_inode()->truncate_seq > 1) {
f67539c2 5812 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7c673cae
FG
5813 return;
5814 }
5815 file_layout_t layout = cur->get_projected_inode()->layout;
5816 rest = name.substr(name.find("layout"));
5817 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5818 return;
5819
9f95a23c 5820 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5821 lov.add_xlock(&cur->filelock);
5822 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5823 return;
5824
f67539c2
TL
5825 auto pi = cur->project_inode(mdr);
5826 int64_t old_pool = pi.inode->layout.pool_id;
5827 pi.inode->add_old_pool(old_pool);
5828 pi.inode->layout = layout;
5829 pip = pi.inode.get();
7c673cae 5830 } else if (name.compare(0, 10, "ceph.quota") == 0) {
f67539c2
TL
5831 if (!cur->is_dir()) {
5832 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5833 return;
5834 }
5835
5836 quota_info_t quota = cur->get_projected_inode()->quota;
5837
5838 rest = name.substr(name.find("quota"));
5839 int r = parse_quota_vxattr(rest, value, &quota);
5840 if (r < 0) {
5841 respond_to_request(mdr, r);
5842 return;
5843 }
5844
9f95a23c 5845 if (quota.is_enable() && !cur->get_projected_srnode())
adb31ebb
TL
5846 adjust_realm = true;
5847
5848 if (!xlock_policylock(mdr, cur, false, adjust_realm))
5849 return;
11fdf7f2 5850
adb31ebb
TL
5851 if (cur->get_projected_inode()->quota == quota) {
5852 respond_to_request(mdr, 0);
7c673cae 5853 return;
adb31ebb 5854 }
7c673cae 5855
f67539c2
TL
5856 auto pi = cur->project_inode(mdr, false, adjust_realm);
5857 pi.inode->quota = quota;
94b18763 5858
adb31ebb
TL
5859 if (adjust_realm)
5860 pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
5861
b32b8144 5862 mdr->no_early_reply = true;
f67539c2 5863 pip = pi.inode.get();
28e407b8
AA
5864
5865 client_t exclude_ct = mdr->get_client();
a8e16298 5866 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
adb31ebb
TL
5867 } else if (name == "ceph.dir.subvolume"sv) {
5868 if (!cur->is_dir()) {
f67539c2 5869 respond_to_request(mdr, -CEPHFS_EINVAL);
adb31ebb
TL
5870 return;
5871 }
5872
5873 bool val;
5874 try {
5875 val = boost::lexical_cast<bool>(value);
5876 } catch (boost::bad_lexical_cast const&) {
5877 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
f67539c2 5878 respond_to_request(mdr, -CEPHFS_EINVAL);
adb31ebb
TL
5879 return;
5880 }
5881
b3b6e05e
TL
5882 /* Verify it's not already a subvolume, using only a lighter-weight
5883 * rdlock.
5884 */
5885 if (!mdr->more()->rdonly_checks) {
5886 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
5887 MutationImpl::LockOpVec lov;
5888 lov.add_rdlock(&cur->snaplock);
5889 if (!mds->locker->acquire_locks(mdr, lov))
5890 return;
5891 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5892 }
b3b6e05e
TL
5893 const auto srnode = cur->get_projected_srnode();
5894 if (val == (srnode && srnode->is_subvolume())) {
5895 dout(20) << "already marked subvolume" << dendl;
5896 respond_to_request(mdr, 0);
5897 return;
5898 }
5899 mdr->more()->rdonly_checks = true;
5900 }
5901
5902 if ((mdr->locking_state & MutationImpl::ALL_LOCKED) && !mdr->is_xlocked(&cur->snaplock)) {
5903 /* drop the rdlock and acquire xlocks */
5904 dout(20) << "dropping rdlocks" << dendl;
5905 mds->locker->drop_locks(mdr.get());
5906 if (!xlock_policylock(mdr, cur, false, true))
5907 return;
5908 }
adb31ebb 5909
b3b6e05e 5910 /* repeat the rdonly checks in case the state changed between the rdlock and the xlock */
adb31ebb
TL
5911 SnapRealm *realm = cur->find_snaprealm();
5912 if (val) {
5913 inodeno_t subvol_ino = realm->get_subvolume_ino();
5914 // can't create subvolume inside another subvolume
5915 if (subvol_ino && subvol_ino != cur->ino()) {
f67539c2 5916 respond_to_request(mdr, -CEPHFS_EINVAL);
adb31ebb
TL
5917 return;
5918 }
5919 }
5920
5921 const auto srnode = cur->get_projected_srnode();
5922 if (val == (srnode && srnode->is_subvolume())) {
5923 respond_to_request(mdr, 0);
5924 return;
5925 }
5926
f67539c2 5927 auto pi = cur->project_inode(mdr, false, true);
adb31ebb
TL
5928 if (!srnode)
5929 pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
5930 if (val)
5931 pi.snapnode->mark_subvolume();
5932 else
5933 pi.snapnode->clear_subvolume();
5934
5935 mdr->no_early_reply = true;
f67539c2 5936 pip = pi.inode.get();
adb31ebb 5937 adjust_realm = true;
f6b5b4d7 5938 } else if (name == "ceph.dir.pin"sv) {
7c673cae 5939 if (!cur->is_dir() || cur->is_root()) {
f67539c2 5940 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5941 return;
5942 }
5943
5944 mds_rank_t rank;
5945 try {
5946 rank = boost::lexical_cast<mds_rank_t>(value);
5947 if (rank < 0) rank = MDS_RANK_NONE;
20effc67
TL
5948 else if (rank >= MAX_MDS) {
5949 respond_to_request(mdr, -CEPHFS_EDOM);
5950 return;
5951 }
7c673cae
FG
5952 } catch (boost::bad_lexical_cast const&) {
5953 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
f67539c2 5954 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
5955 return;
5956 }
5957
9f95a23c 5958 if (!xlock_policylock(mdr, cur))
7c673cae
FG
5959 return;
5960
f67539c2 5961 auto pi = cur->project_inode(mdr);
7c673cae 5962 cur->set_export_pin(rank);
f67539c2 5963 pip = pi.inode.get();
f6b5b4d7
TL
5964 } else if (name == "ceph.dir.pin.random"sv) {
5965 if (!cur->is_dir() || cur->is_root()) {
f67539c2 5966 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5967 return;
5968 }
5969
5970 double val;
5971 try {
5972 val = boost::lexical_cast<double>(value);
5973 } catch (boost::bad_lexical_cast const&) {
5974 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
f67539c2 5975 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5976 return;
5977 }
5978
5979 if (val < 0.0 || 1.0 < val) {
f67539c2 5980 respond_to_request(mdr, -CEPHFS_EDOM);
f6b5b4d7
TL
5981 return;
5982 } else if (mdcache->export_ephemeral_random_max < val) {
f67539c2 5983 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5984 return;
5985 }
5986
5987 if (!xlock_policylock(mdr, cur))
5988 return;
5989
f67539c2 5990 auto pi = cur->project_inode(mdr);
f6b5b4d7 5991 cur->setxattr_ephemeral_rand(val);
f67539c2 5992 pip = pi.inode.get();
f6b5b4d7
TL
5993 } else if (name == "ceph.dir.pin.distributed"sv) {
5994 if (!cur->is_dir() || cur->is_root()) {
f67539c2 5995 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
5996 return;
5997 }
5998
5999 bool val;
6000 try {
6001 val = boost::lexical_cast<bool>(value);
6002 } catch (boost::bad_lexical_cast const&) {
6003 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
f67539c2 6004 respond_to_request(mdr, -CEPHFS_EINVAL);
f6b5b4d7
TL
6005 return;
6006 }
6007
6008 if (!xlock_policylock(mdr, cur))
6009 return;
6010
f67539c2 6011 auto pi = cur->project_inode(mdr);
f6b5b4d7 6012 cur->setxattr_ephemeral_dist(val);
f67539c2 6013 pip = pi.inode.get();
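    // Illustrative usage (example path):
    //   setfattr -n ceph.dir.pin.distributed -v 1 /mnt/cephfs/home
    // The value parses as a bool; when set, ephemeral pins for the
    // directory's immediate children are spread across the active MDS ranks.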
7c673cae
FG
6014 } else {
6015 dout(10) << " unknown vxattr " << name << dendl;
f67539c2 6016 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
6017 return;
6018 }
6019
94b18763 6020 pip->change_attr++;
91327a77
AA
6021 pip->ctime = mdr->get_op_stamp();
6022 if (mdr->get_op_stamp() > pip->rstat.rctime)
6023 pip->rstat.rctime = mdr->get_op_stamp();
94b18763 6024 pip->version = cur->pre_dirty();
7c673cae 6025 if (cur->is_file())
94b18763 6026 pip->update_backtrace();
7c673cae
FG
6027
6028 // log + wait
6029 mdr->ls = mdlog->get_current_segment();
6030 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
6031 mdlog->start_entry(le);
6032 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6033 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6034 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6035
11fdf7f2 6036 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
adb31ebb 6037 false, false, adjust_realm));
7c673cae
FG
6038 return;
6039}
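// Note on the tail of handle_set_vxattr() above: it follows the update
// pattern used throughout this file: project the inode, bump
// change_attr/ctime/version, journal an EUpdate, and let
// C_MDS_inode_update_finish apply the projection and reply once the log
// entry is persisted.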
6040
9f95a23c 6041void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
7c673cae 6042{
9f95a23c 6043 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
6044 string name(req->get_path2());
6045
6046 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
6047
6048 if (name == "ceph.dir.layout") {
6049 if (!cur->is_dir()) {
f67539c2 6050 respond_to_request(mdr, -CEPHFS_ENODATA);
7c673cae
FG
6051 return;
6052 }
6053 if (cur->is_root()) {
6054 dout(10) << "can't remove layout policy on the root directory" << dendl;
f67539c2 6055 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
6056 return;
6057 }
6058
6059 if (!cur->get_projected_inode()->has_layout()) {
f67539c2 6060 respond_to_request(mdr, -CEPHFS_ENODATA);
7c673cae
FG
6061 return;
6062 }
6063
9f95a23c 6064 MutationImpl::LockOpVec lov;
11fdf7f2
TL
6065 lov.add_xlock(&cur->policylock);
6066 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
6067 return;
6068
f67539c2
TL
6069 auto pi = cur->project_inode(mdr);
6070 pi.inode->clear_layout();
6071 pi.inode->version = cur->pre_dirty();
7c673cae
FG
6072
6073 // log + wait
6074 mdr->ls = mdlog->get_current_segment();
6075 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
6076 mdlog->start_entry(le);
6077 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6078 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6079 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6080
b32b8144 6081 mdr->no_early_reply = true;
7c673cae
FG
6082 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6083 return;
6084 } else if (name == "ceph.dir.layout.pool_namespace"
6085 || name == "ceph.file.layout.pool_namespace") {
6086 // Namespace is the only layout field that has a meaningful
6087 // null/none value (an empty string means the default layout), so removing
6088 // it is equivalent to a setxattr with an empty string: pass the empty
6089 // payload of the rmxattr request through to do this.
9f95a23c 6090 handle_set_vxattr(mdr, cur);
7c673cae
FG
6091 return;
6092 }
6093
f67539c2 6094 respond_to_request(mdr, -CEPHFS_ENODATA);
7c673cae
FG
6095}
6096
f67539c2
TL
6097const Server::XattrHandler Server::xattr_handlers[] = {
6098 {
6099 xattr_name: Server::DEFAULT_HANDLER,
6100 description: "default xattr handler",
6101 validate: &Server::default_xattr_validate,
6102 setxattr: &Server::default_setxattr_handler,
6103 removexattr: &Server::default_removexattr_handler,
6104 },
6105 {
6106 xattr_name: "ceph.mirror.info",
6107 description: "mirror info xattr handler",
6108 validate: &Server::mirror_info_xattr_validate,
6109 setxattr: &Server::mirror_info_setxattr_handler,
6110 removexattr: &Server::mirror_info_removexattr_handler
6111 },
6112};
7c673cae 6113
f67539c2
TL
6114const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
6115 const XattrHandler *default_xattr_handler = nullptr;
7c673cae 6116
f67539c2
TL
6117 for (auto &handler : xattr_handlers) {
6118 if (handler.xattr_name == Server::DEFAULT_HANDLER) {
6119 ceph_assert(default_xattr_handler == nullptr);
6120 default_xattr_handler = &handler;
6121 }
6122 if (handler.xattr_name == xattr_name) {
6123 dout(20) << "handler=" << handler.description << dendl;
6124 return &handler;
6125 }
6126 }
7c673cae 6127
f67539c2
TL
6128 ceph_assert(default_xattr_handler != nullptr);
6129 dout(20) << "handler=" << default_xattr_handler->description << dendl;
6130 return default_xattr_handler;
6131}
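// For example, get_xattr_or_default_handler("ceph.mirror.info") resolves to
// the mirror info handler registered above, while any other name (say, a
// plain "user.foo" xattr) falls back to the default handler.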
7c673cae 6132
f67539c2
TL
6133int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6134 const std::string &xattr_name, int op, int flags) {
6135 if (op == CEPH_MDS_OP_SETXATTR) {
6136 if (xattrs) {
6137 if ((flags & CEPH_XATTR_CREATE) && xattrs->count(mempool::mds_co::string(xattr_name))) {
6138 dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
6139 return -CEPHFS_EEXIST;
6140 }
6141 }
6142 if ((flags & CEPH_XATTR_REPLACE) && !(xattrs && xattrs->count(mempool::mds_co::string(xattr_name)))) {
6143 dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
6144 return -CEPHFS_ENODATA;
6145 }
6146
6147 return 0;
7c673cae 6148 }
f67539c2
TL
6149
6150 if (op == CEPH_MDS_OP_RMXATTR) {
6151 if (!xattrs || xattrs->count(mempool::mds_co::string(xattr_name)) == 0) {
6152 dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
6153 return -CEPHFS_ENODATA;
6154 }
6155
6156 return 0;
6157 }
6158
6159 derr << ": unhandled validation for: " << xattr_name << dendl;
6160 return -CEPHFS_EINVAL;
6161}
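// These flag semantics mirror setxattr(2): XATTR_CREATE fails with EEXIST
// when the name already exists, and XATTR_REPLACE fails with ENODATA when
// it does not.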
6162
6163void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
6164 const bufferlist &xattr_value) {
6165 size_t len = xattr_value.length();
6166 bufferptr b = buffer::create(len);
6167 if (len) {
6168 xattr_value.begin().copy(len, b.c_str());
6169 }
6170 auto em = xattrs->emplace(std::piecewise_construct,
6171 std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
6172 std::forward_as_tuple(b));
6173 if (!em.second) {
6174 em.first->second = b;
6175 }
6176}
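// e.g. xattr_set(xattrs, "user.foo", bl) inserts the key if absent and
// overwrites the stored value (the bufferptr) if it is already present.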
6177
6178void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
6179 xattrs->erase(mempool::mds_co::string(xattr_name));
6180}
6181
6182int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6183 XattrOp *xattr_op) {
6184 return xattr_validate(cur, xattrs, xattr_op->xattr_name, xattr_op->op, xattr_op->flags);
6185}
6186
6187void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6188 const XattrOp &xattr_op) {
6189 xattr_set(xattrs, xattr_op.xattr_name, xattr_op.xattr_value);
6190}
6191
6192void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6193 const XattrOp &xattr_op) {
6194 xattr_rm(xattrs, xattr_op.xattr_name);
6195}
6196
6197// mirror info xattr handlers
6198const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX = "^cluster_id=([a-f0-9]{8}-" \
6199 "[a-f0-9]{4}-[a-f0-9]{4}-" \
6200 "[a-f0-9]{4}-[a-f0-9]{12})" \
6201 " fs_id=(\\d+)$";
6202const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
6203const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
6204int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
6205 std::string &cluster_id, std::string &fs_id) {
6206 dout(20) << "parsing name=" << name << ", value=" << value << dendl;
6207
6208 static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
6209 std::smatch match;
6210
6211 std::regex_search(value, match, regex);
6212 if (match.size() != 3) {
6213 derr << "mirror info parse error" << dendl;
6214 return -CEPHFS_EINVAL;
6215 }
6216
6217 cluster_id = match[1];
6218 fs_id = match[2];
6219 dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
6220 return 0;
6221}
6222
6223int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
6224 XattrOp *xattr_op) {
6225 if (!cur->is_root()) {
6226 return -CEPHFS_EINVAL;
6227 }
6228
6229 int v1 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, xattr_op->op, xattr_op->flags);
6230 int v2 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, xattr_op->op, xattr_op->flags);
6231 if (v1 != v2) {
6232 derr << "inconsistent mirror info state (" << v1 << "," << v2 << ")" << dendl;
6233 return -CEPHFS_EINVAL;
6234 }
6235
6236 if (v1 < 0) {
6237 return v1;
6238 }
6239
6240 if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
6241 return 0;
6242 }
6243
6244 std::string cluster_id;
6245 std::string fs_id;
6246 int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
6247 cluster_id, fs_id);
6248 if (r < 0) {
6249 return r;
6250 }
6251
6252 xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
6253 return 0;
6254}
6255
6256void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6257 const XattrOp &xattr_op) {
6258 auto mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
6259
6260 bufferlist bl;
6261 bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
6262 xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
6263
6264 bl.clear();
6265 bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
6266 xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
6267}
6268
6269void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6270 const XattrOp &xattr_op) {
6271 xattr_rm(xattrs, Server::MirrorXattrInfo::CLUSTER_ID);
6272 xattr_rm(xattrs, Server::MirrorXattrInfo::FS_ID);
6273}
7c673cae
FG
6274
6275void Server::handle_client_setxattr(MDRequestRef& mdr)
6276{
9f95a23c 6277 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 6278 string name(req->get_path2());
7c673cae 6279
f67539c2
TL
6280 // is a ceph virtual xattr?
6281 if (is_ceph_vxattr(name)) {
9f95a23c
TL
6282 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6283 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6284 if (!cur)
6285 return;
6286
6287 handle_set_vxattr(mdr, cur);
6288 return;
6289 }
6290
f67539c2
TL
6291 if (!is_allowed_ceph_xattr(name)) {
6292 respond_to_request(mdr, -CEPHFS_EINVAL);
6293 return;
6294 }
6295
9f95a23c 6296 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
6297 if (!cur)
6298 return;
6299
6300 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 6301 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
6302 return;
6303 }
6304
6305 int flags = req->head.args.setxattr.flags;
6306
9f95a23c 6307 MutationImpl::LockOpVec lov;
11fdf7f2
TL
6308 lov.add_xlock(&cur->xattrlock);
6309 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
6310 return;
6311
6312 if (!check_access(mdr, cur, MAY_WRITE))
6313 return;
6314
7c673cae
FG
6315 size_t len = req->get_data().length();
6316 size_t inc = len + name.length();
6317
f67539c2
TL
6318 auto handler = Server::get_xattr_or_default_handler(name);
6319 const auto& pxattrs = cur->get_projected_xattrs();
6320 if (pxattrs) {
6321 // check xattrs kv pairs size
6322 size_t cur_xattrs_size = 0;
6323 for (const auto& p : *pxattrs) {
6324 if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
6325 continue;
6326 }
6327 cur_xattrs_size += p.first.length() + p.second.length();
7c673cae 6328 }
7c673cae 6329
f67539c2
TL
6330 if ((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size) {
6331 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6332 << cur_xattrs_size << ", inc " << inc << dendl;
6333 respond_to_request(mdr, -CEPHFS_ENOSPC);
6334 return;
6335 }
7c673cae
FG
6336 }
6337
f67539c2
TL
6338 XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
6339 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6340 if (r < 0) {
6341 respond_to_request(mdr, r);
7c673cae
FG
6342 return;
6343 }
6344
6345 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
6346
6347 // project update
f67539c2
TL
6348 auto pi = cur->project_inode(mdr, true);
6349 pi.inode->version = cur->pre_dirty();
6350 pi.inode->ctime = mdr->get_op_stamp();
6351 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6352 pi.inode->rstat.rctime = mdr->get_op_stamp();
6353 if (name == "encryption.ctx"sv)
6354 pi.inode->fscrypt = true;
6355 pi.inode->change_attr++;
6356 pi.inode->xattr_version++;
6357
94b18763 6358 if ((flags & CEPH_XATTR_REMOVE)) {
f67539c2 6359 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
94b18763 6360 } else {
f67539c2 6361 std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
7c673cae
FG
6362 }
6363
6364 // log + wait
6365 mdr->ls = mdlog->get_current_segment();
6366 EUpdate *le = new EUpdate(mdlog, "setxattr");
6367 mdlog->start_entry(le);
6368 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6369 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6370 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6371
6372 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6373}
6374
6375void Server::handle_client_removexattr(MDRequestRef& mdr)
6376{
9f95a23c 6377 const cref_t<MClientRequest> &req = mdr->client_request;
94b18763 6378 std::string name(req->get_path2());
11fdf7f2 6379
f67539c2
TL
6380 // is a ceph virtual xattr?
6381 if (is_ceph_vxattr(name)) {
9f95a23c
TL
6382 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6383 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6384 if (!cur)
6385 return;
6386
6387 handle_remove_vxattr(mdr, cur);
6388 return;
6389 }
6390
f67539c2
TL
6391 if (!is_allowed_ceph_xattr(name)) {
6392 respond_to_request(mdr, -CEPHFS_EINVAL);
6393 return;
6394 }
6395
9f95a23c 6396 CInode* cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
6397 if (!cur)
6398 return;
6399
6400 if (mdr->snapid != CEPH_NOSNAP) {
f67539c2 6401 respond_to_request(mdr, -CEPHFS_EROFS);
7c673cae
FG
6402 return;
6403 }
6404
9f95a23c 6405 MutationImpl::LockOpVec lov;
11fdf7f2
TL
6406 lov.add_xlock(&cur->xattrlock);
6407 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
6408 return;
6409
f67539c2
TL
6410
6411 auto handler = Server::get_xattr_or_default_handler(name);
6412 bufferlist bl;
6413 XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
6414
6415 const auto& pxattrs = cur->get_projected_xattrs();
6416 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6417 if (r < 0) {
6418 respond_to_request(mdr, r);
7c673cae
FG
6419 return;
6420 }
6421
6422 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
6423
6424 // project update
f67539c2
TL
6425 auto pi = cur->project_inode(mdr, true);
6426 pi.inode->version = cur->pre_dirty();
6427 pi.inode->ctime = mdr->get_op_stamp();
6428 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6429 pi.inode->rstat.rctime = mdr->get_op_stamp();
6430 pi.inode->change_attr++;
6431 pi.inode->xattr_version++;
6432 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
7c673cae
FG
6433
6434 // log + wait
6435 mdr->ls = mdlog->get_current_segment();
6436 EUpdate *le = new EUpdate(mdlog, "removexattr");
6437 mdlog->start_entry(le);
6438 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6439 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6440 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6441
6442 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6443}
6444
1d09f67e
TL
6445void Server::handle_client_getvxattr(MDRequestRef& mdr)
6446{
6447 const auto& req = mdr->client_request;
6448 string xattr_name{req->get_path2()};
6449
6450 // is a ceph virtual xattr?
6451 if (!is_ceph_vxattr(xattr_name)) {
6452 respond_to_request(mdr, -CEPHFS_ENODATA);
6453 return;
6454 }
6455
6456 CInode *cur = rdlock_path_pin_ref(mdr, true, false);
6457 if (!cur) {
6458 return;
6459 }
6460
6461 if (is_ceph_dir_vxattr(xattr_name)) {
6462 if (!cur->is_dir()) {
6463 respond_to_request(mdr, -CEPHFS_ENODATA);
6464 return;
6465 }
6466 } else if (is_ceph_file_vxattr(xattr_name)) {
6467 if (cur->is_dir()) {
6468 respond_to_request(mdr, -CEPHFS_ENODATA);
6469 return;
6470 }
6471 }
6472
6473 CachedStackStringStream css;
6474 int r = 0;
6475 ceph::bufferlist bl;
6476 // handle these vxattrs
6477 if ((xattr_name.substr(0, 15) == "ceph.dir.layout"sv) ||
6478 (xattr_name.substr(0, 16) == "ceph.file.layout"sv)) {
6479 std::string layout_field;
6480
6481 struct layout_xattr_info_t {
6482 enum class InheritanceStatus : uint32_t {
6483 DEFAULT = 0,
6484 SET = 1,
6485 INHERITED = 2
6486 };
6487
6488 const file_layout_t layout;
6489 const InheritanceStatus status;
6490
6491 layout_xattr_info_t(const file_layout_t& l, InheritanceStatus inh)
6492 : layout(l), status(inh) { }
6493
6494 static std::string status_to_string(InheritanceStatus status) {
6495 switch (status) {
6496 case InheritanceStatus::DEFAULT: return "default"s;
6497 case InheritanceStatus::SET: return "set"s;
6498 case InheritanceStatus::INHERITED: return "inherited"s;
6499 default: return "unknown"s;
6500 }
6501 }
6502 };
6503
6504 auto is_default_layout = [&](const file_layout_t& layout) -> bool {
6505 return (layout == mdcache->default_file_layout);
6506 };
6507 auto get_inherited_layout = [&](CInode *cur) -> layout_xattr_info_t {
6508 auto orig_in = cur;
6509
6510 while (cur) {
6511 if (cur->get_projected_inode()->has_layout()) {
6512 auto& curr_layout = cur->get_projected_inode()->layout;
6513 if (is_default_layout(curr_layout)) {
6514 return {curr_layout, layout_xattr_info_t::InheritanceStatus::DEFAULT};
6515 }
6516 if (cur == orig_in) {
6517 // we've found a new layout at this inode
6518 return {curr_layout, layout_xattr_info_t::InheritanceStatus::SET};
6519 } else {
6520 return {curr_layout, layout_xattr_info_t::InheritanceStatus::INHERITED};
6521 }
6522 }
6523
6524 if (cur->is_root()) {
6525 break;
6526 }
6527
6528 cur = cur->get_projected_parent_dir()->get_inode();
6529 }
6530 mds->clog->error() << "no layout found at root dir!";
6531 ceph_abort("no layout found at root dir! something is really messed up with layouts!");
6532 };
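    // For example, with /a/b/c where only /a carries an explicit non-default
    // layout: querying c reports that layout as "inherited", querying /a
    // itself reports "set", and a layout equal to
    // mdcache->default_file_layout reports "default" wherever it is found.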
6533
6534 if (xattr_name == "ceph.dir.layout.json"sv ||
6535 xattr_name == "ceph.file.layout.json"sv) {
6536 // fetch layout only for valid xattr_name
6537 const auto lxi = get_inherited_layout(cur);
6538
6539 *css << "{\"stripe_unit\": " << lxi.layout.stripe_unit
6540 << ", \"stripe_count\": " << lxi.layout.stripe_count
6541 << ", \"object_size\": " << lxi.layout.object_size
6542 << ", \"pool_name\": ";
6543 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6544 *css << "\"";
6545 if (o.have_pg_pool(lxi.layout.pool_id)) {
6546 *css << o.get_pool_name(lxi.layout.pool_id);
6547 }
6548 *css << "\"";
6549 });
6550 *css << ", \"pool_id\": " << (uint64_t)lxi.layout.pool_id;
6551 *css << ", \"pool_namespace\": \"" << lxi.layout.pool_ns << "\"";
6552 *css << ", \"inheritance\": \"@"
6553 << layout_xattr_info_t::status_to_string(lxi.status) << "\"}";
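      // Sample output (illustrative values):
      //   {"stripe_unit": 4194304, "stripe_count": 1, "object_size": 4194304,
      //    "pool_name": "cephfs_data", "pool_id": 2, "pool_namespace": "",
      //    "inheritance": "@set"}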
6554 } else if ((xattr_name == "ceph.dir.layout.pool_name"sv) ||
6555 (xattr_name == "ceph.file.layout.pool_name"sv)) {
6556 // fetch layout only for valid xattr_name
6557 const auto lxi = get_inherited_layout(cur);
6558 mds->objecter->with_osdmap([lxi, &css](const OSDMap& o) {
6559 if (o.have_pg_pool(lxi.layout.pool_id)) {
6560 *css << o.get_pool_name(lxi.layout.pool_id);
6561 }
6562 });
6563 } else if ((xattr_name == "ceph.dir.layout.pool_id"sv) ||
6564 (xattr_name == "ceph.file.layout.pool_id"sv)) {
6565 // fetch layout only for valid xattr_name
6566 const auto lxi = get_inherited_layout(cur);
6567 *css << (uint64_t)lxi.layout.pool_id;
6568 } else {
6569 r = -CEPHFS_ENODATA; // no such attribute
6570 }
6571 } else if (xattr_name.substr(0, 12) == "ceph.dir.pin"sv) {
6572 if (xattr_name == "ceph.dir.pin"sv) {
6573 *css << cur->get_projected_inode()->export_pin;
6574 } else if (xattr_name == "ceph.dir.pin.random"sv) {
6575 *css << cur->get_projected_inode()->export_ephemeral_random_pin;
6576 } else if (xattr_name == "ceph.dir.pin.distributed"sv) {
6577 *css << cur->get_projected_inode()->export_ephemeral_distributed_pin;
6578 } else {
6579 // otherwise respond as invalid request
6580 // since we only handle ceph vxattrs here
6581 r = -CEPHFS_ENODATA; // no such attribute
6582 }
6583 } else {
6584 // otherwise respond as invalid request
6585 // since we only handle ceph vxattrs here
6586 r = -CEPHFS_ENODATA; // no such attribute
6587 }
6588
6589 if (r == 0) {
6590 ENCODE_START(1, 1, bl);
6591 encode(css->strv(), bl);
6592 ENCODE_FINISH(bl);
6593 mdr->reply_extra_bl = bl;
6594 }
6595
6596 respond_to_request(mdr, r);
6597}
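// Client-side, these virtual xattrs are read with an ordinary getxattr, e.g.
//   getfattr -n ceph.dir.layout.json /mnt/cephfs/dir   (illustrative path)
// and the reply payload is the versioned blob encoded just above.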
7c673cae
FG
6598
6599// =================================================================
6600// DIRECTORY and NAMESPACE OPS
6601
6602
6603// ------------------------------------------------
6604
6605// MKNOD
6606
6607class C_MDS_mknod_finish : public ServerLogContext {
6608 CDentry *dn;
6609 CInode *newi;
6610public:
6611 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
6612 ServerLogContext(s, r), dn(d), newi(ni) {}
6613 void finish(int r) override {
11fdf7f2 6614 ceph_assert(r == 0);
7c673cae
FG
6615
6616 // link the inode
6617 dn->pop_projected_linkage();
6618
6619 // be a bit hacky with the inode version, here.. we decrement it
6620 // just to keep mark_dirty() happy. (we didn't bother projecting
6621 // a new version of the inode since it's just been created)
f67539c2 6622 newi->mark_dirty(mdr->ls);
28e407b8 6623 newi->mark_dirty_parent(mdr->ls, true);
7c673cae
FG
6624
6625 // mkdir?
f67539c2 6626 if (newi->is_dir()) {
7c673cae 6627 CDir *dir = newi->get_dirfrag(frag_t());
11fdf7f2 6628 ceph_assert(dir);
f67539c2 6629 dir->mark_dirty(mdr->ls);
7c673cae
FG
6630 dir->mark_new(mdr->ls);
6631 }
6632
6633 mdr->apply();
6634
6635 MDRequestRef null_ref;
6636 get_mds()->mdcache->send_dentry_link(dn, null_ref);
6637
f67539c2 6638 if (newi->is_file()) {
7c673cae 6639 get_mds()->locker->share_inode_max_size(newi);
f67539c2 6640 } else if (newi->is_dir()) {
f6b5b4d7 6641 // We do this now so that the linkages on the new directory are stable.
f67539c2 6642 newi->maybe_ephemeral_rand();
f6b5b4d7 6643 }
7c673cae
FG
6644
6645 // hit pop
11fdf7f2 6646 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
7c673cae
FG
6647
6648 // reply
6649 server->respond_to_request(mdr, 0);
6650 }
6651};
6652
6653
6654void Server::handle_client_mknod(MDRequestRef& mdr)
6655{
9f95a23c 6656 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 6657 client_t client = mdr->get_client();
9f95a23c
TL
6658
6659 unsigned mode = req->head.args.mknod.mode;
6660 if ((mode & S_IFMT) == 0)
6661 mode |= S_IFREG;
6662
6663 mdr->disable_lock_cache();
6664 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
6665 if (!dn)
7c673cae
FG
6666 return;
6667
9f95a23c
TL
6668 CDir *dir = dn->get_dir();
6669 CInode *diri = dir->get_inode();
7c673cae
FG
6670 if (!check_access(mdr, diri, MAY_WRITE))
6671 return;
20effc67
TL
6672 if (!check_fragment_space(mdr, dir))
6673 return;
6674 if (!check_dir_max_entries(mdr, dir))
7c673cae
FG
6675 return;
6676
f67539c2
TL
6677 ceph_assert(dn->get_projected_linkage()->is_null());
6678 if (req->get_alternate_name().size() > alternate_name_max) {
6679 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6680 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6681 return;
6682 }
6683 dn->set_alternate_name(req->get_alternate_name());
6684
7c673cae
FG
6685 // set layout
6686 file_layout_t layout;
9f95a23c
TL
6687 if (mdr->dir_layout != file_layout_t())
6688 layout = mdr->dir_layout;
7c673cae
FG
6689 else
6690 layout = mdcache->default_file_layout;
6691
11fdf7f2
TL
6692 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6693 ceph_assert(newi);
7c673cae
FG
6694
6695 dn->push_projected_linkage(newi);
6696
f67539c2
TL
6697 auto _inode = newi->_get_inode();
6698 _inode->version = dn->pre_dirty();
6699 _inode->rdev = req->head.args.mknod.rdev;
6700 _inode->rstat.rfiles = 1;
6701 _inode->accounted_rstat = _inode->rstat;
7c673cae 6702 if (layout.pool_id != mdcache->default_file_layout.pool_id)
f67539c2
TL
6703 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
6704 _inode->update_backtrace();
7c673cae 6705
11fdf7f2
TL
6706 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6707 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6708 ceph_assert(follows >= realm->get_newest_seq());
6709
7c673cae
FG
6710 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6711 // want to write to it (e.g., if they are re-exporting it over NFS)
f67539c2 6712 if (S_ISREG(_inode->mode)) {
7c673cae
FG
6713 // issue a cap on the file
6714 int cmode = CEPH_FILE_MODE_RDWR;
9f95a23c 6715 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
7c673cae
FG
6716 if (cap) {
6717 cap->set_wanted(0);
6718
6719 // put locks in excl mode
6720 newi->filelock.set_state(LOCK_EXCL);
6721 newi->authlock.set_state(LOCK_EXCL);
6722 newi->xattrlock.set_state(LOCK_EXCL);
a8e16298
TL
6723
6724 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
f67539c2
TL
6725 _inode->client_ranges[client].range.first = 0;
6726 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
6727 _inode->client_ranges[client].follows = follows;
f91f0fd5 6728 newi->mark_clientwriteable();
a8e16298 6729 cap->mark_clientwriteable();
7c673cae
FG
6730 }
6731 }
6732
11fdf7f2 6733 ceph_assert(dn->first == follows + 1);
7c673cae
FG
6734 newi->first = dn->first;
6735
f67539c2 6736 dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
7c673cae
FG
6737
6738 // prepare finisher
6739 mdr->ls = mdlog->get_current_segment();
6740 EUpdate *le = new EUpdate(mdlog, "mknod");
6741 mdlog->start_entry(le);
6742 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6743 journal_allocated_inos(mdr, &le->metablob);
6744
6745 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6746 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6747 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6748
6749 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
92f5a8d4 6750 mds->balancer->maybe_fragment(dn->get_dir(), false);
7c673cae
FG
6751}
6752
6753
6754
6755// MKDIR
6756/* This function takes responsibility for the passed mdr*/
6757void Server::handle_client_mkdir(MDRequestRef& mdr)
6758{
9f95a23c 6759 const cref_t<MClientRequest> &req = mdr->client_request;
91327a77 6760
9f95a23c
TL
6761 mdr->disable_lock_cache();
6762 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6763 if (!dn)
7c673cae 6764 return;
9f95a23c 6765
7c673cae
FG
6766 CDir *dir = dn->get_dir();
6767 CInode *diri = dir->get_inode();
7c673cae
FG
6768
6769 // mkdir check access
6770 if (!check_access(mdr, diri, MAY_WRITE))
6771 return;
6772
6773 if (!check_fragment_space(mdr, dir))
6774 return;
20effc67
TL
6775 if (!check_dir_max_entries(mdr, dir))
6776 return;
7c673cae 6777
f67539c2
TL
6778 ceph_assert(dn->get_projected_linkage()->is_null());
6779 if (req->get_alternate_name().size() > alternate_name_max) {
6780 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6781 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6782 return;
6783 }
6784 dn->set_alternate_name(req->get_alternate_name());
6785
7c673cae 6786 // new inode
7c673cae
FG
6787 unsigned mode = req->head.args.mkdir.mode;
6788 mode &= ~S_IFMT;
6789 mode |= S_IFDIR;
9f95a23c 6790 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
11fdf7f2 6791 ceph_assert(newi);
7c673cae
FG
6792
6793 // it's a directory.
6794 dn->push_projected_linkage(newi);
6795
f67539c2
TL
6796 auto _inode = newi->_get_inode();
6797 _inode->version = dn->pre_dirty();
6798 _inode->rstat.rsubdirs = 1;
6799 _inode->accounted_rstat = _inode->rstat;
6800 _inode->update_backtrace();
7c673cae 6801
11fdf7f2
TL
6802 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6803 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6804 ceph_assert(follows >= realm->get_newest_seq());
6805
7c673cae 6806 dout(12) << " follows " << follows << dendl;
11fdf7f2 6807 ceph_assert(dn->first == follows + 1);
7c673cae
FG
6808 newi->first = dn->first;
6809
6810 // ...and that new dir is empty.
6811 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
6812 newdir->state_set(CDir::STATE_CREATING);
6813 newdir->mark_complete();
f67539c2 6814 newdir->_get_fnode()->version = newdir->pre_dirty();
7c673cae
FG
6815
6816 // prepare finisher
6817 mdr->ls = mdlog->get_current_segment();
6818 EUpdate *le = new EUpdate(mdlog, "mkdir");
6819 mdlog->start_entry(le);
6820 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6821 journal_allocated_inos(mdr, &le->metablob);
6822 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6823 le->metablob.add_primary_dentry(dn, newi, true, true);
6824 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
6825
6826 // issue a cap on the directory
6827 int cmode = CEPH_FILE_MODE_RDWR;
9f95a23c 6828 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
7c673cae
FG
6829 if (cap) {
6830 cap->set_wanted(0);
6831
6832 // put locks in excl mode
6833 newi->filelock.set_state(LOCK_EXCL);
6834 newi->authlock.set_state(LOCK_EXCL);
6835 newi->xattrlock.set_state(LOCK_EXCL);
6836 }
6837
6838 // make sure this inode gets into the journal
6839 le->metablob.add_opened_ino(newi->ino());
7c673cae
FG
6840
6841 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
81eedcae
TL
6842
6843 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6844 // have overshot the split size (multiple mkdir in flight), so here is
6845 // an early chance to split the dir if this mkdir makes it oversized.
6846 mds->balancer->maybe_fragment(dir, false);
7c673cae
FG
6847}
6848
6849
6850// SYMLINK
6851
6852void Server::handle_client_symlink(MDRequestRef& mdr)
6853{
f67539c2
TL
6854 const auto& req = mdr->client_request;
6855
9f95a23c
TL
6856 mdr->disable_lock_cache();
6857 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6858 if (!dn)
7c673cae 6859 return;
9f95a23c 6860
7c673cae
FG
6861 CDir *dir = dn->get_dir();
6862 CInode *diri = dir->get_inode();
7c673cae
FG
6863
6864 if (!check_access(mdr, diri, MAY_WRITE))
9f95a23c 6865 return;
7c673cae
FG
6866 if (!check_fragment_space(mdr, dir))
6867 return;
20effc67
TL
6868 if (!check_dir_max_entries(mdr, dir))
6869 return;
7c673cae 6870
f67539c2
TL
6871 ceph_assert(dn->get_projected_linkage()->is_null());
6872 if (req->get_alternate_name().size() > alternate_name_max) {
6873 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6874 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG); return;
6875 }
6876 dn->set_alternate_name(req->get_alternate_name());
9f95a23c 6877
7c673cae 6878 unsigned mode = S_IFLNK | 0777;
9f95a23c 6879 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
11fdf7f2 6880 ceph_assert(newi);
7c673cae
FG
6881
6882 // it's a symlink
6883 dn->push_projected_linkage(newi);
6884
11fdf7f2 6885 newi->symlink = req->get_path2();
f67539c2
TL
6886 auto _inode = newi->_get_inode();
6887 _inode->version = dn->pre_dirty();
6888 _inode->size = newi->symlink.length();
6889 _inode->rstat.rbytes = _inode->size;
6890 _inode->rstat.rfiles = 1;
6891 _inode->accounted_rstat = _inode->rstat;
6892 _inode->update_backtrace();
7c673cae
FG
6893
6894 newi->first = dn->first;
6895
6896 // prepare finisher
6897 mdr->ls = mdlog->get_current_segment();
6898 EUpdate *le = new EUpdate(mdlog, "symlink");
6899 mdlog->start_entry(le);
6900 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6901 journal_allocated_inos(mdr, &le->metablob);
6902 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6903 le->metablob.add_primary_dentry(dn, newi, true, true);
6904
6905 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
92f5a8d4 6906 mds->balancer->maybe_fragment(dir, false);
7c673cae
FG
6907}
6908
6909
6910
6911
6912
6913// LINK
6914
6915void Server::handle_client_link(MDRequestRef& mdr)
6916{
9f95a23c 6917 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
6918
6919 dout(7) << "handle_client_link " << req->get_filepath()
6920 << " to " << req->get_filepath2()
6921 << dendl;
6922
9f95a23c 6923 mdr->disable_lock_cache();
7c673cae 6924
9f95a23c
TL
6925 CDentry *destdn;
6926 CInode *targeti;
6927
6928 if (req->get_filepath2().depth() == 0) {
6929 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
6930 if (!targeti) {
f67539c2 6931 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
2a845540
TL
6932 inodeno_t ino = req->get_filepath2().get_ino();
6933 mdcache->find_ino_peers(ino, new C_MDS_TryFindInode(this, mdr, mdcache, ino));
9f95a23c
TL
6934 return;
6935 }
6936 mdr->pin(targeti);
6937
6938 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
6939 CDentry *pdn = targeti->get_projected_parent_dn();
6940 if (!pdn) {
6941 dout(7) << "target has no parent dn, failing..." << dendl;
f67539c2 6942 respond_to_request(mdr, -CEPHFS_EINVAL);
9f95a23c
TL
6943 return;
6944 }
6945 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
6946 return;
6947 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
6948 }
6949
6950 destdn = rdlock_path_xlock_dentry(mdr, false);
6951 if (!destdn)
6952 return;
9f95a23c
TL
6953 } else {
6954 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
6955 destdn = ret.first;
6956 if (!destdn)
6957 return;
6958
6959 if (!destdn->get_projected_linkage()->is_null()) {
f67539c2 6960 respond_to_request(mdr, -CEPHFS_EEXIST);
9f95a23c
TL
6961 return;
6962 }
6963
6964 targeti = ret.second->get_projected_linkage()->get_inode();
6965 }
6966
f67539c2
TL
6967 ceph_assert(destdn->get_projected_linkage()->is_null());
6968 if (req->get_alternate_name().size() > alternate_name_max) {
6969 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6970 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6971 return;
6972 }
6973 destdn->set_alternate_name(req->get_alternate_name());
6974
9f95a23c
TL
6975 if (targeti->is_dir()) {
6976 dout(7) << "target is a dir, failing..." << dendl;
f67539c2 6977 respond_to_request(mdr, -CEPHFS_EINVAL);
7c673cae
FG
6978 return;
6979 }
6980
9f95a23c
TL
6981 CDir *dir = destdn->get_dir();
6982 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
7c673cae 6983 dout(7) << "target is " << *targeti << dendl;
9f95a23c
TL
6984
6985 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6986 MutationImpl::LockOpVec lov;
6987 lov.add_xlock(&targeti->snaplock);
6988 lov.add_xlock(&targeti->linklock);
6989
6990 if (!mds->locker->acquire_locks(mdr, lov))
181888fb 6991 return;
7c673cae 6992
9f95a23c
TL
6993 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6994 }
7c673cae 6995
9f95a23c
TL
6996 if (targeti->get_projected_inode()->nlink == 0) {
6997 dout(7) << "target has no link, failing..." << dendl;
f67539c2 6998 respond_to_request(mdr, -CEPHFS_ENOENT);
20effc67 6999 return;
9f95a23c 7000 }
7c673cae
FG
7001
7002 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7003 if (!check_access(mdr, targeti, MAY_WRITE))
7004 return;
7005
7006 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
7007 return;
7008
7009 if (!check_fragment_space(mdr, dir))
7010 return;
20effc67
TL
7011
7012 if (!check_dir_max_entries(mdr, dir))
7013 return;
7c673cae
FG
7014 }
7015
adb31ebb
TL
7016 CInode* target_pin = targeti->get_projected_parent_dir()->inode;
7017 SnapRealm *target_realm = target_pin->find_snaprealm();
7018 if (target_pin != dir->inode &&
7019 target_realm->get_subvolume_ino() !=
7020 dir->inode->find_snaprealm()->get_subvolume_ino()) {
7021 dout(7) << "target is in different subvolume, failing..." << dendl;
f67539c2 7022 respond_to_request(mdr, -CEPHFS_EXDEV);
adb31ebb
TL
7023 return;
7024 }
7025
7c673cae 7026 // go!
11fdf7f2 7027 ceph_assert(g_conf()->mds_kill_link_at != 1);
7c673cae
FG
7028
7029 // local or remote?
7030 if (targeti->is_auth())
adb31ebb 7031 _link_local(mdr, destdn, targeti, target_realm);
7c673cae 7032 else
9f95a23c 7033 _link_remote(mdr, true, destdn, targeti);
92f5a8d4 7034 mds->balancer->maybe_fragment(dir, false);
7c673cae
FG
7035}
7036
7037
7038class C_MDS_link_local_finish : public ServerLogContext {
7039 CDentry *dn;
7040 CInode *targeti;
7041 version_t dnpv;
7042 version_t tipv;
11fdf7f2 7043 bool adjust_realm;
7c673cae
FG
7044public:
7045 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
11fdf7f2 7046 version_t dnpv_, version_t tipv_, bool ar) :
7c673cae 7047 ServerLogContext(s, r), dn(d), targeti(ti),
11fdf7f2 7048 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
7c673cae 7049 void finish(int r) override {
11fdf7f2
TL
7050 ceph_assert(r == 0);
7051 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
7c673cae
FG
7052 }
7053};
7054
7055
adb31ebb 7056void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
7c673cae
FG
7057{
7058 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
7059
7060 mdr->ls = mdlog->get_current_segment();
7061
7062 // predirty NEW dentry
7063 version_t dnpv = dn->pre_dirty();
7064 version_t tipv = targeti->pre_dirty();
7065
7066 // project inode update
f67539c2
TL
7067 auto pi = targeti->project_inode(mdr);
7068 pi.inode->nlink++;
7069 pi.inode->ctime = mdr->get_op_stamp();
7070 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7071 pi.inode->rstat.rctime = mdr->get_op_stamp();
7072 pi.inode->change_attr++;
7073 pi.inode->version = tipv;
7c673cae 7074
11fdf7f2 7075 bool adjust_realm = false;
adb31ebb 7076 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
11fdf7f2
TL
7077 sr_t *newsnap = targeti->project_snaprealm();
7078 targeti->mark_snaprealm_global(newsnap);
adb31ebb 7079 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
11fdf7f2
TL
7080 adjust_realm = true;
7081 }
7082
7c673cae
FG
7083 // log + wait
7084 EUpdate *le = new EUpdate(mdlog, "link_local");
7085 mdlog->start_entry(le);
7086 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7087 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
7088 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
7089 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7090 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
7091
7092 // do this after predirty_*, to avoid funky extra dnl arg
7093 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7094
11fdf7f2
TL
7095 journal_and_reply(mdr, targeti, dn, le,
7096 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
7c673cae
FG
7097}
7098
7099void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
11fdf7f2 7100 version_t dnpv, version_t tipv, bool adjust_realm)
7c673cae
FG
7101{
7102 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
7103
7104 // link and unlock the NEW dentry
31f18b77
FG
7105 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7106 if (!dnl->get_inode())
7107 dn->link_remote(dnl, targeti);
7c673cae
FG
7108 dn->mark_dirty(dnpv, mdr->ls);
7109
7110 // target inode
7c673cae
FG
7111 mdr->apply();
7112
7113 MDRequestRef null_ref;
7114 mdcache->send_dentry_link(dn, null_ref);
7115
11fdf7f2
TL
7116 if (adjust_realm) {
7117 int op = CEPH_SNAP_OP_SPLIT;
7118 mds->mdcache->send_snap_update(targeti, 0, op);
7119 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7120 }
7121
7c673cae 7122 // bump target popularity
11fdf7f2
TL
7123 mds->balancer->hit_inode(targeti, META_POP_IWR);
7124 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7c673cae
FG
7125
7126 // reply
7127 respond_to_request(mdr, 0);
7128}
7129
7130
7131// link / unlink remote
7132
7133class C_MDS_link_remote_finish : public ServerLogContext {
7134 bool inc;
7135 CDentry *dn;
7136 CInode *targeti;
7137 version_t dpv;
7138public:
7139 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
7140 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
7141 dpv(d->get_projected_version()) {}
7142 void finish(int r) override {
11fdf7f2 7143 ceph_assert(r == 0);
7c673cae
FG
7144 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
7145 }
7146};
7147
7148void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
7149{
7150 dout(10) << "_link_remote "
7151 << (inc ? "link ":"unlink ")
7152 << *dn << " to " << *targeti << dendl;
7153
7154 // 1. send LinkPrepare to dest (journal nlink++ prepare)
7155 mds_rank_t linkauth = targeti->authority().first;
7156 if (mdr->more()->witnessed.count(linkauth) == 0) {
7157 if (mds->is_cluster_degraded() &&
7158 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
7159 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
f67539c2 7160 if (mdr->more()->waiting_on_peer.empty())
7c673cae
FG
7161 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
7162 return;
7163 }
7164
7165 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
7166 int op;
7167 if (inc)
f67539c2 7168 op = MMDSPeerRequest::OP_LINKPREP;
7c673cae 7169 else
f67539c2
TL
7170 op = MMDSPeerRequest::OP_UNLINKPREP;
7171 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, op);
7c673cae
FG
7172 targeti->set_object_info(req->get_object_info());
7173 req->op_stamp = mdr->get_op_stamp();
11fdf7f2
TL
7174 if (auto& desti_srnode = mdr->more()->desti_srnode)
7175 encode(*desti_srnode, req->desti_snapbl);
7c673cae
FG
7176 mds->send_message_mds(req, linkauth);
7177
f67539c2
TL
7178 ceph_assert(mdr->more()->waiting_on_peer.count(linkauth) == 0);
7179 mdr->more()->waiting_on_peer.insert(linkauth);
7c673cae
FG
7180 return;
7181 }
7182 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
7183
11fdf7f2
TL
7184 ceph_assert(g_conf()->mds_kill_link_at != 2);
7185
7186 if (auto& desti_srnode = mdr->more()->desti_srnode) {
7187 delete desti_srnode;
7188 desti_srnode = NULL;
7189 }
7c673cae
FG
7190
7191 mdr->set_mds_stamp(ceph_clock_now());
7192
7193 // add to event
7194 mdr->ls = mdlog->get_current_segment();
7195 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
7196 mdlog->start_entry(le);
7197 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7198 if (!mdr->more()->witnessed.empty()) {
f67539c2 7199 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7c673cae 7200 le->reqid = mdr->reqid;
f67539c2
TL
7201 le->had_peers = true;
7202 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7c673cae
FG
7203 }
7204
7205 if (inc) {
7206 dn->pre_dirty();
7207 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
7208 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
7209 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
7210 } else {
7211 dn->pre_dirty();
7212 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
7213 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7214 le->metablob.add_null_dentry(dn, true);
31f18b77 7215 dn->push_projected_linkage();
7c673cae
FG
7216 }
7217
9f95a23c
TL
7218 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
7219 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
7c673cae
FG
7220}
7221
7222void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
7223 CDentry *dn, CInode *targeti,
7224 version_t dpv)
7225{
7226 dout(10) << "_link_remote_finish "
7227 << (inc ? "link ":"unlink ")
7228 << *dn << " to " << *targeti << dendl;
7229
11fdf7f2 7230 ceph_assert(g_conf()->mds_kill_link_at != 3);
7c673cae
FG
7231
7232 if (!mdr->more()->witnessed.empty())
f67539c2 7233 mdcache->logged_leader_update(mdr->reqid);
7c673cae
FG
7234
7235 if (inc) {
7236 // link the new dentry
31f18b77
FG
7237 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
7238 if (!dnl->get_inode())
7239 dn->link_remote(dnl, targeti);
7c673cae
FG
7240 dn->mark_dirty(dpv, mdr->ls);
7241 } else {
7242 // unlink main dentry
7243 dn->get_dir()->unlink_inode(dn);
31f18b77 7244 dn->pop_projected_linkage();
7c673cae
FG
7245 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
7246 }
7247
7248 mdr->apply();
7249
7250 MDRequestRef null_ref;
7251 if (inc)
7252 mdcache->send_dentry_link(dn, null_ref);
7253 else
7254 mdcache->send_dentry_unlink(dn, NULL, null_ref);
7255
7256 // bump target popularity
11fdf7f2
TL
7257 mds->balancer->hit_inode(targeti, META_POP_IWR);
7258 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7c673cae
FG
7259
7260 // reply
7261 respond_to_request(mdr, 0);
7262
7263 if (!inc)
7264 // removing a new dn?
7265 dn->get_dir()->try_remove_unlinked_dn(dn);
7266}
7267
7268
7269// remote linking/unlinking
7270
f67539c2 7271class C_MDS_PeerLinkPrep : public ServerLogContext {
7c673cae 7272 CInode *targeti;
11fdf7f2 7273 bool adjust_realm;
7c673cae 7274public:
f67539c2 7275 C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
11fdf7f2 7276 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
7c673cae 7277 void finish(int r) override {
11fdf7f2 7278 ceph_assert(r == 0);
f67539c2 7279 server->_logged_peer_link(mdr, targeti, adjust_realm);
7c673cae
FG
7280 }
7281};
7282
f67539c2 7283class C_MDS_PeerLinkCommit : public ServerContext {
7c673cae
FG
7284 MDRequestRef mdr;
7285 CInode *targeti;
7286public:
f67539c2 7287 C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
7c673cae
FG
7288 ServerContext(s), mdr(r), targeti(t) { }
7289 void finish(int r) override {
f67539c2 7290 server->_commit_peer_link(mdr, r, targeti);
7c673cae
FG
7291 }
7292};
7293
f67539c2 7294void Server::handle_peer_link_prep(MDRequestRef& mdr)
7c673cae 7295{
f67539c2
TL
7296 dout(10) << "handle_peer_link_prep " << *mdr
7297 << " on " << mdr->peer_request->get_object_info()
7c673cae
FG
7298 << dendl;
7299
11fdf7f2 7300 ceph_assert(g_conf()->mds_kill_link_at != 4);
7c673cae 7301
f67539c2 7302 CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
11fdf7f2 7303 ceph_assert(targeti);
7c673cae
FG
7304 dout(10) << "targeti " << *targeti << dendl;
7305 CDentry *dn = targeti->get_parent_dn();
7306 CDentry::linkage_t *dnl = dn->get_linkage();
11fdf7f2 7307 ceph_assert(dnl->is_primary());
7c673cae 7308
f67539c2 7309 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7c673cae
FG
7310
7311 mdr->auth_pin(targeti);
7312
f67539c2 7313 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
11fdf7f2 7314 ceph_assert(g_conf()->mds_kill_link_at != 5);
7c673cae
FG
7315
7316 // journal it
7317 mdr->ls = mdlog->get_current_segment();
f67539c2
TL
7318 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
7319 EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
7c673cae
FG
7320 mdlog->start_entry(le);
7321
f67539c2 7322 auto pi = dnl->get_inode()->project_inode(mdr);
7c673cae
FG
7323
7324 // update journaled target inode
7325 bool inc;
11fdf7f2
TL
7326 bool adjust_realm = false;
7327 bool realm_projected = false;
f67539c2 7328 if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
7c673cae 7329 inc = true;
f67539c2 7330 pi.inode->nlink++;
adb31ebb
TL
7331
7332 CDentry *target_pdn = targeti->get_projected_parent_dn();
7333 SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
7334 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
11fdf7f2
TL
7335 sr_t *newsnap = targeti->project_snaprealm();
7336 targeti->mark_snaprealm_global(newsnap);
adb31ebb 7337 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
11fdf7f2
TL
7338 adjust_realm = true;
7339 realm_projected = true;
7340 }
7c673cae
FG
7341 } else {
7342 inc = false;
f67539c2 7343 pi.inode->nlink--;
11fdf7f2 7344 if (targeti->is_projected_snaprealm_global()) {
f67539c2
TL
7345 ceph_assert(mdr->peer_request->desti_snapbl.length());
7346 auto p = mdr->peer_request->desti_snapbl.cbegin();
11fdf7f2
TL
7347
7348 sr_t *newsnap = targeti->project_snaprealm();
7349 decode(*newsnap, p);
7350
f67539c2 7351 if (pi.inode->nlink == 0)
11fdf7f2
TL
7352 ceph_assert(!newsnap->is_parent_global());
7353
7354 realm_projected = true;
7355 } else {
f67539c2 7356 ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
11fdf7f2 7357 }
7c673cae
FG
7358 }
7359
7360 link_rollback rollback;
7361 rollback.reqid = mdr->reqid;
7362 rollback.ino = targeti->ino();
f67539c2
TL
7363 rollback.old_ctime = targeti->get_inode()->ctime; // we hold the versionlock xlock; no concurrent projections
7364 const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
7c673cae
FG
7365 rollback.old_dir_mtime = pf->fragstat.mtime;
7366 rollback.old_dir_rctime = pf->rstat.rctime;
7367 rollback.was_inc = inc;
11fdf7f2
TL
7368 if (realm_projected) {
7369 if (targeti->snaprealm) {
7370 encode(true, rollback.snapbl);
7371 targeti->encode_snap_blob(rollback.snapbl);
7372 } else {
7373 encode(false, rollback.snapbl);
7374 }
7375 }
7376 encode(rollback, le->rollback);
7c673cae
FG
7377 mdr->more()->rollback_bl = le->rollback;
7378
f67539c2
TL
7379 pi.inode->ctime = mdr->get_op_stamp();
7380 pi.inode->version = targeti->pre_dirty();
7c673cae 7381
f67539c2 7382 dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
7c673cae
FG
7383
7384 // commit case
7385 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
7386 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
f67539c2 7387 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7c673cae
FG
7388
7389 // set up commit waiter
f67539c2 7390 mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
7c673cae 7391
f67539c2
TL
7392 mdr->more()->peer_update_journaled = true;
7393 submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
7c673cae
FG
7394 mdr, __func__);
7395 mdlog->flush();
7396}
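// handle_peer_link_prep() above is the prepare phase of the leader/peer
// two-phase update: the peer journals an EPeerUpdate carrying enough
// rollback state (link_rollback) to undo the nlink change, acks with
// OP_LINKPREPACK, and later either commits via _commit_peer_link() or
// undoes the change via do_link_rollback(), depending on the leader's
// decision.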
7397
f67539c2 7398void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
7c673cae 7399{
f67539c2 7400 dout(10) << "_logged_peer_link " << *mdr
7c673cae
FG
7401 << " " << *targeti << dendl;
7402
11fdf7f2 7403 ceph_assert(g_conf()->mds_kill_link_at != 6);
7c673cae
FG
7404
7405 // update the target
7c673cae
FG
7406 mdr->apply();
7407
7408 // hit pop
11fdf7f2 7409 mds->balancer->hit_inode(targeti, META_POP_IWR);
7c673cae
FG
7410
7411 // done.
f67539c2 7412 mdr->reset_peer_request();
7c673cae 7413
11fdf7f2
TL
7414 if (adjust_realm) {
7415 int op = CEPH_SNAP_OP_SPLIT;
7416 mds->mdcache->send_snap_update(targeti, 0, op);
7417 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7418 }
7419
7c673cae
FG
7420 // ack
7421 if (!mdr->aborted) {
f67539c2
TL
7422 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_LINKPREPACK);
7423 mds->send_message_mds(reply, mdr->peer_to_mds);
7c673cae
FG
7424 } else {
7425 dout(10) << " abort flag set, finishing" << dendl;
7426 mdcache->request_finish(mdr);
7427 }
7428}
7429
7430
f67539c2
TL
7431struct C_MDS_CommittedPeer : public ServerLogContext {
7432 C_MDS_CommittedPeer(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
7c673cae 7433 void finish(int r) override {
f67539c2 7434 server->_committed_peer(mdr);
7c673cae
FG
7435 }
7436};
7437
f67539c2 7438void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
7c673cae 7439{
f67539c2 7440 dout(10) << "_commit_peer_link " << *mdr
7c673cae
FG
7441 << " r=" << r
7442 << " " << *targeti << dendl;
7443
11fdf7f2 7444 ceph_assert(g_conf()->mds_kill_link_at != 7);
7c673cae
FG
7445
7446 if (r == 0) {
7447 // drop our pins, etc.
7448 mdr->cleanup();
7449
7450 // write a commit to the journal
f67539c2
TL
7451 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid, mdr->peer_to_mds,
7452 EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
7c673cae 7453 mdlog->start_entry(le);
f67539c2 7454 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7c673cae
FG
7455 mdlog->flush();
7456 } else {
f67539c2 7457 do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7c673cae
FG
7458 }
7459}
7460
f67539c2 7461void Server::_committed_peer(MDRequestRef& mdr)
7c673cae 7462{
f67539c2 7463 dout(10) << "_committed_peer " << *mdr << dendl;
7c673cae 7464
11fdf7f2 7465 ceph_assert(g_conf()->mds_kill_link_at != 8);
7c673cae 7466
f67539c2
TL
7467 bool assert_exist = mdr->more()->peer_update_journaled;
7468 mdcache->finish_uncommitted_peer(mdr->reqid, assert_exist);
7469 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_COMMITTED);
7470 mds->send_message_mds(req, mdr->peer_to_mds);
7c673cae
FG
7471 mdcache->request_finish(mdr);
7472}
7473
7474struct C_MDS_LoggedLinkRollback : public ServerLogContext {
7475 MutationRef mut;
9f95a23c 7476 map<client_t,ref_t<MClientSnap>> splits;
11fdf7f2 7477 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
9f95a23c 7478 map<client_t,ref_t<MClientSnap>>&& _splits) :
11fdf7f2
TL
7479 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
7480 }
7c673cae 7481 void finish(int r) override {
11fdf7f2 7482 server->_link_rollback_finish(mut, mdr, splits);
7c673cae
FG
7483 }
7484};
7485
f67539c2 7486void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7c673cae
FG
7487{
7488 link_rollback rollback;
11fdf7f2
TL
7489 auto p = rbl.cbegin();
7490 decode(rollback, p);
7c673cae
FG
7491
7492 dout(10) << "do_link_rollback on " << rollback.reqid
7493 << (rollback.was_inc ? " inc":" dec")
7494 << " ino " << rollback.ino
7495 << dendl;
7496
11fdf7f2 7497 ceph_assert(g_conf()->mds_kill_link_at != 9);
7c673cae 7498
f67539c2 7499 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
11fdf7f2 7500 ceph_assert(mdr || mds->is_resolve());
7c673cae
FG
7501
7502 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7503 mut->ls = mds->mdlog->get_current_segment();
7504
7505 CInode *in = mdcache->get_inode(rollback.ino);
11fdf7f2 7506 ceph_assert(in);
7c673cae 7507 dout(10) << " target is " << *in << dendl;
f67539c2 7508 ceph_assert(!in->is_projected()); // a live peer request holds the versionlock xlock.
7c673cae 7509
f67539c2
TL
7510 auto pi = in->project_inode(mut);
7511 pi.inode->version = in->pre_dirty();
7c673cae
FG
7512
7513 // parent dir rctime
7514 CDir *parent = in->get_projected_parent_dn()->get_dir();
f67539c2 7515 auto pf = parent->project_fnode(mut);
7c673cae 7516 pf->version = parent->pre_dirty();
f67539c2 7517 if (pf->fragstat.mtime == pi.inode->ctime) {
7c673cae 7518 pf->fragstat.mtime = rollback.old_dir_mtime;
f67539c2 7519 if (pf->rstat.rctime == pi.inode->ctime)
7c673cae
FG
7520 pf->rstat.rctime = rollback.old_dir_rctime;
7521 mut->add_updated_lock(&parent->get_inode()->filelock);
7522 mut->add_updated_lock(&parent->get_inode()->nestlock);
7523 }
7524
7525 // inode
f67539c2 7526 pi.inode->ctime = rollback.old_ctime;
7c673cae 7527 if (rollback.was_inc)
f67539c2 7528 pi.inode->nlink--;
7c673cae 7529 else
f67539c2 7530 pi.inode->nlink++;
7c673cae 7531
9f95a23c 7532 map<client_t,ref_t<MClientSnap>> splits;
11fdf7f2
TL
7533 if (rollback.snapbl.length() && in->snaprealm) {
7534 bool hadrealm;
7535 auto p = rollback.snapbl.cbegin();
7536 decode(hadrealm, p);
7537 if (hadrealm) {
7538 if (!mds->is_resolve()) {
7539 sr_t *new_srnode = new sr_t();
7540 decode(*new_srnode, p);
7541 in->project_snaprealm(new_srnode);
7542 } else {
7543 decode(in->snaprealm->srnode, p);
7544 }
7545 } else {
7546 SnapRealm *realm = parent->get_inode()->find_snaprealm();
7547 if (!mds->is_resolve())
7548 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
7549 in->project_snaprealm(NULL);
7550 }
7551 }
7552
7c673cae 7553 // journal it
f67539c2
TL
7554 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
7555 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
7c673cae
FG
7556 mdlog->start_entry(le);
7557 le->commit.add_dir_context(parent);
7558 le->commit.add_dir(parent, true);
7559 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
7560
11fdf7f2 7561 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
7c673cae
FG
7562 mdr, __func__);
7563 mdlog->flush();
7564}
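/*
 * What the rollback blob lets us undo, per the fields used above
 * (nothing beyond them): the target's nlink bump (was_inc), its old
 * ctime, the parent dir's mtime/rctime if our prepare was what moved
 * them, and any projected snaprealm change.  The rollback itself is
 * journaled as an EPeerUpdate OP_ROLLBACK so a crash mid-rollback is
 * also recoverable.
 */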

void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
                                   map<client_t,ref_t<MClientSnap>>& splits)
{
  dout(10) << "_link_rollback_finish" << dendl;

  ceph_assert(g_conf()->mds_kill_link_at != 10);

  mut->apply();

  if (!mds->is_resolve())
    mdcache->send_snaps(splits);

  if (mdr)
    mdcache->request_finish(mdr);

  mdcache->finish_rollback(mut->reqid, mdr);

  mut->cleanup();
}

void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
{
  dout(10) << "handle_peer_link_prep_ack " << *mdr
           << " " << *m << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  ceph_assert(g_conf()->mds_kill_link_at != 11);

  // note peer
  mdr->more()->peers.insert(from);

  // witnessed!
  ceph_assert(mdr->more()->witnessed.count(from) == 0);
  mdr->more()->witnessed.insert(from);
  ceph_assert(!m->is_not_journaled());
  mdr->more()->has_journaled_peers = true;

  // remove from waiting list
  ceph_assert(mdr->more()->waiting_on_peer.count(from));
  mdr->more()->waiting_on_peer.erase(from);

  ceph_assert(mdr->more()->waiting_on_peer.empty());

  dispatch_client_request(mdr); // go again!
}
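/*
 * Note the unconditional waiting_on_peer.empty() assert above: a
 * remote link/unlink has exactly one witness (the inode auth), so a
 * single ack always drains the wait list.  Contrast
 * handle_peer_rmdir_prep_ack() below, which may still be waiting on
 * other witnesses after an ack arrives.
 */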

// UNLINK

void Server::handle_client_unlink(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  client_t client = mdr->get_client();

  // rmdir or unlink?
  bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);

  if (rmdir)
    mdr->disable_lock_cache();
  CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
  if (!dn)
    return;

  CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
  ceph_assert(!dnl->is_null());
  CInode *in = dnl->get_inode();

  if (rmdir) {
    dout(7) << "handle_client_rmdir on " << *dn << dendl;
  } else {
    dout(7) << "handle_client_unlink on " << *dn << dendl;
  }
  dout(7) << "dn links to " << *in << dendl;

  // rmdir vs is_dir
  if (in->is_dir()) {
    if (rmdir) {
      // do empty directory checks
      if (_dir_is_nonempty_unlocked(mdr, in)) {
        respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
        return;
      }
    } else {
      dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
      respond_to_request(mdr, -CEPHFS_EISDIR);
      return;
    }
  } else {
    if (rmdir) {
      // unlink
      dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
      respond_to_request(mdr, -CEPHFS_ENOTDIR);
      return;
    }
  }

  CInode *diri = dn->get_dir()->get_inode();
  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, diri, MAY_WRITE))
      return;
  }

  // -- create stray dentry? --
  CDentry *straydn = NULL;
  if (dnl->is_primary()) {
    straydn = prepare_stray_dentry(mdr, dnl->get_inode());
    if (!straydn)
      return;
    dout(10) << " straydn is " << *straydn << dendl;
  } else if (mdr->straydn) {
    mdr->unpin(mdr->straydn);
    mdr->straydn = NULL;
  }

  // lock
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;

    lov.add_xlock(&in->linklock);
    lov.add_xlock(&in->snaplock);
    if (in->is_dir())
      lov.add_rdlock(&in->filelock);   // to verify it's empty

    if (straydn) {
      lov.add_wrlock(&straydn->get_dir()->inode->filelock);
      lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
      lov.add_xlock(&straydn->lock);
    }

    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (in->is_dir() &&
      _dir_is_nonempty(mdr, in)) {
    respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
    return;
  }

  if (straydn)
    straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;

  if (!mdr->more()->desti_srnode) {
    if (in->is_projected_snaprealm_global()) {
      sr_t *new_srnode = in->prepare_new_srnode(0);
      in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
      // dropping the last linkage or dropping the last remote linkage,
      // detach the inode from the global snaprealm
      auto nlink = in->get_projected_inode()->nlink;
      if (nlink == 1 ||
          (nlink == 2 && !dnl->is_primary() &&
           !in->get_projected_parent_dir()->inode->is_stray()))
        in->clear_snaprealm_global(new_srnode);
      mdr->more()->desti_srnode = new_srnode;
    } else if (dnl->is_primary()) {
      // prepare snaprealm blob for peer request
      SnapRealm *realm = in->find_snaprealm();
      snapid_t follows = realm->get_newest_seq();
      if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
        sr_t *new_srnode = in->prepare_new_srnode(follows);
        in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
        mdr->more()->desti_srnode = new_srnode;
      }
    }
  }

  // yay!
  if (in->is_dir() && in->has_subtree_root_dirfrag()) {
    // subtree root auths need to be witnesses
    set<mds_rank_t> witnesses;
    in->list_replicas(witnesses);
    dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;

    for (set<mds_rank_t>::iterator p = witnesses.begin();
         p != witnesses.end();
         ++p) {
      if (mdr->more()->witnessed.count(*p)) {
        dout(10) << " already witnessed by mds." << *p << dendl;
      } else if (mdr->more()->waiting_on_peer.count(*p)) {
        dout(10) << " already waiting on witness mds." << *p << dendl;
      } else {
        if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
          return;
      }
    }
    if (!mdr->more()->waiting_on_peer.empty())
      return;  // we're waiting for a witness.
  }

  if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
    mds->locker->create_lock_cache(mdr, diri);

  // ok!
  if (dnl->is_remote() && !dnl->get_inode()->is_auth())
    _link_remote(mdr, false, dn, dnl->get_inode());
  else
    _unlink_local(mdr, dn, straydn);
}
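/*
 * Lock set rationale (as taken above): the target's linklock is
 * xlocked for the nlink change and its snaplock for any snaprealm
 * update; for directories the filelock is rdlocked so emptiness can
 * be verified under lock; and when the primary link is being dropped,
 * the stray dir's filelock/nestlock are wrlocked (its stats change)
 * and the stray dentry is xlocked before the inode is relinked there.
 */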

class C_MDS_unlink_local_finish : public ServerLogContext {
  CDentry *dn;
  CDentry *straydn;
  version_t dnpv;  // deleted dentry
public:
  C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
    ServerLogContext(s, r), dn(d), straydn(sd),
    dnpv(d->get_projected_version()) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_unlink_local_finish(mdr, dn, straydn, dnpv);
  }
};

void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_unlink_local " << *dn << dendl;

  CDentry::linkage_t *dnl = dn->get_projected_linkage();
  CInode *in = dnl->get_inode();

  // ok, let's do it.
  mdr->ls = mdlog->get_current_segment();

  // prepare log entry
  EUpdate *le = new EUpdate(mdlog, "unlink_local");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  if (!mdr->more()->witnessed.empty()) {
    dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
    le->reqid = mdr->reqid;
    le->had_peers = true;
    mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
  }

  if (straydn) {
    ceph_assert(dnl->is_primary());
    straydn->push_projected_linkage(in);
  }

  // the unlinked dentry
  dn->pre_dirty();

  auto pi = in->project_inode(mdr);
  {
    std::string t;
    dn->make_path_string(t, true);
    pi.inode->stray_prior_path = std::move(t);
  }
  pi.inode->version = in->pre_dirty();
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->change_attr++;
  pi.inode->nlink--;
  if (pi.inode->nlink == 0)
    in->state_set(CInode::STATE_ORPHAN);

  if (mdr->more()->desti_srnode) {
    auto& desti_srnode = mdr->more()->desti_srnode;
    in->project_snaprealm(desti_srnode);
    desti_srnode = NULL;
  }

  if (straydn) {
    // will manually pop projected inode

    // primary link.  add stray dentry.
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);

    pi.inode->update_backtrace();
    le->metablob.add_primary_dentry(straydn, in, true, true);
  } else {
    // remote link.  update remote inode.
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
    mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
  }

  mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
  le->metablob.add_null_dentry(dn, true);

  if (in->is_dir()) {
    dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
    le->metablob.renamed_dirino = in->ino();
  }

  dn->push_projected_linkage();

  if (straydn) {
    ceph_assert(in->first <= straydn->first);
    in->first = straydn->first;
  }

  if (in->is_dir()) {
    ceph_assert(straydn);
    mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
  }

  journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
}
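/*
 * Journal layout for a local unlink, per the metablob built above: a
 * null dentry for the unlinked name, plus either a primary dentry
 * under the stray dir (the last primary link moves to stray) or a
 * dirty remote inode.  renamed_dirino is set for directories so
 * replay can re-home any subtrees rooted under the moved dir.
 */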

void Server::_unlink_local_finish(MDRequestRef& mdr,
                                  CDentry *dn, CDentry *straydn,
                                  version_t dnpv)
{
  dout(10) << "_unlink_local_finish " << *dn << dendl;

  if (!mdr->more()->witnessed.empty())
    mdcache->logged_leader_update(mdr->reqid);

  CInode *strayin = NULL;
  bool hadrealm = false;
  if (straydn) {
    // if there is a newly created snaprealm, we need to split the old
    // snaprealm's inodes_with_caps.  So pop the snaprealm before the
    // linkage changes.
    strayin = dn->get_linkage()->get_inode();
    hadrealm = strayin->snaprealm ? true : false;
    strayin->early_pop_projected_snaprealm();
  }

  // unlink main dentry
  dn->get_dir()->unlink_inode(dn);
  dn->pop_projected_linkage();
  dn->mark_dirty(dnpv, mdr->ls);

  // relink as stray?  (i.e. was primary link?)
  if (straydn) {
    dout(20) << " straydn is " << *straydn << dendl;
    straydn->pop_projected_linkage();
    mdcache->touch_dentry_bottom(straydn);
  }

  mdr->apply();

  mdcache->send_dentry_unlink(dn, straydn, mdr);

  if (straydn) {
    // update subtree map?
    if (strayin->is_dir())
      mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);

    if (strayin->snaprealm && !hadrealm)
      mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
  }

  // bump pop
  mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);

  // reply
  respond_to_request(mdr, 0);

  // removing a new dn?
  dn->get_dir()->try_remove_unlinked_dn(dn);

  // clean up?
  // respond_to_request() drops locks.  So stray reintegration can race with us.
  if (straydn && !straydn->get_projected_linkage()->is_null()) {
    // Tip off the MDCache that this dentry is a stray that
    // might be eligible for purge.
    mdcache->notify_stray(straydn);
  }
}

bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
{
  if (mds->is_cluster_degraded() &&
      !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
    dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
    if (mdr->more()->waiting_on_peer.empty())
      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
    return false;
  }

  dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
  auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREP);
  req->srcdnpath = filepath(trace.front()->get_dir()->ino());
  for (auto dn : trace)
    req->srcdnpath.push_dentry(dn->get_name());
  mdcache->encode_replica_stray(straydn, who, req->straybl);
  if (mdr->more()->desti_srnode)
    encode(*mdr->more()->desti_srnode, req->desti_snapbl);

  req->op_stamp = mdr->get_op_stamp();
  mds->send_message_mds(req, who);

  ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
  mdr->more()->waiting_on_peer.insert(who);
  return true;
}
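/*
 * The OP_RMDIRPREP message carries everything the witness needs to
 * mirror the change, as encoded above: the dentry path, a replica of
 * the stray dentry it should relink the inode under, the projected
 * snaprealm blob (if any), and the op timestamp.
 */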

struct C_MDS_PeerRmdirPrep : public ServerLogContext {
  CDentry *dn, *straydn;
  C_MDS_PeerRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
    : ServerLogContext(s, r), dn(d), straydn(st) {}
  void finish(int r) override {
    server->_logged_peer_rmdir(mdr, dn, straydn);
  }
};

struct C_MDS_PeerRmdirCommit : public ServerContext {
  MDRequestRef mdr;
  CDentry *straydn;
  C_MDS_PeerRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
    : ServerContext(s), mdr(r), straydn(sd) { }
  void finish(int r) override {
    server->_commit_peer_rmdir(mdr, r, straydn);
  }
};

void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_peer_rmdir_prep " << *mdr
           << " " << mdr->peer_request->srcdnpath
           << " to " << mdr->peer_request->destdnpath
           << dendl;

  vector<CDentry*> trace;
  filepath srcpath(mdr->peer_request->srcdnpath);
  dout(10) << " src " << srcpath << dendl;
  CInode *in;
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
  int r = mdcache->path_traverse(mdr, cf, srcpath,
                                 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
                                 &trace, &in);
  if (r > 0) return;
  if (r == -CEPHFS_ESTALE) {
    mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
                            mdr->peer_to_mds, true);
    return;
  }
  ceph_assert(r == 0);
  CDentry *dn = trace.back();
  dout(10) << " dn " << *dn << dendl;
  mdr->pin(dn);

  ceph_assert(mdr->straydn);
  CDentry *straydn = mdr->straydn;
  dout(10) << " straydn " << *straydn << dendl;

  mdr->set_op_stamp(mdr->peer_request->op_stamp);

  rmdir_rollback rollback;
  rollback.reqid = mdr->reqid;
  rollback.src_dir = dn->get_dir()->dirfrag();
  rollback.src_dname = dn->get_name();
  rollback.dest_dir = straydn->get_dir()->dirfrag();
  rollback.dest_dname = straydn->get_name();
  if (mdr->peer_request->desti_snapbl.length()) {
    if (in->snaprealm) {
      encode(true, rollback.snapbl);
      in->encode_snap_blob(rollback.snapbl);
    } else {
      encode(false, rollback.snapbl);
    }
  }
  encode(rollback, mdr->more()->rollback_bl);
  // FIXME: rollback snaprealm
  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;

  // set up commit waiter
  mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);

  straydn->push_projected_linkage(in);
  dn->push_projected_linkage();

  ceph_assert(straydn->first >= in->first);
  in->first = straydn->first;

  if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
    dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
    _logged_peer_rmdir(mdr, dn, straydn);
    return;
  }

  mdr->ls = mdlog->get_current_segment();
  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
                                    EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
  mdlog->start_entry(le);
  le->rollback = mdr->more()->rollback_bl;

  le->commit.add_dir_context(straydn->get_dir());
  le->commit.add_primary_dentry(straydn, in, true);
  // peer: no need to journal original dentry

  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
  le->commit.renamed_dirino = in->ino();

  mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
  mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);

  mdr->more()->peer_update_journaled = true;
  submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
                     mdr, __func__);
  mdlog->flush();
}
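/*
 * Note the short-circuit above: a witness only journals the prepare
 * when it is auth for a subtree rooted under the dir being removed.
 * Otherwise nothing on this rank needs replay, so we skip straight to
 * _logged_peer_rmdir() and the ack is marked not_journaled, telling
 * the leader there is no EPeerUpdate here to commit or roll back.
 */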

void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
  CInode *in = dn->get_linkage()->get_inode();

  bool new_realm;
  if (mdr->peer_request->desti_snapbl.length()) {
    new_realm = !in->snaprealm;
    in->decode_snap_blob(mdr->peer_request->desti_snapbl);
    ceph_assert(in->snaprealm);
  } else {
    new_realm = false;
  }

  // update our cache now, so we are consistent with what is in the journal
  // when we journal a subtree map
  dn->get_dir()->unlink_inode(dn);
  straydn->pop_projected_linkage();
  dn->pop_projected_linkage();

  mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->peer_update_journaled);

  if (new_realm)
    mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);

  // done.
  mdr->reset_peer_request();
  mdr->straydn = 0;

  if (!mdr->aborted) {
    auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREPACK);
    if (!mdr->more()->peer_update_journaled)
      reply->mark_not_journaled();
    mds->send_message_mds(reply, mdr->peer_to_mds);
  } else {
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}

void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
{
  dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
           << " " << *ack << dendl;

  mds_rank_t from = mds_rank_t(ack->get_source().num());

  mdr->more()->peers.insert(from);
  mdr->more()->witnessed.insert(from);
  if (!ack->is_not_journaled())
    mdr->more()->has_journaled_peers = true;

  // remove from waiting list
  ceph_assert(mdr->more()->waiting_on_peer.count(from));
  mdr->more()->waiting_on_peer.erase(from);

  if (mdr->more()->waiting_on_peer.empty())
    dispatch_client_request(mdr);  // go again!
  else
    dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
}

void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
{
  dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;

  if (r == 0) {
    if (mdr->more()->peer_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
        mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mdr->cleanup();

    if (mdr->more()->peer_update_journaled) {
      // write a commit to the journal
      EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
                                        mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
                                        EPeerUpdate::RMDIR);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      _committed_peer(mdr);
    }
  } else {
    // abort
    do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
  }
}

struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
  metareqid_t reqid;
  CDentry *dn;
  CDentry *straydn;
  C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
    : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
  void finish(int r) override {
    server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
  }
};

void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
{
  // unlike the other rollback methods, the rmdir rollback is only
  // needed to record the subtree changes in the journal for inode
  // replicas who are auth for empty dirfrags.  no actual changes to
  // the file system are taking place here, so there is no Mutation.

  rmdir_rollback rollback;
  auto p = rbl.cbegin();
  decode(rollback, p);

  dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
  mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
  ceph_assert(mdr || mds->is_resolve());

  CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
  if (!dir)
    dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
  ceph_assert(dir);
  CDentry *dn = dir->lookup(rollback.src_dname);
  ceph_assert(dn);
  dout(10) << " dn " << *dn << dendl;
  CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
  ceph_assert(straydir);
  CDentry *straydn = straydir->lookup(rollback.dest_dname);
  ceph_assert(straydn);
  dout(10) << " straydn " << *straydn << dendl;
  CInode *in = straydn->get_linkage()->get_inode();

  dn->push_projected_linkage(in);
  straydn->push_projected_linkage();

  if (rollback.snapbl.length() && in->snaprealm) {
    bool hadrealm;
    auto p = rollback.snapbl.cbegin();
    decode(hadrealm, p);
    if (hadrealm) {
      decode(in->snaprealm->srnode, p);
    } else {
      in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
    }
  }

  if (mdr && !mdr->more()->peer_update_journaled) {
    ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));

    _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
    return;
  }

  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
                                    EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
  mdlog->start_entry(le);

  le->commit.add_dir_context(dn->get_dir());
  le->commit.add_primary_dentry(dn, in, true);
  // peer: no need to journal straydn

  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
  le->commit.renamed_dirino = in->ino();

  mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());

  submit_mdlog_entry(le,
                     new C_MDS_LoggedRmdirRollback(this, mdr, rollback.reqid,
                                                   dn, straydn),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_rmdir_rollback_finish " << reqid << dendl;

  straydn->get_dir()->unlink_inode(straydn);
  dn->pop_projected_linkage();
  straydn->pop_projected_linkage();

  CInode *in = dn->get_linkage()->get_inode();
  mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
                                       !mdr || mdr->more()->peer_update_journaled);

  if (mds->is_resolve()) {
    CDir *root = mdcache->get_subtree_root(straydn->get_dir());
    mdcache->try_trim_non_auth_subtree(root);
  }

  if (mdr)
    mdcache->request_finish(mdr);

  mdcache->finish_rollback(reqid, mdr);
}


/** _dir_is_nonempty[_unlocked]
 *
 * check if a directory is non-empty (i.e. whether we can rmdir it).
 *
 * the unlocked variant is a fastpath check.  we can't really be
 * sure until we rdlock the filelock.
 */
bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
{
  dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
  ceph_assert(in->is_auth());

  if (in->filelock.is_cached())
    return false;  // there can be pending async create/unlink.  don't know.
  if (in->snaprealm && in->snaprealm->srnode.snaps.size())
    return true;  // in a snapshot!

  auto&& ls = in->get_dirfrags();
  for (const auto& dir : ls) {
    // is the frag obviously non-empty?
    if (dir->is_auth()) {
      if (dir->get_projected_fnode()->fragstat.size()) {
        dout(10) << "dir_is_nonempty_unlocked dirstat has "
                 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
        return true;
      }
    }
  }

  return false;
}

bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
{
  dout(10) << "dir_is_nonempty " << *in << dendl;
  ceph_assert(in->is_auth());
  ceph_assert(in->filelock.can_read(mdr->get_client()));

  frag_info_t dirstat;
  version_t dirstat_version = in->get_projected_inode()->dirstat.version;

  auto&& ls = in->get_dirfrags();
  for (const auto& dir : ls) {
    const auto& pf = dir->get_projected_fnode();
    if (pf->fragstat.size()) {
      dout(10) << "dir_is_nonempty dirstat has "
               << pf->fragstat.size() << " items " << *dir << dendl;
      return true;
    }

    if (pf->accounted_fragstat.version == dirstat_version)
      dirstat.add(pf->accounted_fragstat);
    else
      dirstat.add(pf->fragstat);
  }

  return dirstat.size() != in->get_projected_inode()->dirstat.size();
}
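/*
 * The final comparison is a conservative cross-check: each frag's
 * (accounted) fragstat is summed and compared against the inode's
 * aggregate dirstat.  If, say, dirstat.size() is 1 but the cached
 * fragstats sum to 0, some frag's stats haven't propagated yet, so we
 * report non-empty rather than risk removing a dir with an
 * unaccounted entry.
 */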


// ======================================================


class C_MDS_rename_finish : public ServerLogContext {
  CDentry *srcdn;
  CDentry *destdn;
  CDentry *straydn;
public:
  C_MDS_rename_finish(Server *s, MDRequestRef& r,
                      CDentry *sdn, CDentry *ddn, CDentry *stdn) :
    ServerLogContext(s, r),
    srcdn(sdn), destdn(ddn), straydn(stdn) { }
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_rename_finish(mdr, srcdn, destdn, straydn);
  }
};


/** handle_client_rename
 *
 * rename leader is the destdn auth.  this is because cached inodes
 * must remain connected.  thus, any replica of srci must also
 * replicate destdn, and possibly straydn, so that srci (and
 * destdn->inode) remain connected during the rename.
 *
 * to do this, we freeze srci, then the leader (destdn auth) verifies that
 * all other nodes have also replicated destdn and straydn.  note that
 * destdn replicas need not also replicate srci.  this only works when
 * destdn is leader.
 *
 * This function takes responsibility for the passed mdr.
 */
void Server::handle_client_rename(MDRequestRef& mdr)
{
  const auto& req = mdr->client_request;
  dout(7) << "handle_client_rename " << *req << dendl;

  filepath destpath = req->get_filepath();
  filepath srcpath = req->get_filepath2();
  if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
    respond_to_request(mdr, -CEPHFS_EBUSY);
    return;
  }

  if (req->get_alternate_name().size() > alternate_name_max) {
    dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
    respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
    return;
  }

  auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
  if (!destdn)
    return;

  dout(10) << " destdn " << *destdn << dendl;
  CDir *destdir = destdn->get_dir();
  ceph_assert(destdir->is_auth());
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();

  dout(10) << " srcdn " << *srcdn << dendl;
  CDir *srcdir = srcdn->get_dir();
  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  CInode *srci = srcdnl->get_inode();
  dout(10) << " srci " << *srci << dendl;

  // -- some sanity checks --
  if (destdn == srcdn) {
    dout(7) << "rename src=dest, noop" << dendl;
    respond_to_request(mdr, 0);
    return;
  }

  // dest a child of src?
  // e.g. mv /usr /usr/foo
  if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
    dout(7) << "cannot rename item to be a child of itself" << dendl;
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  // is this a stray migration, reintegration or merge? (sanity checks!)
  if (mdr->reqid.name.is_mds() &&
      !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
        MDS_INO_IS_STRAY(destpath.get_ino())) &&
      !(destdnl->is_remote() &&
        destdnl->get_remote_ino() == srci->ino())) {
    respond_to_request(mdr, -CEPHFS_EINVAL);  // actually, this won't reply, but whatev.
    return;
  }

  CInode *oldin = 0;
  if (!destdnl->is_null()) {
    //dout(10) << "dest dn exists " << *destdn << dendl;
    oldin = mdcache->get_dentry_inode(destdn, mdr, true);
    if (!oldin) return;
    dout(10) << " oldin " << *oldin << dendl;

    // non-empty dir? do trivial fast unlocked check, do another check later with read locks
    if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
      respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
      return;
    }

    // mv /some/thing /to/some/existing_other_thing
    if (oldin->is_dir() && !srci->is_dir()) {
      respond_to_request(mdr, -CEPHFS_EISDIR);
      return;
    }
    if (!oldin->is_dir() && srci->is_dir()) {
      respond_to_request(mdr, -CEPHFS_ENOTDIR);
      return;
    }
    if (srci == oldin && !srcdir->inode->is_stray()) {
      respond_to_request(mdr, 0);  // no-op.  POSIX makes no sense.
      return;
    }
    if (destdn->get_alternate_name() != req->get_alternate_name()) {
      /* the dentry exists but the alternate_names do not match, fail... */
      respond_to_request(mdr, -CEPHFS_EINVAL);
      return;
    }
  }

  vector<CDentry*>& srctrace = mdr->dn[1];
  vector<CDentry*>& desttrace = mdr->dn[0];

  // src+dest traces _must_ share a common ancestor for locking to prevent orphans
  if (destpath.get_ino() != srcpath.get_ino() &&
      !(req->get_source().is_mds() &&
        MDS_INO_IS_STRAY(srcpath.get_ino()))) {  // <-- mds 'rename' out of stray dir is ok!
    CInode *srcbase = srctrace[0]->get_dir()->get_inode();
    CInode *destbase = desttrace[0]->get_dir()->get_inode();
    // ok, extend srctrace toward root until it is an ancestor of desttrace.
    while (srcbase != destbase &&
           !srcbase->is_projected_ancestor_of(destbase)) {
      CDentry *pdn = srcbase->get_projected_parent_dn();
      srctrace.insert(srctrace.begin(), pdn);
      dout(10) << "rename prepending srctrace with " << *pdn << dendl;
      srcbase = pdn->get_dir()->get_inode();
    }

    // then, extend destpath until it shares the same parent inode as srcpath.
    while (destbase != srcbase) {
      CDentry *pdn = destbase->get_projected_parent_dn();
      desttrace.insert(desttrace.begin(), pdn);
      dout(10) << "rename prepending desttrace with " << *pdn << dendl;
      destbase = pdn->get_dir()->get_inode();
    }
    dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
  }

  bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
  if (linkmerge)
    dout(10) << " this is a link merge" << dendl;

  // -- create stray dentry? --
  CDentry *straydn = NULL;
  if (destdnl->is_primary() && !linkmerge) {
    straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
    if (!straydn)
      return;
    dout(10) << " straydn is " << *straydn << dendl;
  } else if (mdr->straydn) {
    mdr->unpin(mdr->straydn);
    mdr->straydn = NULL;
  }

  // -- locks --
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;

    // we need to update srci's ctime.  xlock its least contended lock to do that...
    lov.add_xlock(&srci->linklock);
    lov.add_xlock(&srci->snaplock);

    if (oldin) {
      // xlock oldin (for nlink--)
      lov.add_xlock(&oldin->linklock);
      lov.add_xlock(&oldin->snaplock);
      if (oldin->is_dir()) {
        ceph_assert(srci->is_dir());
        lov.add_rdlock(&oldin->filelock);  // to verify it's empty

        // adjust locking order?
        int cmp = mdr->compare_paths();
        if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
          std::reverse(lov.begin(), lov.end());
      } else {
        ceph_assert(!srci->is_dir());
        // adjust locking order;
        if (srci->ino() > oldin->ino())
          std::reverse(lov.begin(), lov.end());
      }
    }

    // straydn?
    if (straydn) {
      lov.add_wrlock(&straydn->get_dir()->inode->filelock);
      lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
      lov.add_xlock(&straydn->lock);
    }

    CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
    if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
      return;

    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (linkmerge)
    ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());

  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
      return;

    if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
      return;

    if (!linkmerge && !check_fragment_space(mdr, destdn->get_dir()))
      return;

    if (!linkmerge && !check_dir_max_entries(mdr, destdn->get_dir()))
      return;

    if (!check_access(mdr, srci, MAY_WRITE))
      return;
  }

  // with read lock, really verify oldin is empty
  if (oldin &&
      oldin->is_dir() &&
      _dir_is_nonempty(mdr, oldin)) {
    respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
    return;
  }

  /* project_snaprealm_past_parent() will do this job
   *
  // moving between snaprealms?
  if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
    SnapRealm *srcrealm = srci->find_snaprealm();
    SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
    if (srcrealm != destrealm &&
        (srcrealm->get_newest_seq() + 1 > srcdn->first ||
         destrealm->get_newest_seq() + 1 > srcdn->first)) {
      dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
      mdcache->snaprealm_create(mdr, srci);
      return;
    }
  }
  */

  SnapRealm *dest_realm = nullptr;
  SnapRealm *src_realm = nullptr;
  if (!linkmerge) {
    dest_realm = destdir->inode->find_snaprealm();
    if (srcdir->inode == destdir->inode)
      src_realm = dest_realm;
    else
      src_realm = srcdir->inode->find_snaprealm();
    if (src_realm != dest_realm &&
        src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
      respond_to_request(mdr, -CEPHFS_EXDEV);
      return;
    }
  }

  ceph_assert(g_conf()->mds_kill_rename_at != 1);

  // -- open all srcdn inode frags, if any --
  // we need these open so that auth can properly delegate from inode to dirfrags
  // after the inode is _ours_.
  if (srcdnl->is_primary() &&
      !srcdn->is_auth() &&
      srci->is_dir()) {
    dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
    mdr->set_stickydirs(srci);

    frag_vec_t leaves;
    srci->dirfragtree.get_leaves(leaves);
    for (const auto& leaf : leaves) {
      CDir *dir = srci->get_dirfrag(leaf);
      if (!dir) {
        dout(10) << " opening " << leaf << " under " << *srci << dendl;
        mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
        return;
      }
    }
  }

  // -- prepare snaprealm ---

  if (linkmerge) {
    if (!mdr->more()->srci_srnode &&
        srci->get_projected_inode()->nlink == 1 &&
        srci->is_projected_snaprealm_global()) {
      sr_t *new_srnode = srci->prepare_new_srnode(0);
      srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);

      srci->clear_snaprealm_global(new_srnode);
      mdr->more()->srci_srnode = new_srnode;
    }
  } else {
    if (oldin && !mdr->more()->desti_srnode) {
      if (oldin->is_projected_snaprealm_global()) {
        sr_t *new_srnode = oldin->prepare_new_srnode(0);
        oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
        // dropping the last linkage or dropping the last remote linkage,
        // detach the inode from the global snaprealm
        auto nlink = oldin->get_projected_inode()->nlink;
        if (nlink == 1 ||
            (nlink == 2 && !destdnl->is_primary() &&
             !oldin->get_projected_parent_dir()->inode->is_stray()))
          oldin->clear_snaprealm_global(new_srnode);
        mdr->more()->desti_srnode = new_srnode;
      } else if (destdnl->is_primary()) {
        snapid_t follows = dest_realm->get_newest_seq();
        if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
          sr_t *new_srnode = oldin->prepare_new_srnode(follows);
          oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
          mdr->more()->desti_srnode = new_srnode;
        }
      }
    }
    if (!mdr->more()->srci_srnode) {
      if (srci->is_projected_snaprealm_global()) {
        sr_t *new_srnode = srci->prepare_new_srnode(0);
        srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
        mdr->more()->srci_srnode = new_srnode;
      } else if (srcdnl->is_primary()) {
        snapid_t follows = src_realm->get_newest_seq();
        if (src_realm != dest_realm &&
            (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
          sr_t *new_srnode = srci->prepare_new_srnode(follows);
          srci->record_snaprealm_past_parent(new_srnode, dest_realm);
          mdr->more()->srci_srnode = new_srnode;
        }
      }
    }
  }

  // -- prepare witnesses --

  /*
   * NOTE: we use _all_ replicas as witnesses.
   * this probably isn't totally necessary (esp for file renames),
   * but if/when we change that, we have to make sure rejoin is
   * sufficiently robust to handle strong rejoins from survivors
   * with totally wrong dentry->inode linkage.
   * (currently, it can ignore rename effects, because the resolve
   * stage will sort them out.)
   */
  set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
  if (srcdn->is_auth())
    srcdn->list_replicas(witnesses);
  else
    witnesses.insert(srcdn->authority().first);
  if (srcdnl->is_remote() && !srci->is_auth())
    witnesses.insert(srci->authority().first);
  destdn->list_replicas(witnesses);
  if (destdnl->is_remote() && !oldin->is_auth())
    witnesses.insert(oldin->authority().first);
  dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;

  if (!witnesses.empty()) {
    // Replicas can't see projected dentry linkages and will get confused.
    // We have taken snaplocks on ancestor inodes.  Later rename/rmdir requests
    // can't project these inodes' linkages.
    bool need_flush = false;
    for (auto& dn : srctrace) {
      if (dn->is_projected()) {
        need_flush = true;
        break;
      }
    }
    if (!need_flush) {
      CDentry *dn = destdn;
      do {
        if (dn->is_projected()) {
          need_flush = true;
          break;
        }
        CInode *diri = dn->get_dir()->get_inode();
        dn = diri->get_projected_parent_dn();
      } while (dn);
    }
    if (need_flush) {
      mdlog->wait_for_safe(
          new MDSInternalContextWrapper(mds,
            new C_MDS_RetryRequest(mdcache, mdr)));
      mdlog->flush();
      return;
    }
  }

  // do srcdn auth last
  mds_rank_t last = MDS_RANK_NONE;
  if (!srcdn->is_auth()) {
    last = srcdn->authority().first;
    mdr->more()->srcdn_auth_mds = last;
    // ask auth of srci to mark srci as ambiguous auth if more than two MDS
    // are involved in the rename operation.
    if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
      dout(10) << " preparing ambiguous auth for srci" << dendl;
      ceph_assert(mdr->more()->is_remote_frozen_authpin);
      ceph_assert(mdr->more()->rename_inode == srci);
      _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
      return;
    }
  }

  for (set<mds_rank_t>::iterator p = witnesses.begin();
       p != witnesses.end();
       ++p) {
    if (*p == last) continue;  // do it last!
    if (mdr->more()->witnessed.count(*p)) {
      dout(10) << " already witnessed by mds." << *p << dendl;
    } else if (mdr->more()->waiting_on_peer.count(*p)) {
      dout(10) << " already waiting on witness mds." << *p << dendl;
    } else {
      if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
        return;
    }
  }
  if (!mdr->more()->waiting_on_peer.empty())
    return;  // we're waiting for a witness.

  if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
    dout(10) << " preparing last witness (srcdn auth)" << dendl;
    ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
    _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
    return;
  }

  // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
  if (!mdr->more()->peers.empty() && !srci->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 3);
  if (!mdr->more()->peers.empty() && srci->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 4);

  // -- declare now --
  mdr->set_mds_stamp(ceph_clock_now());

  // -- prepare journal entry --
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rename");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
  if (!mdr->more()->witnessed.empty()) {
    dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;

    le->reqid = mdr->reqid;
    le->had_peers = true;

    mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
    // no need to send frozen auth pin to recovering auth MDS of srci
    mdr->more()->is_remote_frozen_authpin = false;
  }

  _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
  if (le->client_map.length())
    le->cmapv = mds->sessionmap.get_projected();

  // -- commit locally --
  C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);

  journal_and_reply(mdr, srci, destdn, le, fin);
  mds->balancer->maybe_fragment(destdn->get_dir(), false);
}
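/*
 * Witness sequencing, as implemented above: every non-srcdn-auth
 * witness is prepared first, and the srcdn auth is deliberately saved
 * for last; its prepare is also what marks srci ambiguous auth when a
 * primary dentry migrates between ranks.  Only once waiting_on_peer
 * drains and the last witness has acked do we journal the EUpdate and
 * reply.
 */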

void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_finish " << *mdr << dendl;

  if (!mdr->more()->witnessed.empty())
    mdcache->logged_leader_update(mdr->reqid);

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  mdcache->send_dentry_link(destdn, mdr);

  CDentry::linkage_t *destdnl = destdn->get_linkage();
  CInode *in = destdnl->get_inode();
  bool need_eval = mdr->more()->cap_imports.count(in);

  // test hack: test peer commit
  if (!mdr->more()->peers.empty() && !in->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 5);
  if (!mdr->more()->peers.empty() && in->is_dir())
    ceph_assert(g_conf()->mds_kill_rename_at != 6);

  // bump popularity
  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
  if (destdnl->is_remote() && in->is_auth())
    mds->balancer->hit_inode(in, META_POP_IWR);

  // did we import srci?  if so, explicitly ack that import before we unlock and reply.

  ceph_assert(g_conf()->mds_kill_rename_at != 7);

  // reply
  respond_to_request(mdr, 0);

  if (need_eval)
    mds->locker->eval(in, CEPH_CAP_LOCKS, true);

  // clean up?
  // respond_to_request() drops locks.  So stray reintegration can race with us.
  if (straydn && !straydn->get_projected_linkage()->is_null()) {
    mdcache->notify_stray(straydn);
  }
}


// helpers

bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
                                     vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
{
  const auto& client_req = mdr->client_request;
  ceph_assert(client_req);

  if (mds->is_cluster_degraded() &&
      !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
    dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
    if (mdr->more()->waiting_on_peer.empty())
      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
    return false;
  }

  dout(10) << "_rename_prepare_witness mds." << who << dendl;
  auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);

  req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
  for (auto dn : srctrace)
    req->srcdnpath.push_dentry(dn->get_name());
  req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
  for (auto dn : dsttrace)
    req->destdnpath.push_dentry(dn->get_name());
  req->alternate_name = client_req->alternate_name;
  if (straydn)
    mdcache->encode_replica_stray(straydn, who, req->straybl);

  if (mdr->more()->srci_srnode)
    encode(*mdr->more()->srci_srnode, req->srci_snapbl);
  if (mdr->more()->desti_srnode)
    encode(*mdr->more()->desti_srnode, req->desti_snapbl);

  req->srcdn_auth = mdr->more()->srcdn_auth_mds;

  // srcdn auth will verify our current witness list is sufficient
  req->witnesses = witnesse;

  req->op_stamp = mdr->get_op_stamp();
  mds->send_message_mds(req, who);

  ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
  mdr->more()->waiting_on_peer.insert(who);
  return true;
}

version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
{
  version_t oldpv = mdr->more()->inode_import_v;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();

  /* import node */
  auto blp = mdr->more()->inode_import.cbegin();

  // imported caps
  map<client_t,entity_inst_t> client_map;
  map<client_t, client_metadata_t> client_metadata_map;
  decode(client_map, blp);
  decode(client_metadata_map, blp);
  prepare_force_open_sessions(client_map, client_metadata_map,
                              mdr->more()->imported_session_map);
  encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
  encode(client_metadata_map, *client_map_bl);

  list<ScatterLock*> updated_scatterlocks;
  mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
                                         mdr->more()->cap_imports, updated_scatterlocks);

  // hack: force back to !auth and clean, temporarily
  srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
  srcdnl->get_inode()->mark_clean();

  return oldpv;
}
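/*
 * Import sketch: when the rename pulls a primary inode from another
 * rank, the bundled blob decoded above holds the client map and
 * client metadata (used to force-open their sessions here), followed
 * by the inode itself and its caps.  The client map is re-encoded
 * into the journal entry's client_map so replay can rebuild those
 * sessions.
 */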

bool Server::_need_force_journal(CInode *diri, bool empty)
{
  auto&& dirs = diri->get_dirfrags();

  bool force_journal = false;
  if (empty) {
    for (const auto& dir : dirs) {
      if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
        dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
        force_journal = true;
        break;
      } else
        dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
    }
  } else {
    // see if any children of our frags are auth subtrees.
    std::vector<CDir*> subtrees;
    mdcache->get_subtrees(subtrees);
    dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
    for (const auto& dir : dirs) {
      for (const auto& subtree : subtrees) {
        if (dir->contains(subtree)) {
          if (subtree->get_dir_auth().first == mds->get_nodeid()) {
            dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
                     << *subtree << dendl;
            force_journal = true;
            break;
          } else
            dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
        } else
          dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
      }
      if (force_journal)
        break;
    }
  }
  return force_journal;
}
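/*
 * Why force the journal entry even when we aren't the dentry auth: if
 * we hold auth subtrees at or under the renamed directory, our own
 * journal must record the move, or replay would reattach those
 * subtrees at the stale location.  _rename_prepare() below uses this
 * for both the destination and the overwritten (stray) side.
 */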
8964
8965void Server::_rename_prepare(MDRequestRef& mdr,
8966 EMetaBlob *metablob, bufferlist *client_map_bl,
f67539c2
TL
8967 CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
8968 CDentry *straydn)
7c673cae
FG
8969{
8970 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8971 if (straydn)
8972 dout(10) << " straydn " << *straydn << dendl;
8973
8974 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8975 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8976 CInode *srci = srcdnl->get_inode();
8977 CInode *oldin = destdnl->get_inode();
8978
8979 // primary+remote link merge?
11fdf7f2
TL
8980 bool linkmerge = (srci == oldin);
8981 if (linkmerge)
8982 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
7c673cae
FG
8983 bool silent = srcdn->get_dir()->inode->is_stray();
8984
8985 bool force_journal_dest = false;
8986 if (srci->is_dir() && !destdn->is_auth()) {
8987 if (srci->is_auth()) {
8988 // if we are auth for srci and exporting it, force journal because journal replay needs
8989 // the source inode to create auth subtrees.
8990 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
8991 force_journal_dest = true;
8992 } else
8993 force_journal_dest = _need_force_journal(srci, false);
8994 }
8995
8996 bool force_journal_stray = false;
8997 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
8998 force_journal_stray = _need_force_journal(oldin, true);
8999
9000 if (linkmerge)
9001 dout(10) << " merging remote and primary links to the same inode" << dendl;
9002 if (silent)
9003 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
9004 if (force_journal_dest)
9005 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
9006 if (force_journal_stray)
9007 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
9008
9009 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
9010 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
9011 metablob->renamed_dirino = srci->ino();
9012 } else if (oldin && oldin->is_dir() && force_journal_stray) {
9013 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
9014 metablob->renamed_dirino = oldin->ino();
9015 }

  // prepare
  CInode::mempool_inode *spi = 0; // renamed inode
  CInode::mempool_inode *tpi = 0; // target/overwritten inode

  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      ceph_assert(straydn); // moving to straydn.
      // link--, and move.
      if (destdn->is_auth()) {
        auto pi = oldin->project_inode(mdr); // project_snaprealm
        pi.inode->version = straydn->pre_dirty(pi.inode->version);
        pi.inode->update_backtrace();
        tpi = pi.inode.get();
      }
      straydn->push_projected_linkage(oldin);
    } else if (destdnl->is_remote()) {
      // nlink-- targeti
      if (oldin->is_auth()) {
        auto pi = oldin->project_inode(mdr);
        pi.inode->version = oldin->pre_dirty();
        tpi = pi.inode.get();
      }
    }
  }

  // dest
  if (destdnl->is_null()) {
    /* handle_client_rename checks that alternate_name matches for existing destdn */
    destdn->set_alternate_name(alternate_name);
  }
  if (srcdnl->is_remote()) {
    if (!linkmerge) {
      // destdn
      if (destdn->is_auth())
        mdr->more()->pvmap[destdn] = destdn->pre_dirty();
      destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
      // srci
      if (srci->is_auth()) {
        auto pi = srci->project_inode(mdr);
        pi.inode->version = srci->pre_dirty();
        spi = pi.inode.get();
      }
    } else {
      dout(10) << " will merge remote onto primary link" << dendl;
      if (destdn->is_auth()) {
        auto pi = oldin->project_inode(mdr);
        pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
        spi = pi.inode.get();
      }
    }
  } else { // primary
    if (destdn->is_auth()) {
      version_t oldpv;
      if (srcdn->is_auth())
        oldpv = srci->get_projected_version();
      else {
        oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);

        // note which dirfrags have child subtrees in the journal
        // event, so that we can open those (as bounds) during replay.
        if (srci->is_dir()) {
          auto&& ls = srci->get_dirfrags();
          for (const auto& dir : ls) {
            if (!dir->is_auth())
              metablob->renamed_dir_frags.push_back(dir->get_frag());
          }
          dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
        }
      }
      auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
                                          // & srcdnl->snaprealm
      pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
      pi.inode->update_backtrace();
      spi = pi.inode.get();
    }
    destdn->push_projected_linkage(srci);
  }

  // src
  if (srcdn->is_auth())
    mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
  srcdn->push_projected_linkage();  // push null linkage

  if (!silent) {
    if (spi) {
      spi->ctime = mdr->get_op_stamp();
      if (mdr->get_op_stamp() > spi->rstat.rctime)
        spi->rstat.rctime = mdr->get_op_stamp();
      spi->change_attr++;
      if (linkmerge)
        spi->nlink--;
    }
    if (tpi) {
      tpi->ctime = mdr->get_op_stamp();
      if (mdr->get_op_stamp() > tpi->rstat.rctime)
        tpi->rstat.rctime = mdr->get_op_stamp();
      tpi->change_attr++;
      {
        std::string t;
        destdn->make_path_string(t, true);
        tpi->stray_prior_path = std::move(t);
      }
      tpi->nlink--;
      if (tpi->nlink == 0)
        oldin->state_set(CInode::STATE_ORPHAN);
    }
  }
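  // Note, a sketch of the "linkmerge" case handled above: renaming a primary
  // dentry onto a remote (hard link) dentry of the same inode, roughly
  //
  //   link("a/f", "a/g");    // a/f primary, a/g remote, nlink = 2
  //   rename("a/f", "a/g");  // merge both links into one; nlink back to 1
  //
  // which is why spi->nlink is decremented only when linkmerge is set, while a
  // genuinely overwritten target (tpi) always loses a link.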

  // prepare nesting, mtime updates
  int predirty_dir = silent ? 0 : PREDIRTY_DIR;

  // guarantee stray dir is processed first during journal replay. unlink the old inode,
  // then link the source inode to destdn
  if (destdnl->is_primary()) {
    ceph_assert(straydn);
    if (straydn->is_auth()) {
      metablob->add_dir_context(straydn->get_dir());
      metablob->add_dir(straydn->get_dir(), true);
    }
  }

  if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
    CDir *oldin_dir = oldin->get_projected_parent_dir();
    if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
      mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
  }

  // sub off target
  if (destdn->is_auth() && !destdnl->is_null()) {
    mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
                                      (destdnl->is_primary() ? PREDIRTY_PRIMARY : 0) | predirty_dir, -1);
    if (destdnl->is_primary()) {
      ceph_assert(straydn);
      mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
                                        PREDIRTY_PRIMARY | PREDIRTY_DIR, 1);
    }
  }

  if (srcdnl->is_remote() && srci->is_auth()) {
    CDir *srci_dir = srci->get_projected_parent_dir();
    if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
      mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
  }

  // move srcdn
  int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY : 0;
  int flags = predirty_dir | predirty_primary;
  if (srcdn->is_auth())
    mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW | flags, -1);
  if (destdn->is_auth())
    mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
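  // Note: predirty_journal_parents() (in MDCache) is what turns the -1/+1
  // unlink/link counts above into projected fragstat/rstat updates on the
  // ancestor directories and adds those ancestors to the metablob. The flags
  // just describe the change: PREDIRTY_PRIMARY when a primary inode moves,
  // PREDIRTY_DIR for dir mtime/size updates (suppressed here when "silent",
  // i.e. for stray reintegration).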

  // add it all to the metablob
  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      ceph_assert(straydn);
      if (destdn->is_auth()) {
        // project snaprealm, too
        if (auto& desti_srnode = mdr->more()->desti_srnode) {
          oldin->project_snaprealm(desti_srnode);
          if (tpi->nlink == 0)
            ceph_assert(!desti_srnode->is_parent_global());
          desti_srnode = NULL;
        }
        straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
        metablob->add_primary_dentry(straydn, oldin, true, true);
      } else if (force_journal_stray) {
        dout(10) << " forced journaling straydn " << *straydn << dendl;
        metablob->add_dir_context(straydn->get_dir());
        metablob->add_primary_dentry(straydn, oldin, true);
      }
    } else if (destdnl->is_remote()) {
      if (oldin->is_auth()) {
        sr_t *new_srnode = NULL;
        if (mdr->peer_request) {
          if (mdr->peer_request->desti_snapbl.length() > 0) {
            new_srnode = new sr_t();
            auto p = mdr->peer_request->desti_snapbl.cbegin();
            decode(*new_srnode, p);
          }
        } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
          new_srnode = desti_srnode;
          desti_srnode = NULL;
        }
        if (new_srnode) {
          oldin->project_snaprealm(new_srnode);
          if (tpi->nlink == 0)
            ceph_assert(!new_srnode->is_parent_global());
        }
        // auth for targeti
        CDentry *oldin_pdn = oldin->get_projected_parent_dn();
        mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
        metablob->add_primary_dentry(oldin_pdn, oldin, true);
      }
    }
  }

  // dest
  if (srcdnl->is_remote()) {
    ceph_assert(!linkmerge);
    if (destdn->is_auth() && !destdnl->is_null())
      mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
    else
      destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;

    if (destdn->is_auth())
      metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());

    if (srci->is_auth()) { // it's remote
      if (mdr->peer_request) {
        if (mdr->peer_request->srci_snapbl.length() > 0) {
          sr_t *new_srnode = new sr_t();
          auto p = mdr->peer_request->srci_snapbl.cbegin();
          decode(*new_srnode, p);
          srci->project_snaprealm(new_srnode);
        }
      } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
        srci->project_snaprealm(srci_srnode);
        srci_srnode = NULL;
      }

      CDentry *srci_pdn = srci->get_projected_parent_dn();
      mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
      metablob->add_primary_dentry(srci_pdn, srci, true);
    }
  } else if (srcdnl->is_primary()) {
    // project snap parent update?
    if (destdn->is_auth()) {
      if (auto& srci_srnode = mdr->more()->srci_srnode) {
        srci->project_snaprealm(srci_srnode);
        srci_srnode = NULL;
      }
    }

    if (destdn->is_auth() && !destdnl->is_null())
      mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);

    destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;

    if (destdn->is_auth())
      metablob->add_primary_dentry(destdn, srci, true, true);
    else if (force_journal_dest) {
      dout(10) << " forced journaling destdn " << *destdn << dendl;
      metablob->add_dir_context(destdn->get_dir());
      metablob->add_primary_dentry(destdn, srci, true);
      if (srcdn->is_auth() && srci->is_dir()) {
        // journal new subtrees root dirfrags
        auto&& ls = srci->get_dirfrags();
        for (const auto& dir : ls) {
          if (dir->is_auth())
            metablob->add_dir(dir, true);
        }
      }
    }
  }

  // src
  if (srcdn->is_auth()) {
    dout(10) << " journaling srcdn " << *srcdn << dendl;
    mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
    // also journal the inode in case we need to do peer rename rollback. It is OK to add
    // both primary and NULL dentries, because during journal replay the null dentry is
    // processed after the primary dentry.
    if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
      metablob->add_primary_dentry(srcdn, srci, true);
    metablob->add_null_dentry(srcdn, true);
  } else
    dout(10) << " NOT journaling srcdn " << *srcdn << dendl;

  // make renamed inode first track the dn
  if (srcdnl->is_primary() && destdn->is_auth()) {
    ceph_assert(srci->first <= destdn->first);
    srci->first = destdn->first;
  }
  // make stray inode first track the straydn
  if (straydn && straydn->is_auth()) {
    ceph_assert(oldin->first <= straydn->first);
    oldin->first = straydn->first;
  }

  if (oldin && oldin->is_dir()) {
    ceph_assert(straydn);
    mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
  }
  if (srci->is_dir())
    mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
}
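// Note: each push_projected_linkage()/project_inode() made in _rename_prepare()
// above is expected to be matched by a pop in _rename_apply() below once the
// journal entry is durable, e.g. (a minimal sketch):
//
//   straydn->push_projected_linkage(oldin);          // prepare
//   ...journal flush...
//   straydn->pop_projected_linkage();                // apply
//   oldin->pop_and_dirty_projected_inode(ls, mdr);   // apply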


void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
  dout(10) << " pvs " << mdr->more()->pvmap << dendl;

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  CDentry::linkage_t *destdnl = destdn->get_linkage();

  CInode *oldin = destdnl->get_inode();

  // primary+remote link merge?
  bool linkmerge = (srcdnl->get_inode() == oldin);
  if (linkmerge)
    ceph_assert(srcdnl->is_primary() || destdnl->is_remote());

  bool new_in_snaprealm = false;
  bool new_oldin_snaprealm = false;

  // target inode
  if (!linkmerge) {
    if (destdnl->is_primary()) {
      ceph_assert(straydn);
      dout(10) << "straydn is " << *straydn << dendl;

      // if there is newly created snaprealm, need to split old snaprealm's
      // inodes_with_caps. So pop snaprealm before linkage changes.
      if (destdn->is_auth()) {
        bool hadrealm = (oldin->snaprealm ? true : false);
        oldin->early_pop_projected_snaprealm();
        new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
      } else {
        ceph_assert(mdr->peer_request);
        if (mdr->peer_request->desti_snapbl.length()) {
          new_oldin_snaprealm = !oldin->snaprealm;
          oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
          ceph_assert(oldin->snaprealm);
        }
      }

      destdn->get_dir()->unlink_inode(destdn, false);

      straydn->pop_projected_linkage();
      if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
        ceph_assert(!straydn->is_projected()); // no other projected

      // nlink-- targeti
      if (destdn->is_auth())
        oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);

      mdcache->touch_dentry_bottom(straydn);  // drop dn as quickly as possible.
    } else if (destdnl->is_remote()) {
      destdn->get_dir()->unlink_inode(destdn, false);
      if (oldin->is_auth()) {
        oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
      } else if (mdr->peer_request) {
        if (mdr->peer_request->desti_snapbl.length() > 0) {
          ceph_assert(oldin->snaprealm);
          oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
        }
      } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
        delete desti_srnode;
        desti_srnode = NULL;
      }
    }
  }

  // unlink src before we relink it at dest
  CInode *in = srcdnl->get_inode();
  ceph_assert(in);

  bool srcdn_was_remote = srcdnl->is_remote();
  if (!srcdn_was_remote) {
    // if there is newly created snaprealm, need to split old snaprealm's
    // inodes_with_caps. So pop snaprealm before linkage changes.
    if (destdn->is_auth()) {
      bool hadrealm = (in->snaprealm ? true : false);
      in->early_pop_projected_snaprealm();
      new_in_snaprealm = (in->snaprealm && !hadrealm);
    } else {
      ceph_assert(mdr->peer_request);
      if (mdr->peer_request->srci_snapbl.length()) {
        new_in_snaprealm = !in->snaprealm;
        in->decode_snap_blob(mdr->peer_request->srci_snapbl);
        ceph_assert(in->snaprealm);
      }
    }
  }

  srcdn->get_dir()->unlink_inode(srcdn);

  // dest
  if (srcdn_was_remote) {
    if (!linkmerge) {
      // destdn
      destdnl = destdn->pop_projected_linkage();
      if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
        ceph_assert(!destdn->is_projected()); // no other projected

      destdn->link_remote(destdnl, in);
      if (destdn->is_auth())
        destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
      // in
      if (in->is_auth()) {
        in->pop_and_dirty_projected_inode(mdr->ls, mdr);
      } else if (mdr->peer_request) {
        if (mdr->peer_request->srci_snapbl.length() > 0) {
          ceph_assert(in->snaprealm);
          in->decode_snap_blob(mdr->peer_request->srci_snapbl);
        }
      } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
        delete srci_srnode;
        srci_srnode = NULL;
      }
    } else {
      dout(10) << "merging remote onto primary link" << dendl;
      oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
    }
  } else { // primary
    if (linkmerge) {
      dout(10) << "merging primary onto remote link" << dendl;
      destdn->get_dir()->unlink_inode(destdn, false);
    }
    destdnl = destdn->pop_projected_linkage();
    if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
      ceph_assert(!destdn->is_projected()); // no other projected

    // srcdn inode import?
    if (!srcdn->is_auth() && destdn->is_auth()) {
      ceph_assert(mdr->more()->inode_import.length() > 0);

      map<client_t,Capability::Import> imported_caps;

      // finish cap imports
      finish_force_open_sessions(mdr->more()->imported_session_map);
      if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
        mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
                                                    mdr->more()->srcdn_auth_mds, true,
                                                    mdr->more()->imported_session_map,
                                                    mdr->more()->cap_imports[destdnl->get_inode()],
                                                    imported_caps);
      }

      mdr->more()->inode_import.clear();
      encode(imported_caps, mdr->more()->inode_import);

      /* hack: add an auth pin for each xlock we hold. These were
       * remote xlocks previously but now they're local and
       * we're going to try and unpin when we xlock_finish. */

      for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
           i != mdr->locks.end();
           ++i) {
        SimpleLock *lock = i->lock;
        if (lock->get_parent() != destdnl->get_inode())
          break;
        if (i->is_xlock() && !lock->is_locallock())
          mds->locker->xlock_import(lock);
      }

      // hack: fix auth bit
      in->state_set(CInode::STATE_AUTH);

      mdr->clear_ambiguous_auth();
    }

    if (destdn->is_auth())
      in->pop_and_dirty_projected_inode(mdr->ls, mdr);
  }

  // src
  if (srcdn->is_auth())
    srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
  srcdn->pop_projected_linkage();
  if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
    ceph_assert(!srcdn->is_projected()); // no other projected

  // apply remaining projected inodes (nested)
  mdr->apply();

  // update subtree map?
  if (destdnl->is_primary() && in->is_dir())
    mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);

  if (straydn && oldin->is_dir())
    mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);

  if (new_oldin_snaprealm)
    mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
  if (new_in_snaprealm)
    mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);

  // removing a new dn?
  if (srcdn->is_auth())
    srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
}


// ------------
// PEER

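// The handlers below run on a peer MDS (historically called the slave) while
// the leader (historically the master) drives a multi-MDS rename. As far as
// this file shows, the rough flow is:
//
//   leader --OP_RENAMEPREP--> peer        (handle_peer_rename_prep)
//   peer --OP_RENAMENOTIFY--> bystanders  (waits for their acks)
//   peer --OP_RENAMEPREPACK--> leader     (possibly carrying a witness list
//                                          or the exported srci inode)
//   leader commits or aborts; the peer then journals an EPeerUpdate commit
//   or runs do_rename_rollback().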
class C_MDS_PeerRenamePrep : public ServerLogContext {
  CDentry *srcdn, *destdn, *straydn;
public:
  C_MDS_PeerRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
    ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
  void finish(int r) override {
    server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
  }
};

class C_MDS_PeerRenameCommit : public ServerContext {
  MDRequestRef mdr;
  CDentry *srcdn, *destdn, *straydn;
public:
  C_MDS_PeerRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
    ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
  void finish(int r) override {
    server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
  }
};

class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
  MDRequestRef mdr;
public:
  C_MDS_PeerRenameSessionsFlushed(Server *s, MDRequestRef& r) :
    ServerContext(s), mdr(r) {}
  void finish(int r) override {
    server->_peer_rename_sessions_flushed(mdr);
  }
};

void Server::handle_peer_rename_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_peer_rename_prep " << *mdr
           << " " << mdr->peer_request->srcdnpath
           << " to " << mdr->peer_request->destdnpath
           << dendl;

  if (mdr->peer_request->is_interrupted()) {
    dout(10) << " peer request interrupted, sending noop reply" << dendl;
    auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
    reply->mark_interrupted();
    mds->send_message_mds(reply, mdr->peer_to_mds);
    mdr->reset_peer_request();
    return;
  }

  // discover destdn
  filepath destpath(mdr->peer_request->destdnpath);
  dout(10) << " dest " << destpath << dendl;
  vector<CDentry*> trace;
  CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
  int r = mdcache->path_traverse(mdr, cf, destpath,
                                 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
                                 &trace);
  if (r > 0) return;
  if (r == -CEPHFS_ESTALE) {
    mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
                            mdr->peer_to_mds, true);
    return;
  }
  ceph_assert(r == 0);  // we shouldn't get an error here!

  CDentry *destdn = trace.back();
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
  dout(10) << " destdn " << *destdn << dendl;
  mdr->pin(destdn);

  // discover srcdn
  filepath srcpath(mdr->peer_request->srcdnpath);
  dout(10) << " src " << srcpath << dendl;
  CInode *srci = nullptr;
  r = mdcache->path_traverse(mdr, cf, srcpath,
                             MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
                             &trace, &srci);
  if (r > 0) return;
  ceph_assert(r == 0);

  CDentry *srcdn = trace.back();
  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  dout(10) << " srcdn " << *srcdn << dendl;
  mdr->pin(srcdn);
  mdr->pin(srci);

  // stray?
  bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
  if (linkmerge)
    ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
  CDentry *straydn = mdr->straydn;
  if (destdnl->is_primary() && !linkmerge)
    ceph_assert(straydn);

  mdr->set_op_stamp(mdr->peer_request->op_stamp);
  mdr->more()->srcdn_auth_mds = srcdn->authority().first;

  // set up commit waiter (early, to clean up any freezing etc we do)
  if (!mdr->more()->peer_commit)
    mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);

  // am i srcdn auth?
  if (srcdn->is_auth()) {
    set<mds_rank_t> srcdnrep;
    srcdn->list_replicas(srcdnrep);

    bool reply_witness = false;
    if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
      // freeze?
      // we need this to
      //  - avoid conflicting lock state changes
      //  - avoid concurrent updates to the inode
      //    (this could also be accomplished with the versionlock)
      int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
      dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
      bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);

      // unfreeze auth pin after freezing the inode to avoid queueing waiters
      if (srcdnl->get_inode()->is_frozen_auth_pin())
        mdr->unfreeze_auth_pin();

      if (!frozen_inode) {
        srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
        return;
      }

      /*
       * set ambiguous auth for srci
       * NOTE: we don't worry about ambiguous cache expire as we do
       * with subtree migrations because all peers will pin
       * srcdn->get_inode() for duration of this rename.
       */
      mdr->set_ambiguous_auth(srcdnl->get_inode());

      // just mark the source inode as ambiguous auth if more than two MDS are involved.
      // the leader will send another OP_RENAMEPREP peer request later.
      if (mdr->peer_request->witnesses.size() > 1) {
        dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
        reply_witness = true;
      }

      // make sure bystanders have received all lock related messages
      for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
        if (*p == mdr->peer_to_mds ||
            (mds->is_cluster_degraded() &&
             !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
          continue;
        auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
        mds->send_message_mds(notify, *p);
        mdr->more()->waiting_on_peer.insert(*p);
      }

      // make sure clients have received all cap related messages
      set<client_t> export_client_set;
      mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);

      MDSGatherBuilder gather(g_ceph_context);
      flush_client_sessions(export_client_set, gather);
      if (gather.has_subs()) {
        mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
        gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
        gather.activate();
      }
    }

    // is witness list sufficient?
    for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
      if (*p == mdr->peer_to_mds ||
          mdr->peer_request->witnesses.count(*p)) continue;
      dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
      reply_witness = true;
      break;
    }

    if (reply_witness) {
      ceph_assert(!srcdnrep.empty());
      auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
      reply->witnesses.swap(srcdnrep);
      mds->send_message_mds(reply, mdr->peer_to_mds);
      mdr->reset_peer_request();
      return;
    }
    dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
    if (!mdr->more()->waiting_on_peer.empty()) {
      dout(10) << " still waiting for rename notify acks from "
               << mdr->more()->waiting_on_peer << dendl;
      return;
    }
  } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
    // set ambiguous auth for srci on witnesses
    mdr->set_ambiguous_auth(srcdnl->get_inode());
  }

  // encode everything we'd need to roll this back... basically, just the original state.
  rename_rollback rollback;

  rollback.reqid = mdr->reqid;

  rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
  rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
  rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
  rollback.orig_src.dname = srcdn->get_name();
  if (srcdnl->is_primary())
    rollback.orig_src.ino = srcdnl->get_inode()->ino();
  else {
    ceph_assert(srcdnl->is_remote());
    rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
    rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
  }

  rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
  rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
  rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
  rollback.orig_dest.dname = destdn->get_name();
  if (destdnl->is_primary())
    rollback.orig_dest.ino = destdnl->get_inode()->ino();
  else if (destdnl->is_remote()) {
    rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
    rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
  }

  if (straydn) {
    rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
    rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
    rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
    rollback.stray.dname = straydn->get_name();
  }
  if (mdr->peer_request->desti_snapbl.length()) {
    CInode *oldin = destdnl->get_inode();
    if (oldin->snaprealm) {
      encode(true, rollback.desti_snapbl);
      oldin->encode_snap_blob(rollback.desti_snapbl);
    } else {
      encode(false, rollback.desti_snapbl);
    }
  }
  if (mdr->peer_request->srci_snapbl.length()) {
    if (srci->snaprealm) {
      encode(true, rollback.srci_snapbl);
      srci->encode_snap_blob(rollback.srci_snapbl);
    } else {
      encode(false, rollback.srci_snapbl);
    }
  }
  encode(rollback, mdr->more()->rollback_bl);
  // FIXME: rollback snaprealm
  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;

  // journal.
  mdr->ls = mdlog->get_current_segment();
  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
                                    EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
  mdlog->start_entry(le);
  le->rollback = mdr->more()->rollback_bl;

  bufferlist blah;  // inode import data... obviously not used if we're the peer
  _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);

  if (le->commit.empty()) {
    dout(10) << " empty metablob, skipping journal" << dendl;
    mdlog->cancel_entry(le);
    mdr->ls = NULL;
    _logged_peer_rename(mdr, srcdn, destdn, straydn);
  } else {
    mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
    mdr->more()->peer_update_journaled = true;
    submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
                       mdr, __func__);
    mdlog->flush();
  }
}
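// Note: handle_peer_rename_prep() above can answer the leader in three ways:
// an interrupted noop ack, an ack carrying reply->witnesses (asking the leader
// to enlist more srcdn replicas first), or a real prep ack sent from
// _logged_peer_rename() once the EPeerUpdate::OP_PREPARE event, with its
// rollback blob, is journaled. The rollback blob is deliberately just the
// original state (names, dirfrags, old mtimes/rctimes, snap blobs), enough for
// do_rename_rollback() to restore it without help from the leader.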

void Server::_logged_peer_rename(MDRequestRef& mdr,
                                 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_logged_peer_rename " << *mdr << dendl;

  // prepare ack
  ref_t<MMDSPeerRequest> reply;
  if (!mdr->aborted) {
    reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
    if (!mdr->more()->peer_update_journaled)
      reply->mark_not_journaled();
  }

  CDentry::linkage_t *srcdnl = srcdn->get_linkage();
  //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;

  // export srci?
  if (srcdn->is_auth() && srcdnl->is_primary()) {
    // set export bounds for CInode::encode_export()
    if (reply) {
      std::vector<CDir*> bounds;
      if (srcdnl->get_inode()->is_dir()) {
        srcdnl->get_inode()->get_dirfrags(bounds);
        for (const auto& bound : bounds) {
          bound->state_set(CDir::STATE_EXPORTBOUND);
        }
      }

      map<client_t,entity_inst_t> exported_client_map;
      map<client_t, client_metadata_t> exported_client_metadata_map;
      bufferlist inodebl;
      mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
                                             exported_client_map,
                                             exported_client_metadata_map);

      for (const auto& bound : bounds) {
        bound->state_clear(CDir::STATE_EXPORTBOUND);
      }

      encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
      encode(exported_client_metadata_map, reply->inode_export);
      reply->inode_export.claim_append(inodebl);
      reply->inode_export_v = srcdnl->get_inode()->get_version();
    }

    // remove mdr auth pin
    mdr->auth_unpin(srcdnl->get_inode());
    mdr->more()->is_inode_exporter = true;

    if (srcdnl->get_inode()->is_dirty())
      srcdnl->get_inode()->mark_clean();

    dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
  }

  // apply
  _rename_apply(mdr, srcdn, destdn, straydn);

  CDentry::linkage_t *destdnl = destdn->get_linkage();

  // bump popularity
  mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
  if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
    mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);

  // done.
  mdr->reset_peer_request();
  mdr->straydn = 0;

  if (reply) {
    mds->send_message_mds(reply, mdr->peer_to_mds);
  } else {
    ceph_assert(mdr->aborted);
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}
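// Note: when the peer is auth for a primary srcdn, the prep ack above doubles
// as the inode migration vehicle: the encode_export_inode() payload plus the
// exported client/cap maps ride in reply->inode_export, and the leader later
// imports them in _rename_apply() (see the "srcdn inode import?" branch). The
// dirfrags are flagged STATE_EXPORTBOUND only around the encode so the export
// has well-defined bounds.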

void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
                                 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;

  CInode *in = destdn->get_linkage()->get_inode();

  inodeno_t migrated_stray;
  if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
    migrated_stray = in->ino();

  MDSContext::vec finished;
  if (r == 0) {
    // unfreeze+singleauth inode
    //  hmm, do i really need to delay this?
    if (mdr->more()->is_inode_exporter) {
      // drop our pins
      // we exported, clear out any xlocks that we moved to another MDS

      for (auto i = mdr->locks.lower_bound(&in->versionlock);
           i != mdr->locks.end(); ) {
        SimpleLock *lock = i->lock;
        if (lock->get_parent() != in)
          break;
        // we only care about xlocks on the exported inode
        if (i->is_xlock() && !lock->is_locallock())
          mds->locker->xlock_export(i++, mdr.get());
        else
          ++i;
      }

      map<client_t,Capability::Import> peer_imported;
      auto bp = mdr->more()->inode_import.cbegin();
      decode(peer_imported, bp);

      dout(10) << " finishing inode export on " << *in << dendl;
      mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
      mds->queue_waiters(finished);  // this includes SINGLEAUTH waiters.

      // unfreeze
      ceph_assert(in->is_frozen_inode());
      in->unfreeze_inode(finished);
    }

    // singleauth
    if (mdr->more()->is_ambiguous_auth) {
      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }

    if (straydn && mdr->more()->peer_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
        mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mds->queue_waiters(finished);
    mdr->cleanup();

    if (mdr->more()->peer_update_journaled) {
      // write a commit to the journal
      EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
                                        mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
                                        EPeerUpdate::RENAME);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      _committed_peer(mdr);
    }
  } else {

    // abort
    //  rollback_bl may be empty if we froze the inode but had to provide an expanded
    //  witness list from the leader, and they failed before we tried prep again.
    if (mdr->more()->rollback_bl.length()) {
      if (mdr->more()->is_inode_exporter) {
        dout(10) << " reversing inode export of " << *in << dendl;
        in->abort_export();
      }
      if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
        mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
        // rollback but preserve the peer request
        do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
        mdr->more()->rollback_bl.clear();
      } else
        do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
    } else {
      dout(10) << " rollback_bl empty, not rolling back rename (leader failed after getting extra witnesses?)" << dendl;
      // singleauth
      if (mdr->more()->is_ambiguous_auth) {
        if (srcdn->is_auth())
          mdr->more()->rename_inode->unfreeze_inode(finished);

        mdr->more()->rename_inode->clear_ambiguous_auth(finished);
        mdr->more()->is_ambiguous_auth = false;
      }
      mds->queue_waiters(finished);
      mdcache->request_finish(mdr);
    }
  }

  if (migrated_stray && mds->is_stopping())
    mdcache->shutdown_export_stray_finish(migrated_stray);
}

static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
                                 rename_rollback::drec &r, utime_t ctime,
                                 bool isdir, const nest_info_t &rstat)
{
  auto pf = dir->project_fnode(mut);
  pf->version = dir->pre_dirty();

  if (isdir) {
    pf->fragstat.nsubdirs += 1;
  } else {
    pf->fragstat.nfiles += 1;
  }
  if (r.ino) {
    pf->rstat.rbytes += rstat.rbytes;
    pf->rstat.rfiles += rstat.rfiles;
    pf->rstat.rsubdirs += rstat.rsubdirs;
    pf->rstat.rsnaps += rstat.rsnaps;
  }
  if (pf->fragstat.mtime == ctime) {
    pf->fragstat.mtime = r.dirfrag_old_mtime;
    if (pf->rstat.rctime == ctime)
      pf->rstat.rctime = r.dirfrag_old_rctime;
  }
  mut->add_updated_lock(&dir->get_inode()->filelock);
  mut->add_updated_lock(&dir->get_inode()->nestlock);
}
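// Note, a small worked example of _rollback_repair_dir(), assuming a plain
// file is being relinked into a directory whose fragstat.mtime was bumped to
// time T by the aborted rename:
//
//   pf->fragstat.nfiles += 1;        // the dentry comes back here
//   pf->rstat += <inode's rstat>;    // only when r.ino is known
//   if (pf->fragstat.mtime == T)     // untouched since the rename?
//     pf->fragstat.mtime = r.dirfrag_old_mtime;  // safe to restore
//
// i.e. the saved timestamps are restored only if nothing else has modified
// the directory since the rename being rolled back.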

struct C_MDS_LoggedRenameRollback : public ServerLogContext {
  MutationRef mut;
  CDentry *srcdn;
  version_t srcdnpv;
  CDentry *destdn;
  CDentry *straydn;
  map<client_t,ref_t<MClientSnap>> splits[2];
  bool finish_mdr;
  C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
                             CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
                             map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
    ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
    straydn(st), finish_mdr(f) {
    splits[0].swap(_splits[0]);
    splits[1].swap(_splits[1]);
  }
  void finish(int r) override {
    server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
                                    destdn, straydn, splits, finish_mdr);
  }
};

void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
                                bool finish_mdr)
{
  rename_rollback rollback;
  auto p = rbl.cbegin();
  decode(rollback, p);

  dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
  // need to finish this update before sending resolve to claim the subtree
  mdcache->add_rollback(rollback.reqid, leader);

  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
  mut->ls = mds->mdlog->get_current_segment();

  CDentry *srcdn = NULL;
  CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
  if (!srcdir)
    srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
  if (srcdir) {
    dout(10) << " srcdir " << *srcdir << dendl;
    srcdn = srcdir->lookup(rollback.orig_src.dname);
    if (srcdn) {
      dout(10) << " srcdn " << *srcdn << dendl;
      ceph_assert(srcdn->get_linkage()->is_null());
    } else
      dout(10) << " srcdn not found" << dendl;
  } else
    dout(10) << " srcdir not found" << dendl;

  CDentry *destdn = NULL;
  CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
  if (!destdir)
    destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
  if (destdir) {
    dout(10) << " destdir " << *destdir << dendl;
    destdn = destdir->lookup(rollback.orig_dest.dname);
    if (destdn)
      dout(10) << " destdn " << *destdn << dendl;
    else
      dout(10) << " destdn not found" << dendl;
  } else
    dout(10) << " destdir not found" << dendl;

  CInode *in = NULL;
  if (rollback.orig_src.ino) {
    in = mdcache->get_inode(rollback.orig_src.ino);
    if (in && in->is_dir())
      ceph_assert(srcdn && destdn);
  } else
    in = mdcache->get_inode(rollback.orig_src.remote_ino);

  CDir *straydir = NULL;
  CDentry *straydn = NULL;
  if (rollback.stray.dirfrag.ino) {
    straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
    if (straydir) {
      dout(10) << "straydir " << *straydir << dendl;
      straydn = straydir->lookup(rollback.stray.dname);
      if (straydn) {
        dout(10) << " straydn " << *straydn << dendl;
        ceph_assert(straydn->get_linkage()->is_primary());
      } else
        dout(10) << " straydn not found" << dendl;
    } else
      dout(10) << "straydir not found" << dendl;
  }

  CInode *target = NULL;
  if (rollback.orig_dest.ino) {
    target = mdcache->get_inode(rollback.orig_dest.ino);
    if (target)
      ceph_assert(destdn && straydn);
  } else if (rollback.orig_dest.remote_ino)
    target = mdcache->get_inode(rollback.orig_dest.remote_ino);

  // can't use is_auth() in the resolve stage
  mds_rank_t whoami = mds->get_nodeid();
  // peer
  ceph_assert(!destdn || destdn->authority().first != whoami);
  ceph_assert(!straydn || straydn->authority().first != whoami);

  bool force_journal_src = false;
  bool force_journal_dest = false;
  if (in && in->is_dir() && srcdn->authority().first != whoami)
    force_journal_src = _need_force_journal(in, false);
  if (in && target && target->is_dir())
    force_journal_dest = _need_force_journal(in, true);

  version_t srcdnpv = 0;
  // repair src
  if (srcdn) {
    if (srcdn->authority().first == whoami)
      srcdnpv = srcdn->pre_dirty();
    if (rollback.orig_src.ino) {
      ceph_assert(in);
      srcdn->push_projected_linkage(in);
    } else
      srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
                                    rollback.orig_src.remote_d_type);
  }

  map<client_t,ref_t<MClientSnap>> splits[2];

  const CInode::mempool_inode *pip = nullptr;
  if (in) {
    bool projected;
    CDir *pdir = in->get_projected_parent_dir();
    if (pdir->authority().first == whoami) {
      auto pi = in->project_inode(mut);
      pi.inode->version = in->pre_dirty();
      if (pdir != srcdir) {
        auto pf = pdir->project_fnode(mut);
        pf->version = pdir->pre_dirty();
      }
      if (pi.inode->ctime == rollback.ctime)
        pi.inode->ctime = rollback.orig_src.old_ctime;
      projected = true;
    } else {
      if (in->get_inode()->ctime == rollback.ctime) {
        auto _inode = CInode::allocate_inode(*in->get_inode());
        _inode->ctime = rollback.orig_src.old_ctime;
        in->reset_inode(_inode);
      }
      projected = false;
    }
    pip = in->get_projected_inode().get();

    if (rollback.srci_snapbl.length() && in->snaprealm) {
      bool hadrealm;
      auto p = rollback.srci_snapbl.cbegin();
      decode(hadrealm, p);
      if (hadrealm) {
        if (projected && !mds->is_resolve()) {
          sr_t *new_srnode = new sr_t();
          decode(*new_srnode, p);
          in->project_snaprealm(new_srnode);
        } else
          decode(in->snaprealm->srnode, p);
      } else {
        SnapRealm *realm;
        if (rollback.orig_src.ino) {
          ceph_assert(srcdir);
          realm = srcdir->get_inode()->find_snaprealm();
        } else {
          realm = in->snaprealm->parent;
        }
        if (!mds->is_resolve())
          mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
        if (projected)
          in->project_snaprealm(NULL);
        else
          in->snaprealm->merge_to(realm);
      }
    }
  }

  // repair dest
  if (destdn) {
    if (rollback.orig_dest.ino && target) {
      destdn->push_projected_linkage(target);
    } else if (rollback.orig_dest.remote_ino) {
      destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
                                     rollback.orig_dest.remote_d_type);
    } else {
      // the dentry will be trimmed soon, it's ok to have wrong linkage
      if (rollback.orig_dest.ino)
        ceph_assert(mds->is_resolve());
      destdn->push_projected_linkage();
    }
  }

  if (straydn)
    straydn->push_projected_linkage();

  if (target) {
    bool projected;
    CInode::inode_ptr ti;
    CDir *pdir = target->get_projected_parent_dir();
    if (pdir->authority().first == whoami) {
      auto pi = target->project_inode(mut);
      pi.inode->version = target->pre_dirty();
      if (pdir != srcdir) {
        auto pf = pdir->project_fnode(mut);
        pf->version = pdir->pre_dirty();
      }
      ti = pi.inode;
      projected = true;
    } else {
      ti = CInode::allocate_inode(*target->get_inode());
      projected = false;
    }

    if (ti->ctime == rollback.ctime)
      ti->ctime = rollback.orig_dest.old_ctime;
    if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
      if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
        ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
      else
        ceph_assert(rollback.orig_dest.remote_ino &&
                    rollback.orig_dest.remote_ino == rollback.orig_src.ino);
    } else
      ti->nlink++;

    if (!projected)
      target->reset_inode(ti);

    if (rollback.desti_snapbl.length() && target->snaprealm) {
      bool hadrealm;
      auto p = rollback.desti_snapbl.cbegin();
      decode(hadrealm, p);
      if (hadrealm) {
        if (projected && !mds->is_resolve()) {
          sr_t *new_srnode = new sr_t();
          decode(*new_srnode, p);
          target->project_snaprealm(new_srnode);
        } else
          decode(target->snaprealm->srnode, p);
      } else {
        SnapRealm *realm;
        if (rollback.orig_dest.ino) {
          ceph_assert(destdir);
          realm = destdir->get_inode()->find_snaprealm();
        } else {
          realm = target->snaprealm->parent;
        }
        if (!mds->is_resolve())
          mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
        if (projected)
          target->project_snaprealm(NULL);
        else
          target->snaprealm->merge_to(realm);
      }
    }
  }

  if (srcdn && srcdn->authority().first == whoami) {
    nest_info_t blah;
    _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
                         in && in->is_dir(), pip ? pip->accounted_rstat : blah);
  }

  if (srcdn)
    dout(0) << " srcdn back to " << *srcdn << dendl;
  if (in)
    dout(0) << "  srci back to " << *in << dendl;
  if (destdn)
    dout(0) << " destdn back to " << *destdn << dendl;
  if (target)
    dout(0) << "  desti back to " << *target << dendl;

  // journal it
  EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
                                    EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
  mdlog->start_entry(le);

  if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
    le->commit.add_dir_context(srcdir);
    if (rollback.orig_src.ino)
      le->commit.add_primary_dentry(srcdn, 0, true);
    else
      le->commit.add_remote_dentry(srcdn, true);
  }

  if (!rollback.orig_src.ino && // remote linkage
      in && in->authority().first == whoami) {
    le->commit.add_dir_context(in->get_projected_parent_dir());
    le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
  }

  if (force_journal_dest) {
    ceph_assert(rollback.orig_dest.ino);
    le->commit.add_dir_context(destdir);
    le->commit.add_primary_dentry(destdn, 0, true);
  }

  // peer: no need to journal straydn

  if (target && target != in && target->authority().first == whoami) {
    ceph_assert(rollback.orig_dest.remote_ino);
    le->commit.add_dir_context(target->get_projected_parent_dir());
    le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
  }

  if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
    dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = in->ino();
    if (srcdn->authority().first == whoami) {
      auto&& ls = in->get_dirfrags();
      for (const auto& dir : ls) {
        if (!dir->is_auth())
          le->commit.renamed_dir_frags.push_back(dir->get_frag());
      }
      dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
    }
  } else if (force_journal_dest) {
    dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = target->ino();
  }

  if (target && target->is_dir()) {
    ceph_assert(destdn);
    mdcache->project_subtree_rename(target, straydir, destdir);
  }

  if (in && in->is_dir()) {
    ceph_assert(srcdn);
    mdcache->project_subtree_rename(in, destdir, srcdir);
  }

  if (mdr && !mdr->more()->peer_update_journaled) {
    ceph_assert(le->commit.empty());
    mdlog->cancel_entry(le);
    mut->ls = NULL;
    _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
  } else {
    ceph_assert(!le->commit.empty());
    if (mdr)
      mdr->more()->peer_update_journaled = false;
    MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
                                                            srcdn, srcdnpv, destdn, straydn,
                                                            splits, finish_mdr);
    submit_mdlog_entry(le, fin, mdr, __func__);
    mdlog->flush();
  }
}
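// Note: do_rename_rollback() is deliberately defensive. Every dentry/inode it
// touches may or may not still be cached (hence all the "not found" paths),
// because rollback can also run during the resolve stage after an MDS restart,
// when only whatever journal replay recovered is present. The rollback blob
// plus the surviving cache state must therefore be enough on their own to
// re-project the old linkages and journal the EPeerUpdate::OP_ROLLBACK event.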

void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
                                     version_t srcdnpv, CDentry *destdn, CDentry *straydn,
                                     map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
{
  dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;

  if (straydn) {
    straydn->get_dir()->unlink_inode(straydn);
    straydn->pop_projected_linkage();
  }
  if (destdn) {
    destdn->get_dir()->unlink_inode(destdn);
    destdn->pop_projected_linkage();
  }
  if (srcdn) {
    srcdn->pop_projected_linkage();
    if (srcdn->authority().first == mds->get_nodeid()) {
      srcdn->mark_dirty(srcdnpv, mut->ls);
      if (srcdn->get_linkage()->is_primary())
        srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
    }
  }

  mut->apply();

  if (srcdn && srcdn->get_linkage()->is_primary()) {
    CInode *in = srcdn->get_linkage()->get_inode();
    if (in && in->is_dir()) {
      ceph_assert(destdn);
      mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
    }
  }

  if (destdn) {
    CInode *oldin = destdn->get_linkage()->get_inode();
    // update subtree map?
    if (oldin && oldin->is_dir()) {
      ceph_assert(straydn);
      mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
    }
  }

  if (mds->is_resolve()) {
    CDir *root = NULL;
    if (straydn)
      root = mdcache->get_subtree_root(straydn->get_dir());
    else if (destdn)
      root = mdcache->get_subtree_root(destdn->get_dir());
    if (root)
      mdcache->try_trim_non_auth_subtree(root);
  } else {
    mdcache->send_snaps(splits[1]);
    mdcache->send_snaps(splits[0]);
  }

  if (mdr) {
    MDSContext::vec finished;
    if (mdr->more()->is_ambiguous_auth) {
      if (srcdn->is_auth())
        mdr->more()->rename_inode->unfreeze_inode(finished);

      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }
    mds->queue_waiters(finished);
    if (finish_mdr || mdr->aborted)
      mdcache->request_finish(mdr);
    else
      mdr->more()->peer_rolling_back = false;
  }

  mdcache->finish_rollback(mut->reqid, mdr);

  mut->cleanup();
}
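// Note: the is_resolve() branch above is the restart path. Instead of sending
// snap splits/merges to clients (presumably not reachable while the MDS is
// still resolving), it simply trims the now non-auth subtree under the
// stray/dest dir; a recovering MDS only needs its own auth subtrees restored.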

void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
{
  dout(10) << "handle_peer_rename_prep_ack " << *mdr
           << " witnessed by " << ack->get_source()
           << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // note peer
  mdr->more()->peers.insert(from);
  if (mdr->more()->srcdn_auth_mds == from &&
      mdr->more()->is_remote_frozen_authpin &&
      !mdr->more()->is_ambiguous_auth) {
    mdr->set_ambiguous_auth(mdr->more()->rename_inode);
  }

  // witnessed? or add extra witnesses?
  ceph_assert(mdr->more()->witnessed.count(from) == 0);
  if (ack->is_interrupted()) {
    dout(10) << " peer request interrupted, noop" << dendl;
  } else if (ack->witnesses.empty()) {
    mdr->more()->witnessed.insert(from);
    if (!ack->is_not_journaled())
      mdr->more()->has_journaled_peers = true;
  } else {
    dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
    mdr->more()->extra_witnesses = ack->witnesses;
    mdr->more()->extra_witnesses.erase(mds->get_nodeid());  // not me!
  }

  // srci import?
  if (ack->inode_export.length()) {
    dout(10) << " got srci import" << dendl;
    mdr->more()->inode_import.share(ack->inode_export);
    mdr->more()->inode_import_v = ack->inode_export_v;
  }

  // remove from waiting list
  ceph_assert(mdr->more()->waiting_on_peer.count(from));
  mdr->more()->waiting_on_peer.erase(from);

  if (mdr->more()->waiting_on_peer.empty())
    dispatch_client_request(mdr);  // go again!
  else
    dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
}

void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
{
  dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
           << ack->get_source() << dendl;
  ceph_assert(mdr->is_peer());
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  if (mdr->more()->waiting_on_peer.count(from)) {
    mdr->more()->waiting_on_peer.erase(from);

    if (mdr->more()->waiting_on_peer.empty()) {
      if (mdr->peer_request)
        dispatch_peer_request(mdr);
    } else
      dout(10) << " still waiting for rename notify acks from "
               << mdr->more()->waiting_on_peer << dendl;
  }
}
10479
f67539c2 10480void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
7c673cae 10481{
f67539c2 10482 dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
7c673cae 10483
f67539c2
TL
10484 if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
10485 mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);
7c673cae 10486
f67539c2
TL
10487 if (mdr->more()->waiting_on_peer.empty()) {
10488 if (mdr->peer_request)
10489 dispatch_peer_request(mdr);
7c673cae
FG
10490 } else
10491 dout(10) << " still waiting for rename notify acks from "
f67539c2 10492 << mdr->more()->waiting_on_peer << dendl;
7c673cae
FG
10493 }
10494}
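
// Illustrative sketch: _peer_rename_sessions_flushed() reuses waiting_on_peer
// by parking the sentinel rank MDS_RANK_NONE in the set; the session-flush
// completion then "acks" that sentinel exactly like a real peer would. A
// minimal analogue (SENTINEL is a hypothetical stand-in for MDS_RANK_NONE):
#include <set>

namespace sessions_flushed_sketch {

constexpr int SENTINEL = -1;           // stands in for MDS_RANK_NONE

inline bool ack_and_ready(std::set<int>& waiting, int from)
{
  waiting.erase(from);                 // 'from' may be the sentinel
  return waiting.empty();              // true -> dispatch the parked request
}

} // namespace sessions_flushed_sketch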

// snaps
/* This function takes responsibility for the passed mdr */
void Server::handle_client_lssnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  // traverse to path
  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) {
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }
  dout(10) << "lssnap on " << *diri << dendl;

  // lock snap
  if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
    return;

  if (!check_access(mdr, diri, MAY_READ))
    return;

  SnapRealm *realm = diri->find_snaprealm();
  map<snapid_t, const SnapInfo*> infomap;
  realm->get_snap_info(infomap, diri->get_oldest_snap());

  unsigned max_entries = req->head.args.readdir.max_entries;
  if (!max_entries)
    max_entries = infomap.size();
  int max_bytes = req->head.args.readdir.max_bytes;
  if (!max_bytes)
    // make sure at least one item can be encoded
    max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;

  __u64 last_snapid = 0;
  string offset_str = req->get_path2();
  if (!offset_str.empty())
    last_snapid = realm->resolve_snapname(offset_str, diri->ino());

  // empty DirStat
  bufferlist dirbl;
  static DirStat empty;
  CDir::encode_dirstat(dirbl, mdr->session->info, empty);

  max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;

  __u32 num = 0;
  bufferlist dnbl;
  auto p = infomap.upper_bound(last_snapid);
  for (; p != infomap.end() && num < max_entries; ++p) {
    dout(10) << p->first << " -> " << *p->second << dendl;

    // actual
    string snap_name;
    if (p->second->ino == diri->ino())
      snap_name = p->second->name;
    else
      snap_name = p->second->get_long_name();

    unsigned start_len = dnbl.length();
    if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
      break;

    encode(snap_name, dnbl);
    // infinite lease
    LeaseStat e(CEPH_LEASE_VALID, -1, 0);
    mds->locker->encode_lease(dnbl, mdr->session->info, e);
    dout(20) << "encode_infinite_lease" << dendl;

    int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
    if (r < 0) {
      bufferlist keep;
      keep.substr_of(dnbl, 0, start_len);
      dnbl.swap(keep);
      break;
    }
    ++num;
  }

  encode(num, dirbl);
  __u16 flags = 0;
  if (p == infomap.end()) {
    flags = CEPH_READDIR_FRAG_END;
    if (last_snapid == 0)
      flags |= CEPH_READDIR_FRAG_COMPLETE;
  }
  encode(flags, dirbl);
  dirbl.claim_append(dnbl);

  mdr->reply_extra_bl = dirbl;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}
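
// Illustrative sketch (std types; the real code encodes name + lease +
// inodestat into bufferlists): the lssnap loop above budgets the reply by
// remembering a rollback point (start_len), bailing before an entry that
// would overflow max_bytes, and truncating a partially-encoded entry:
#include <cstddef>
#include <string>
#include <vector>

namespace lssnap_sketch {

inline size_t encode_bounded(const std::vector<std::string>& names,
                             size_t max_bytes, std::string& out)
{
  size_t num = 0;
  for (const auto& n : names) {
    const size_t start_len = out.size();   // rollback point
    out += n;
    out += '\0';                           // toy stand-in for lease + stat
    if (out.size() > max_bytes) {
      out.resize(start_len);               // drop the partial entry
      break;
    }
    ++num;
  }
  return num;                              // caller encodes this count first
}

} // namespace lssnap_sketch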

// MKSNAP

struct C_MDS_mksnap_finish : public ServerLogContext {
  CInode *diri;
  SnapInfo info;
  C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
    ServerLogContext(s, r), diri(di), info(i) {}
  void finish(int r) override {
    server->_mksnap_finish(mdr, diri, info);
  }
};
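
// Illustrative sketch: the C_MDS_*_finish structs in this file all follow the
// same shape -- capture the request plus whatever the finisher needs, then
// forward to a Server method once the journal entry is durable. A generic
// analogue (LogContext and the bound callable are assumptions, not MDS API):
#include <functional>

namespace logctx_sketch {

struct LogContext {
  std::function<void(int)> fin;     // bound to e.g. &Server::_mksnap_finish
  void complete(int r) { fin(r); }  // called when the event hits the journal
};

} // namespace logctx_sketch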

/* This function takes responsibility for the passed mdr */
void Server::handle_client_mksnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  // make sure we have as new a map as the client
  if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
    mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
  if (!mds->mdsmap->allows_snaps()) {
    // you can't make snapshots until you set an option right now
    dout(5) << "new snapshots are disabled for this fs" << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  // dir only
  if (!diri->is_dir()) {
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }
  if (diri->is_system() && !diri->is_root()) {
    // no snaps in system dirs (root is ok)
    dout(5) << "is an internal system dir" << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  std::string_view snapname = req->get_filepath().last_dentry();

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid "
             << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  dout(10) << "mksnap " << snapname << " on " << *diri << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
        return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
      (subvol_ino && subvol_ino != diri->ino())) {
    dout(5) << "is a descendant of a subvolume dir" << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  // check if we can create any more snapshots;
  // we don't allow any more if we are already at or beyond the limit
  if (diri->snaprealm &&
      diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
    respond_to_request(mdr, -CEPHFS_EMLINK);
    return;
  }

  // make sure name is unique
  if (diri->snaprealm &&
      diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -CEPHFS_EEXIST);
    return;
  }
  if (snapname.length() == 0 ||
      snapname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  // allocate a snapid
  if (!mdr->more()->stid) {
    // prepare an stid
    mds->snapclient->prepare_create(diri->ino(), snapname,
                                    mdr->get_mds_stamp(),
                                    &mdr->more()->stid, &mdr->more()->snapidbl,
                                    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  snapid_t snapid;
  auto p = mdr->more()->snapidbl.cbegin();
  decode(snapid, p);
  dout(10) << " stid " << stid << " snapid " << snapid << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  SnapPayload payload;
  if (req->get_data().length()) {
    try {
      auto iter = req->get_data().cbegin();
      decode(payload, iter);
    } catch (const ceph::buffer::error &e) {
      // backward compat -- client sends xattr bufferlist. however,
      // that is not used anywhere -- so (log and) ignore.
      dout(20) << ": no metadata in payload (old client?)" << dendl;
    }
  }

  // journal
  SnapInfo info;
  info.ino = diri->ino();
  info.snapid = snapid;
  info.name = snapname;
  info.stamp = mdr->get_op_stamp();
  info.metadata = payload.metadata;

  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->ctime = info.stamp;
  if (info.stamp > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = info.stamp;
  pi.inode->rstat.rsnaps++;
  pi.inode->version = diri->pre_dirty();

  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  newsnap.created = snapid;
  auto em = newsnap.snaps.emplace(std::piecewise_construct,
                                  std::forward_as_tuple(snapid),
                                  std::forward_as_tuple(info));
  if (!em.second)
    em.first->second = info;
  newsnap.seq = snapid;
  newsnap.last_created = snapid;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mksnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
                     mdr, __func__);
  mdlog->flush();
}
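
// Illustrative sketch: handle_client_mksnap() is written to be re-entered.
// The first pass finds no stid, kicks off an asynchronous snaptable prepare
// and returns; the C_MDS_RetryRequest completion re-queues the request, and
// the second pass decodes the snapid and journals. The control shape, with
// hypothetical names (Request, prepare_async):
#include <cstdint>
#include <functional>
#include <optional>

namespace mksnap_sketch {

struct Request {
  std::optional<uint64_t> stid;   // snaptable transaction id, once prepared
};

// Assumed helper, supplied by the embedding code: starts the prepare, later
// sets r.stid, then runs retry().
void prepare_async(Request& r, std::function<void()> retry);

inline void handle(Request& r, std::function<void()> retry)
{
  if (!r.stid) {
    prepare_async(r, std::move(retry));  // phase 1: nothing journaled yet
    return;
  }
  // phase 2: *r.stid is valid -> project the inode, journal, then commit
}

} // namespace mksnap_sketch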

void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
{
  dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;

  int op = (diri->snaprealm ? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);

  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  // create snap
  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, op);

  mdcache->do_realm_invalidate_and_update_notify(diri, op);

  // yay
  mdr->in[0] = diri;
  mdr->snapid = info.snapid;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}

// RMSNAP

struct C_MDS_rmsnap_finish : public ServerLogContext {
  CInode *diri;
  snapid_t snapid;
  C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_rmsnap_finish(mdr, diri, snapid);
  }
};

/* This function takes responsibility for the passed mdr */
void Server::handle_client_rmsnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) {
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }

  std::string_view snapname = req->get_filepath().last_dentry();

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid "
             << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;

  // does snap exist?
  if (snapname.length() == 0 || snapname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);  // can't prune a parent snap, currently.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return;
  }
  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
  dout(10) << " snapname " << snapname << " is " << snapid << dendl;

  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
        return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_destroy(diri->ino(), snapid,
                                     &mdr->more()->stid, &mdr->more()->snapidbl,
                                     new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
  version_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->version = diri->pre_dirty();
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->rstat.rsnaps--;

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rmsnap");
  mdlog->start_entry(le);

  // project the snaprealm
  auto &newnode = *pi.snapnode;
  newnode.snaps.erase(snapid);
  newnode.seq = seq;
  newnode.last_destroyed = seq;

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
  snapid_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);

  mdr->apply();

  mds->snapclient->commit(stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);

  // yay
  mdr->in[0] = diri;
  respond_to_request(mdr, 0);

  // purge snapshot data
  diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
}
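
// Illustrative sketch: the final purge_stale_snap_data() call drops cached
// per-snapshot metadata whose snapid is no longer in the realm's live snap
// set. A minimal analogue over std containers (names are hypothetical):
#include <cstdint>
#include <iterator>
#include <map>
#include <set>

namespace rmsnap_sketch {

template <typename T>
void purge_stale(std::map<uint64_t, T>& by_snapid,
                 const std::set<uint64_t>& live_snaps)
{
  // keep entries whose snapid is still live, erase the rest in one pass
  for (auto it = by_snapid.begin(); it != by_snapid.end(); )
    it = live_snaps.count(it->first) ? std::next(it) : by_snapid.erase(it);
}

} // namespace rmsnap_sketch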

struct C_MDS_renamesnap_finish : public ServerLogContext {
  CInode *diri;
  snapid_t snapid;
  C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_renamesnap_finish(mdr, diri, snapid);
  }
};

/* This function takes responsibility for the passed mdr */
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) {  // dir only
    respond_to_request(mdr, -CEPHFS_ENOTDIR);
    return;
  }

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    respond_to_request(mdr, -CEPHFS_EPERM);
    return;
  }

  std::string_view dstname = req->get_filepath().last_dentry();
  std::string_view srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);  // can't rename a parent snap.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -CEPHFS_ENOENT);
    return;
  }
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -CEPHFS_EINVAL);
    return;
  }
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -CEPHFS_EEXIST);
    return;
  }

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
        return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
                                    &mdr->more()->stid,
                                    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  dout(10) << " stid is " << stid << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto pi = diri->project_inode(mdr, false, true);
  pi.inode->ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
    pi.inode->rstat.rctime = mdr->get_op_stamp();
  pi.inode->version = diri->pre_dirty();

  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  auto it = newsnap.snaps.find(snapid);
  ceph_assert(it != newsnap.snaps.end());
  it->second.name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}
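
// Illustrative sketch: mksnap, rmsnap and renamesnap share the same locking
// prologue -- xlock the directory's snaplock, rdlock the parent's snap
// layout, then latch ALL_LOCKED so later retries of the same request skip
// straight past lock acquisition. The latch, reduced to a flag (names are
// stand-ins for MutationImpl, not MDS API):
namespace snaplock_sketch {

struct Mutation {
  unsigned locking_state = 0;
  static constexpr unsigned ALL_LOCKED = 1u << 0;
};

// Assumed helper: returns false if a lock was queued and the request parked.
bool acquire_all_locks(Mutation& m);

inline bool ensure_locked(Mutation& m)
{
  if (!(m.locking_state & Mutation::ALL_LOCKED)) {
    if (!acquire_all_locks(m))
      return false;                    // a waiter will retry the request
    m.locking_state |= Mutation::ALL_LOCKED;
  }
  return true;                         // safe on first pass and every retry
}

} // namespace snaplock_sketch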

void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);

  // yay
  mdr->in[0] = diri;
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
}

/**
 * Return true if the server is in state RECONNECT and this
 * client has not yet reconnected.
 */
bool Server::waiting_for_reconnect(client_t c) const
{
  return client_reconnect_gather.count(c) > 0;
}

void Server::dump_reconnect_status(Formatter *f) const
{
  f->open_object_section("reconnect_status");
  f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
  f->close_section();
}
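
// Illustrative sketch: dump_reconnect_status() follows the usual Formatter
// discipline -- open a section, dump fields, close the section -- so it can
// be embedded in any larger status dump. A toy analogue that emits JSON to a
// stream (std-only; ceph::Formatter is the real interface):
#include <ostream>
#include <set>

namespace reconnect_dump_sketch {

inline void dump_reconnect_status(std::ostream& os, const std::set<int>& gather)
{
  os << "{\"reconnect_status\":{\"client_reconnect_gather\":[";
  bool first = true;
  for (int c : gather) {
    os << (first ? "" : ",") << c;   // comma-separate the client ids
    first = false;
  }
  os << "]}}";
}

} // namespace reconnect_dump_sketch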