// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */

#include <boost/lexical_cast.hpp>
#include "include/ceph_assert.h"  // lexical_cast includes system assert.h

#include <boost/config/warning_disable.hpp>
#include <boost/fusion/include/std_pair.hpp>
#include <boost/range/adaptor/reversed.hpp>

#include "MDSRank.h"
#include "Server.h"
#include "Locker.h"
#include "MDCache.h"
#include "MDLog.h"
#include "Migrator.h"
#include "MDBalancer.h"
#include "InoTable.h"
#include "SnapClient.h"
#include "Mutation.h"
#include "cephfs_features.h"

#include "msg/Messenger.h"

#include "osdc/Objecter.h"

#include "events/EUpdate.h"
#include "events/ESlaveUpdate.h"
#include "events/ESession.h"
#include "events/EOpen.h"
#include "events/ECommitted.h"
#include "events/EPurged.h"

#include "include/stringify.h"
#include "include/filepath.h"
#include "common/errno.h"
#include "common/Timer.h"
#include "common/perf_counters.h"
#include "include/compat.h"
#include "osd/OSDMap.h"

#include <errno.h>
#include <math.h>

#include <list>
#include <iostream>
#include <string_view>

#include "common/config.h"

#define dout_context g_ceph_context
#define dout_subsys ceph_subsys_mds
#undef dout_prefix
#define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "

class ServerContext : public MDSContext {
 protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

 public:
  explicit ServerContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
};

class Batch_Getattr_Lookup : public BatchOp {
protected:
  Server* server;
  ceph::ref_t<MDRequestImpl> mdr;
  std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
  int res = 0;
public:
  Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
    : server(s), mdr(r) {
    if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP)
      mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
    else
      mdr->batch_op_map = &mdr->in[0]->batch_ops;
  }
  void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
    batch_reqs.push_back(r);
  }
  ceph::ref_t<MDRequestImpl> find_new_head() override {
    while (!batch_reqs.empty()) {
      auto r = std::move(batch_reqs.back());
      batch_reqs.pop_back();
      if (r->killed)
        continue;

      r->batch_op_map = mdr->batch_op_map;
      mdr->batch_op_map = nullptr;
      mdr = r;
      return mdr;
    }
    return nullptr;
  }
  void _forward(mds_rank_t t) override {
    MDCache* mdcache = server->mdcache;
    mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
    mdr->set_mds_stamp(ceph_clock_now());
    for (auto& m : batch_reqs) {
      if (!m->killed)
        mdcache->request_forward(m, t);
    }
    batch_reqs.clear();
  }
  void _respond(int r) override {
    mdr->set_mds_stamp(ceph_clock_now());
    for (auto& m : batch_reqs) {
      if (!m->killed) {
        m->tracei = mdr->tracei;
        m->tracedn = mdr->tracedn;
        server->respond_to_request(m, r);
      }
    }
    batch_reqs.clear();
    server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
  }
  void print(std::ostream& o) {
    o << "[batch front=" << *mdr << "]";
  }
};

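// Illustrative sketch (an editorial note, not part of the original source):
// how the batch op collapses duplicate work. Suppose several clients issue
// getattr on the same inode while it is being fetched; the first
// MDRequestImpl becomes the batch head and later arrivals are parked via
// add_request(). When the head completes, _respond() copies the head's trace
// (tracei/tracedn) into every parked request and replies to each client, so
// the expensive lookup path runs only once per batch.
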
class ServerLogContext : public MDSLogContextBase {
protected:
  Server *server;
  MDSRank *get_mds() override
  {
    return server->mds;
  }

  MDRequestRef mdr;
  void pre_finish(int r) override {
    if (mdr)
      mdr->mark_event("journal_committed: ");
  }
public:
  explicit ServerLogContext(Server *s) : server(s) {
    ceph_assert(server != NULL);
  }
  explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
    ceph_assert(server != NULL);
  }
};

void Server::create_logger()
{
  PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);

  plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
                      "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
                      "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_handle_client_session,
                      "handle_client_session", "Client session messages", "hcs",
                      PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
                      "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
  plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
                      "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
                      PerfCountersBuilder::PRIO_INTERESTING);

  // fop latencies are useful
  plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
  plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
                   "Request type lookup hash of inode latency");
  plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
                   "Request type lookup inode latency");
  plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
                   "Request type lookup parent latency");
  plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
                   "Request type lookup name latency");
  plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
                   "Request type lookup latency");
  plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
                   "Request type lookup snapshot latency");
  plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
                   "Request type get attribute latency");
  plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
                   "Request type set attribute latency");
  plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
                   "Request type set file layout latency");
  plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
                   "Request type set directory layout latency");
  plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
                   "Request type set extended attribute latency");
  plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
                   "Request type remove extended attribute latency");
  plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
                   "Request type read directory latency");
  plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
                   "Request type set file lock latency");
  plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
                   "Request type get file lock latency");
  plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
                   "Request type create latency");
  plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
                   "Request type open latency");
  plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
                   "Request type make node latency");
  plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
                   "Request type link latency");
  plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
                   "Request type unlink latency");
  plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
                   "Request type remove directory latency");
  plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
                   "Request type rename latency");
  plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
                   "Request type make directory latency");
  plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
                   "Request type symbolic link latency");
  plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
                   "Request type list snapshot latency");
  plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
                   "Request type make snapshot latency");
  plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
                   "Request type remove snapshot latency");
  plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
                   "Request type rename snapshot latency");

  plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
  plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
                      "Client requests dispatched");
  plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request",
                      "Server requests dispatched");

  logger = plb.create_perf_counters();
  g_ceph_context->get_perfcounters_collection()->add(logger);
}

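// Illustrative note (an assumption, not from this file): these counters are
// exposed under the "mds_server" section of the admin-socket perf dump, e.g.
//   ceph daemon mds.<id> perf dump mds_server
// which is a convenient way to watch request rates and the per-request-type
// latency averages registered above.
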
Server::Server(MDSRank *m) :
  mds(m),
  mdcache(mds->mdcache), mdlog(mds->mdlog),
  recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate"))
{
  forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
  replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
  cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
  max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
  delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
  max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
  max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
  caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
  supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
}

void Server::dispatch(const cref_t<Message> &m)
{
  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_RECONNECT:
    handle_client_reconnect(ref_cast<MClientReconnect>(m));
    return;
  }

/*
 * In the reconnect phase, a client may have sent unsafe requests to the mds
 * before its reconnect msg. Setting sessionclosed_isok handles scenarios like
 * this:
 *
 * 1. In the reconnect phase, the client sent unsafe requests to the mds.
 * 2. The reconnect timeout was reached. All sessions that did not send a
 *    reconnect msg in time, some of which may have sent unsafe requests, are
 *    marked as closed. (Another situation is #31668, which denies all client
 *    reconnect msgs to speed up reboot.)
 * 3. These unsafe requests, from sessions that missed the reconnect window or
 *    were denied, can then still be handled in the clientreplay phase.
 */
  bool sessionclosed_isok = replay_unsafe_with_closed_session;
  // active?
  // handle_slave_request()/handle_client_session() will wait if necessary
  if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
    const auto &req = ref_cast<MClientRequest>(m);
    if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
      Session *session = mds->get_session(req);
      if (!session || (!session->is_open() && !sessionclosed_isok)) {
        dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
        return;
      }
      bool queue_replay = false;
      if (req->is_replay() || req->is_async()) {
        dout(3) << "queuing replayed op" << dendl;
        queue_replay = true;
        if (req->head.ino &&
            !session->have_completed_request(req->get_reqid().tid, nullptr)) {
          mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino));
        }
      } else if (req->get_retry_attempt()) {
        // process completed request in clientreplay stage. The completed request
        // might have created new files/directories. This guarantees the MDS sends
        // a reply to the client before another request modifies the new
        // files/directories.
        if (session->have_completed_request(req->get_reqid().tid, NULL)) {
          dout(3) << "queuing completed op" << dendl;
          queue_replay = true;
        }
        // this request was created before the cap reconnect message, drop any embedded
        // cap releases.
        req->releases.clear();
      }
      if (queue_replay) {
        req->mark_queued_for_replay();
        mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
        return;
      }
    }

    bool wait_for_active = true;
    if (mds->is_stopping()) {
      wait_for_active = false;
    } else if (mds->is_clientreplay()) {
      if (req->is_queued_for_replay()) {
        wait_for_active = false;
      }
    }
    if (wait_for_active) {
      dout(3) << "not active yet, waiting" << dendl;
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  switch (m->get_type()) {
  case CEPH_MSG_CLIENT_SESSION:
    handle_client_session(ref_cast<MClientSession>(m));
    return;
  case CEPH_MSG_CLIENT_REQUEST:
    handle_client_request(ref_cast<MClientRequest>(m));
    return;
  case CEPH_MSG_CLIENT_RECLAIM:
    handle_client_reclaim(ref_cast<MClientReclaim>(m));
    return;
  case MSG_MDS_SLAVE_REQUEST:
    handle_slave_request(ref_cast<MMDSSlaveRequest>(m));
    return;
  default:
    derr << "server unknown message " << m->get_type() << dendl;
    ceph_abort_msg("server unknown message");
  }
}


// ----------------------------------------------------------
// SESSION management

class C_MDS_session_finish : public ServerLogContext {
  Session *session;
  uint64_t state_seq;
  bool open;
  version_t cmapv;
  interval_set<inodeno_t> inos;
  version_t inotablev;
  interval_set<inodeno_t> purge_inos;
  LogSegment *ls = nullptr;
  Context *fin;
public:
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t> i, version_t iv, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(std::move(i)), inotablev(iv), fin(fin_) { }
  C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t> i, version_t iv,
                       interval_set<inodeno_t> _purge_inos, LogSegment *_ls, Context *fin_ = NULL) :
    ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(std::move(i)), inotablev(iv), purge_inos(std::move(_purge_inos)), ls(_ls), fin(fin_) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_session_logged(session, state_seq, open, cmapv, inos, inotablev, purge_inos, ls);
    if (fin) {
      fin->complete(r);
    }
  }
};

Session* Server::find_session_by_uuid(std::string_view uuid)
{
  Session* session = nullptr;
  for (auto& it : mds->sessionmap.get_sessions()) {
    auto& metadata = it.second->info.client_metadata;

    auto p = metadata.find("uuid");
    if (p == metadata.end() || p->second != uuid)
      continue;

    if (!session) {
      session = it.second;
    } else if (!session->reclaiming_from) {
      assert(it.second->reclaiming_from == session);
      session = it.second;
    } else {
      assert(session->reclaiming_from == it.second);
    }
  }
  return session;
}

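// Illustrative note (an assumption, not from this file): the "uuid" entry is
// client-supplied session metadata. A libcephfs consumer such as an NFS
// gateway can, for example, call ceph_set_uuid() before mounting so that a
// restarted instance can later locate and reclaim its predecessor's session
// through this lookup.
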
void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
{
  if (!session->is_open() && !session->is_stale()) {
    dout(10) << "session not open, dropping this req" << dendl;
    return;
  }

  auto reply = make_message<MClientReclaimReply>(0);
  if (m->get_uuid().empty()) {
    dout(10) << __func__ << " invalid message (no uuid)" << dendl;
    reply->set_result(-EINVAL);
    mds->send_message_client(reply, session);
    return;
  }

  unsigned flags = m->get_flags();
  if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
    dout(10) << __func__ << " unsupported flags" << dendl;
    reply->set_result(-EOPNOTSUPP);
    mds->send_message_client(reply, session);
    return;
  }

  Session* target = find_session_by_uuid(m->get_uuid());
  if (target) {
    if (session->info.auth_name != target->info.auth_name) {
      dout(10) << __func__ << " session auth_name " << session->info.auth_name
               << " != target auth_name " << target->info.auth_name << dendl;
      reply->set_result(-EPERM);
      mds->send_message_client(reply, session);
    }

    assert(!target->reclaiming_from);
    assert(!session->reclaiming_from);
    session->reclaiming_from = target;
    reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
  }

  if (flags & CEPH_RECLAIM_RESET) {
    finish_reclaim_session(session, reply);
    return;
  }

  ceph_abort();
}

void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
{
  Session *target = session->reclaiming_from;
  if (target) {
    session->reclaiming_from = nullptr;

    Context *send_reply;
    if (reply) {
      int64_t session_id = session->get_client().v;
      send_reply = new LambdaContext([this, session_id, reply](int r) {
        assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
        Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
        if (!session) {
          return;
        }
        auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
        reply->set_epoch(epoch);
        mds->send_message_client(reply, session);
      });
    } else {
      send_reply = nullptr;
    }

    bool blacklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
      return map.is_blacklisted(target->info.inst.addr);
    });

    if (blacklisted || !g_conf()->mds_session_blacklist_on_evict) {
      kill_session(target, send_reply);
    } else {
      std::stringstream ss;
      mds->evict_client(target->get_client().v, false, true, ss, send_reply);
    }
  } else if (reply) {
    mds->send_message_client(reply, session);
  }
}

void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
{
  Session *session = mds->get_session(m);
  dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
  assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    return;
  }

  if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
    mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
    return;
  }

  if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
    finish_reclaim_session(session);
  } else {
    reclaim_session(session, m);
  }
}

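// Illustrative sketch of the reclaim handshake (an assumption, not spelled
// out in this file): a replacement client sends MClientReclaim carrying its
// uuid with CEPH_RECLAIM_RESET; the MDS locates the stale session via
// find_session_by_uuid(), evicts or blacklists it, and returns the old
// session's address plus an OSD epoch in MClientReclaimReply. The client
// then sends a second MClientReclaim with FLAG_FINISH to complete the
// takeover.
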
void Server::handle_client_session(const cref_t<MClientSession> &m)
{
  version_t pv;
  Session *session = mds->get_session(m);

  dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
  ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!

  if (!session) {
    dout(0) << " ignoring sessionless msg " << *m << dendl;
    auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
    reply->metadata["error_string"] = "sessionless";
    mds->send_message(reply, m->get_connection());
    return;
  }

  if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
    // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
  } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
    // close requests need to be handled when mds is active
    if (mds->get_state() < MDSMap::STATE_ACTIVE) {
      mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
      return;
    }
  } else {
    if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
      mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
      return;
    }
  }

  if (logger)
    logger->inc(l_mdss_handle_client_session);

  uint64_t sseq = 0;
  switch (m->get_op()) {
  case CEPH_SESSION_REQUEST_OPEN:
    if (session->is_opening() ||
        session->is_open() ||
        session->is_stale() ||
        session->is_killing() ||
        terminating_sessions) {
      dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
      return;
    }
    ceph_assert(session->is_closed() || session->is_closing());

    if (mds->is_stopping()) {
      dout(10) << "mds is stopping, dropping open req" << dendl;
      return;
    }

    {
      auto& addr = session->info.inst.addr;
      session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
      auto& client_metadata = session->info.client_metadata;

      auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
        auto now = ceph_clock_now();
        auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
        auto elapsed = now - m->get_recv_stamp();
        CachedStackStringStream css;
        *css << "New client session:"
             << " addr=\"" << session->info.inst.addr << "\""
             << ",elapsed=" << elapsed
             << ",throttled=" << throttle_elapsed
             << ",status=\"" << status << "\"";
        if (!err.empty()) {
          *css << ",error=\"" << err << "\"";
        }
        const auto& metadata = session->info.client_metadata;
        if (auto it = metadata.find("root"); it != metadata.end()) {
          *css << ",root=\"" << it->second << "\"";
        }
        dout(2) << css->strv() << dendl;
      };

      auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str) {
        auto m = make_message<MClientSession>(CEPH_SESSION_REJECT);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          m->metadata["error_string"] = err_str;
        mds->send_message_client(m, session);
        log_session_status("REJECTED", err_str);
      };

      bool blacklisted = mds->objecter->with_osdmap(
          [&addr](const OSDMap &osd_map) -> bool {
            return osd_map.is_blacklisted(addr);
          });

      if (blacklisted) {
        dout(10) << "rejecting blacklisted client " << addr << dendl;
        send_reject_message("blacklisted");
        session->clear();
        break;
      }

      if (client_metadata.features.empty())
        infer_supported_features(session, client_metadata);

      dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
      dout(20) << "  features: '" << client_metadata.features << "'" << dendl;
      dout(20) << "  metric specification: [" << client_metadata.metric_spec << "]" << dendl;
      for (const auto& p : client_metadata) {
        dout(20) << "  " << p.first << ": " << p.second << dendl;
      }

      feature_bitset_t missing_features = required_client_features;
      missing_features -= client_metadata.features;
      if (!missing_features.empty()) {
        stringstream ss;
        ss << "missing required features '" << missing_features << "'";
        send_reject_message(ss.str());
        mds->clog->warn() << "client session (" << session->info.inst
                          << ") lacks required features " << missing_features
                          << "; client supports " << client_metadata.features;
        session->clear();
        break;
      }

      // Special case for the 'root' metadata path; validate that the claimed
      // root is actually within the caps of the session
      if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
        auto claimed_root = it->second;
        stringstream ss;
        bool denied = false;
        // claimed_root has a leading "/" which we strip before passing
        // into caps check
        if (claimed_root.empty() || claimed_root[0] != '/') {
          denied = true;
          ss << "invalid root '" << claimed_root << "'";
        } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
          denied = true;
          ss << "non-allowable root '" << claimed_root << "'";
        }

        if (denied) {
          // Tell the client we're rejecting their open
          send_reject_message(ss.str());
          mds->clog->warn() << "client session with " << ss.str()
                            << " denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
        if (find_session_by_uuid(it->second)) {
          send_reject_message("duplicated session uuid");
          mds->clog->warn() << "client session with duplicated session uuid '"
                            << it->second << "' denied (" << session->info.inst << ")";
          session->clear();
          break;
        }
      }

      if (session->is_closed())
        mds->sessionmap.add_session(session);

      pv = mds->sessionmap.mark_projected(session);
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      mds->sessionmap.touch_session(session);
      auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
        ceph_assert(r == 0);
        log_session_status("ACCEPTED", "");
      });
      mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
                                new C_MDS_session_finish(this, session, sseq, true, pv, fin));
      mdlog->flush();
    }
    break;

  case CEPH_SESSION_REQUEST_RENEWCAPS:
    if (session->is_open() || session->is_stale()) {
      mds->sessionmap.touch_session(session);
      if (session->is_stale()) {
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->locker->resume_stale_caps(session);
        mds->sessionmap.touch_session(session);
      }
      auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
      mds->send_message_client(reply, session);
    } else {
      dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
    }
    break;

  case CEPH_SESSION_REQUEST_CLOSE:
    {
      if (session->is_closed() ||
          session->is_closing() ||
          session->is_killing()) {
        dout(10) << "already closed|closing|killing, dropping this req" << dendl;
        return;
      }
      if (session->is_importing()) {
        dout(10) << "ignoring close req on importing session" << dendl;
        return;
      }
      ceph_assert(session->is_open() ||
                  session->is_stale() ||
                  session->is_opening());
      if (m->get_seq() < session->get_push_seq()) {
        dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
                 << ", dropping" << dendl;
        return;
      }
      // We are getting a seq that is higher than expected.
      // Handle the same as any other seqn error.
      //
      if (m->get_seq() != session->get_push_seq()) {
        dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
                << ", BUGGY!" << dendl;
        mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
                          << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
        return;
      }
      journal_close_session(session, Session::STATE_CLOSING, NULL);
    }
    break;

  case CEPH_SESSION_FLUSHMSG_ACK:
    finish_flush_session(session, m->get_seq());
    break;

  case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
    if (mds->is_active())
      mdlog->flush();
    break;

  default:
    ceph_abort();
  }
}

void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
  if (!session->is_open() ||
      !session->get_connection() ||
      !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
    return;
  }

  version_t seq = session->wait_for_flush(gather.new_sub());
  mds->send_message_client(
    make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
}

void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
{
  for (const auto& client : client_set) {
    Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
    ceph_assert(session);
    flush_session(session, gather);
  }
}

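// Illustrative usage sketch (an assumption, following the MDSGatherBuilder
// pattern used elsewhere in the MDS): callers aggregate one sub-context per
// flushed session and fire a single completion once every FLUSHMSG_ACK has
// arrived, roughly:
//
//   MDSGatherBuilder gather(g_ceph_context);
//   flush_client_sessions(clients, gather);
//   if (gather.has_subs()) {
//     gather.set_finisher(new C_MDS_RetryMessage(mds, m));
//     gather.activate();
//   }
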
void Server::finish_flush_session(Session *session, version_t seq)
{
  MDSContext::vec finished;
  session->finish_flush(seq, finished);
  mds->queue_waiters(finished);
}

void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
                             const interval_set<inodeno_t>& inos, version_t piv,
                             const interval_set<inodeno_t>& purge_inos, LogSegment *ls)
{
  dout(10) << "_session_logged " << session->info.inst
           << " state_seq " << state_seq
           << " " << (open ? "open":"close")
           << " " << pv
           << " purge_inos : " << purge_inos << dendl;

  if (NULL != ls) {
    dout(10) << "_session_logged seq : " << ls->seq << dendl;
    if (purge_inos.size()) {
      ls->purge_inodes.insert(purge_inos);
      mdcache->purge_inodes(purge_inos, ls);
    }
  }

  if (piv) {
    ceph_assert(session->is_closing() || session->is_killing() ||
                session->is_opening()); // re-open closing session
    session->info.prealloc_inos.subtract(inos);
    session->delegated_inos.clear();
    mds->inotable->apply_release_ids(inos);
    ceph_assert(mds->inotable->get_version() == piv);
  }

  mds->sessionmap.mark_dirty(session);

  // apply
  if (session->get_state_seq() != state_seq) {
    dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
             << ", noop" << dendl;
    // close must have been canceled (by an import?), or any number of other things..
  } else if (open) {
    ceph_assert(session->is_opening());
    mds->sessionmap.set_state(session, Session::STATE_OPEN);
    mds->sessionmap.touch_session(session);
    ceph_assert(session->get_connection());
    auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
    if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
      reply->supported_features = supported_features;
    mds->send_message_client(reply, session);
    if (mdcache->is_readonly()) {
      auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
      mds->send_message_client(m, session);
    }
  } else if (session->is_closing() ||
             session->is_killing()) {
    // kill any lingering capabilities, leases, requests
    bool killing = session->is_killing();
    while (!session->caps.empty()) {
      Capability *cap = session->caps.front();
      CInode *in = cap->get_inode();
      dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
      mds->locker->remove_client_cap(in, cap, killing);
    }
    while (!session->leases.empty()) {
      ClientLease *r = session->leases.front();
      CDentry *dn = static_cast<CDentry*>(r->parent);
      dout(20) << " killing client lease of " << *dn << dendl;
      dn->remove_client_lease(r, mds->locker);
    }
    if (client_reconnect_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reconnect set" << dendl;
      if (client_reconnect_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
        reconnect_gather_finish();
      }
    }
    if (client_reclaim_gather.erase(session->info.get_client())) {
      dout(20) << " removing client from reclaim set" << dendl;
      if (client_reclaim_gather.empty()) {
        dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
        mds->maybe_clientreplay_done();
      }
    }

    if (session->is_closing()) {
      // mark con disposable. if there is a fault, we will get a
      // reset and clean it up. if the client hasn't received the
      // CLOSE message yet, they will reconnect and get an
      // ms_handle_remote_reset() and realize they had in fact closed.
      // do this *before* sending the message to avoid a possible
      // race.
      if (session->get_connection()) {
        // Conditional because terminate_sessions will indiscriminately
        // put sessions in CLOSING whether they ever had a conn or not.
        session->get_connection()->mark_disposable();
      }

      // reset session
      mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
      mds->sessionmap.set_state(session, Session::STATE_CLOSED);
      session->clear();
      mds->sessionmap.remove_session(session);
    } else if (session->is_killing()) {
      // destroy session, close connection
      if (session->get_connection()) {
        session->get_connection()->mark_down();
        mds->sessionmap.set_state(session, Session::STATE_CLOSED);
        session->set_connection(nullptr);
      }
      mds->sessionmap.remove_session(session);
    } else {
      ceph_abort();
    }
  } else {
    ceph_abort();
  }
}

/**
 * Inject sessions from some source other than actual connections.
 *
 * For example:
 *  - sessions inferred from journal replay
 *  - sessions learned from other MDSs during rejoin
 *  - sessions learned from other MDSs during dir/caps migration
 *  - sessions learned from other MDSs during a cross-MDS rename
 */
version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
                                              map<client_t,client_metadata_t>& cmm,
                                              map<client_t, pair<Session*,uint64_t> >& smap)
{
  version_t pv = mds->sessionmap.get_projected();

  dout(10) << "prepare_force_open_sessions " << pv
           << " on " << cm.size() << " clients"
           << dendl;

  mds->objecter->with_osdmap(
      [this, &cm, &cmm](const OSDMap &osd_map) {
        for (auto p = cm.begin(); p != cm.end(); ) {
          if (osd_map.is_blacklisted(p->second.addr)) {
            dout(10) << " ignoring blacklisted client." << p->first
                     << " (" << p->second.addr << ")" << dendl;
            cmm.erase(p->first);
            cm.erase(p++);
          } else {
            ++p;
          }
        }
      });

  for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
    Session *session = mds->sessionmap.get_or_add_session(p->second);
    pv = mds->sessionmap.mark_projected(session);
    uint64_t sseq;
    if (session->is_closed() ||
        session->is_closing() ||
        session->is_killing()) {
      sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
      auto q = cmm.find(p->first);
      if (q != cmm.end())
        session->info.client_metadata.merge(q->second);
    } else {
      ceph_assert(session->is_open() ||
                  session->is_opening() ||
                  session->is_stale());
      sseq = 0;
    }
    smap[p->first] = make_pair(session, sseq);
    session->inc_importing();
  }
  return pv;
}

void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
                                        bool dec_import)
{
  /*
   * FIXME: need to carefully consider the race conditions between a
   * client trying to close a session and an MDS doing an import
   * trying to force open a session...
   */
  dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
           << " initial v " << mds->sessionmap.get_version() << dendl;

  for (auto &it : smap) {
    Session *session = it.second.first;
    uint64_t sseq = it.second.second;
    if (sseq > 0) {
      if (session->get_state_seq() != sseq) {
        dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
      } else {
        dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
        mds->sessionmap.set_state(session, Session::STATE_OPEN);
        mds->sessionmap.touch_session(session);

        auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
        if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
          reply->supported_features = supported_features;
        mds->send_message_client(reply, session);

        if (mdcache->is_readonly())
          mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
      }
    } else {
      dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
      ceph_assert(session->is_open() || session->is_stale());
    }

    if (dec_import) {
      session->dec_importing();
    }

    mds->sessionmap.mark_dirty(session);
  }

  dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
}

class C_MDS_TerminatedSessions : public ServerContext {
  void finish(int r) override {
    server->terminating_sessions = false;
  }
 public:
  explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
};

void Server::terminate_sessions()
{
  dout(5) << "terminating all sessions..." << dendl;

  terminating_sessions = true;

  // kill them off.  clients will retry etc.
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (set<Session*>::const_iterator p = sessions.begin();
       p != sessions.end();
       ++p) {
    Session *session = *p;
    if (session->is_closing() ||
        session->is_killing() ||
        session->is_closed())
      continue;
    journal_close_session(session, Session::STATE_CLOSING, NULL);
  }

  mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
}

void Server::find_idle_sessions()
{
  auto now = clock::now();
  auto last_cleared_laggy = mds->last_cleared_laggy();

  dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;

  // timeout/stale
  //  (caps go stale, lease die)
  double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
  double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();

  // don't kick clients if we've been laggy
  if (last_cleared_laggy < cutoff) {
    dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
             << "), not marking any client stale" << dendl;
    return;
  }

  std::vector<Session*> to_evict;

  bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
  const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
  if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
    std::vector<Session*> new_stale;

    for (auto session : *(sessions_p1->second)) {
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "laggiest active session is " << session->info.inst
                 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
        break;
      }

      if (session->last_seen > session->last_cap_renew) {
        last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
        if (last_cap_renew_span < cutoff) {
          dout(20) << "laggiest active session is " << session->info.inst
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }
      }

      if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
        dout(20) << "evicting session " << session->info.inst << " since autoclose "
                    "has arrived" << dendl;
        // evict session without marking it stale
        to_evict.push_back(session);
        continue;
      }

      if (defer_session_stale &&
          !session->is_any_flush_waiter() &&
          !mds->locker->is_revoking_any_caps_from(session->get_client())) {
        dout(20) << "deferring marking session " << session->info.inst << " stale "
                    "since it holds no caps" << dendl;
        continue;
      }

      auto it = session->info.client_metadata.find("timeout");
      if (it != session->info.client_metadata.end()) {
        unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
        if (timeout == 0) {
          dout(10) << "skipping session " << session->info.inst
                   << ", infinite timeout specified" << dendl;
          continue;
        }
        double cutoff = queue_max_age + timeout;
        if (last_cap_renew_span < cutoff) {
          dout(10) << "skipping session " << session->info.inst
                   << ", timeout (" << timeout << ") specified"
                   << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
          continue;
        }

        // do not go through stale, evict it directly.
        to_evict.push_back(session);
      } else {
        dout(10) << "new stale session " << session->info.inst
                 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
        new_stale.push_back(session);
      }
    }

    for (auto session : new_stale) {
      mds->sessionmap.set_state(session, Session::STATE_STALE);
      if (mds->locker->revoke_stale_caps(session)) {
        mds->locker->remove_stale_leases(session);
        finish_flush_session(session, session->get_push_seq());
        auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
        mds->send_message_client(m, session);
      } else {
        to_evict.push_back(session);
      }
    }
  }

  // autoclose
  cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();

  // Collect a list of sessions exceeding the autoclose threshold
  const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
  if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
    for (auto session : *(sessions_p2->second)) {
      assert(session->is_stale());
      auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
      if (last_cap_renew_span < cutoff) {
        dout(20) << "oldest stale session is " << session->info.inst
                 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
        break;
      }
      to_evict.push_back(session);
    }
  }

  for (auto session: to_evict) {
    if (session->is_importing()) {
      dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
      continue;
    }

    auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
    mds->clog->warn() << "evicting unresponsive client " << *session
                      << ", after " << last_cap_renew_span << " seconds";
    dout(10) << "autoclosing stale session " << session->info.inst
             << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;

    if (g_conf()->mds_session_blacklist_on_timeout) {
      std::stringstream ss;
      mds->evict_client(session->get_client().v, false, true, ss, nullptr);
    } else {
      kill_session(session, NULL);
    }
  }
}

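// Worked example (numbers are assumptions based on common defaults, not
// stated in this file): with mds_session_timeout = 60s,
// mds_session_autoclose = 300s and a dispatch queue age of 5s, an open
// session becomes eligible for STALE once it has gone 65s without renewing
// caps (queue_max_age + session_timeout), and is evicted outright once the
// gap reaches 305s (queue_max_age + autoclose).
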
void Server::evict_cap_revoke_non_responders() {
  if (!cap_revoke_eviction_timeout) {
    return;
  }

  auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);

  for (auto const &client: to_evict) {
    mds->clog->warn() << "client id " << client << " has not responded to"
                      << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
                      << " seconds, evicting";
    dout(1) << __func__ << ": evicting cap revoke non-responder client id "
            << client << dendl;

    std::stringstream ss;
    bool evicted = mds->evict_client(client.v, false,
                                     g_conf()->mds_session_blacklist_on_evict,
                                     ss, nullptr);
    if (evicted && logger) {
      logger->inc(l_mdss_cap_revoke_eviction);
    }
  }
}

void Server::handle_conf_change(const std::set<std::string>& changed) {
  if (changed.count("mds_forward_all_requests_to_auth")) {
    forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
  }
  if (changed.count("mds_cap_revoke_eviction_timeout")) {
    cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
    dout(20) << __func__ << " cap revoke eviction timeout changed to "
             << cap_revoke_eviction_timeout << dendl;
  }
  if (changed.count("mds_recall_max_decay_rate")) {
    recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
  }
  if (changed.count("mds_max_snaps_per_dir")) {
    max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
    dout(20) << __func__ << " max snapshots per directory changed to "
             << max_snaps_per_dir << dendl;
  }
  if (changed.count("mds_client_delegate_inos_pct")) {
    delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
  }
  if (changed.count("mds_max_caps_per_client")) {
    max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
  }
  if (changed.count("mds_session_cap_acquisition_throttle")) {
    cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
  }
  if (changed.count("mds_session_max_caps_throttle_ratio")) {
    max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
  }
  if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
    caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
  }
}

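// Illustrative note (an assumption, not from this file): because these
// options are observed here, they can be changed at runtime without an MDS
// restart, e.g.
//   ceph config set mds mds_cap_revoke_eviction_timeout 300
// after which handle_conf_change() picks up the new value on the next
// observer callback.
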
/*
 * XXX bump in the interface here, not using an MDSContext here
 * because all the callers right now happen to use a SaferCond
 */
void Server::kill_session(Session *session, Context *on_safe, bool need_purge_inos)
{
  ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));

  if ((session->is_opening() ||
       session->is_open() ||
       session->is_stale()) &&
      !session->is_importing()) {
    dout(10) << "kill_session " << session << dendl;
    journal_close_session(session, Session::STATE_KILLING, on_safe, need_purge_inos);
  } else {
    dout(10) << "kill_session importing or already closing/killing " << session << dendl;
    if (session->is_closing() ||
        session->is_killing()) {
      if (on_safe)
        mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
    } else {
      ceph_assert(session->is_closed() ||
                  session->is_importing());
      if (on_safe)
        on_safe->complete(0);
    }
  }
}

size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
{
  bool prenautilus = mds->objecter->with_osdmap(
      [&](const OSDMap& o) {
        return o.require_osd_release < ceph_release_t::nautilus;
      });

  std::vector<Session*> victims;
  const auto& sessions = mds->sessionmap.get_sessions();
  for (const auto& p : sessions) {
    if (!p.first.is_client()) {
      // Do not apply OSDMap blacklist to MDS daemons, we find out
      // about their death via MDSMap.
      continue;
    }

    Session *s = p.second;
    auto inst_addr = s->info.inst.addr;
    // blacklist entries are always TYPE_ANY for nautilus+
    inst_addr.set_type(entity_addr_t::TYPE_ANY);
    if (blacklist.count(inst_addr)) {
      victims.push_back(s);
      continue;
    }
    if (prenautilus) {
      // ...except pre-nautilus, they were TYPE_LEGACY
      inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
      if (blacklist.count(inst_addr)) {
        victims.push_back(s);
      }
    }
  }

  for (const auto& s : victims) {
    kill_session(s, nullptr);
  }

  dout(10) << "apply_blacklist: killed " << victims.size() << dendl;

  return victims.size();
}

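// Illustrative note (an assumption, not from this file): entity_addr_t
// equality includes the address type, so a session address recorded with one
// type would never match a blacklist entry encoded with another. Normalizing
// the copied address to TYPE_ANY (and retrying as TYPE_LEGACY against
// pre-nautilus maps) makes the lookup depend only on ip:port/nonce rather
// than on how the entry was encoded.
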
void Server::journal_close_session(Session *session, int state, Context *on_safe, bool need_purge_inos)
{
  dout(10) << __func__ << " : "
           << "(" << need_purge_inos << ")"
           << session->info.inst
           << "(" << session->info.prealloc_inos.size() << "|" << session->pending_prealloc_inos.size() << ")" << dendl;

  uint64_t sseq = mds->sessionmap.set_state(session, state);
  version_t pv = mds->sessionmap.mark_projected(session);
  version_t piv = 0;

  // release alloc and pending-alloc inos for this session
  // and wipe out session state, in case the session close aborts for some reason
  interval_set<inodeno_t> both;
  both.insert(session->pending_prealloc_inos);
  if (!need_purge_inos)
    both.insert(session->info.prealloc_inos);
  if (both.size()) {
    mds->inotable->project_release_ids(both);
    piv = mds->inotable->get_projected_version();
  } else
    piv = 0;

  if (need_purge_inos && session->info.prealloc_inos.size()) {
    dout(10) << "start purging inodes " << session->info.prealloc_inos << dendl;
    LogSegment* ls = mdlog->get_current_segment();
    LogEvent* e = new ESession(session->info.inst, false, pv, both, piv, session->info.prealloc_inos);
    MDSLogContextBase* c = new C_MDS_session_finish(this, session, sseq, false, pv, both, piv,
                                                    session->info.prealloc_inos, ls, on_safe);
    mdlog->start_submit_entry(e, c);
  } else {
    interval_set<inodeno_t> empty;
    LogEvent* e = new ESession(session->info.inst, false, pv, both, piv, empty);
    MDSLogContextBase* c = new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe);
    mdlog->start_submit_entry(e, c);
  }
  mdlog->flush();

  // clean up requests, too
  for (auto p = session->requests.begin(); !p.end(); ) {
    MDRequestRef mdr(*p);
    ++p;
    mdcache->request_kill(mdr);
  }

  finish_flush_session(session, session->get_push_seq());
}

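// Illustrative note (an assumption, not from this file): "both" is an
// interval_set<inodeno_t>, so whole preallocated ino ranges are released in
// a single journaled ESession. A minimal sketch of the container's behavior:
//
//   interval_set<inodeno_t> s;
//   s.insert(inodeno_t(100), 10);  // inos 100..109
//   s.insert(inodeno_t(110), 5);   // merges into one range, 100..114
//
// When need_purge_inos is set, the session's preallocated inos are journaled
// for purging instead of being returned to the InoTable immediately.
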
void Server::reconnect_clients(MDSContext *reconnect_done_)
{
  reconnect_done = reconnect_done_;

  auto now = clock::now();
  set<Session*> sessions;
  mds->sessionmap.get_client_session_set(sessions);
  for (auto session : sessions) {
    if (session->is_open()) {
      client_reconnect_gather.insert(session->get_client());
      session->set_reconnecting(true);
      session->last_cap_renew = now;
    }
  }

  if (client_reconnect_gather.empty()) {
    dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
    reconnect_gather_finish();
    return;
  }

  // clients will get the mdsmap and discover we're reconnecting via the monitor.

  reconnect_start = now;
  dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
  mds->sessionmap.dump();
}

9f95a23c 1363void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
7c673cae 1364{
11fdf7f2
TL
1365 dout(7) << "handle_client_reconnect " << m->get_source()
1366 << (m->has_more() ? " (more)" : "") << dendl;
7c673cae 1367 client_t from = m->get_source().num();
94b18763 1368 Session *session = mds->get_session(m);
92f5a8d4
TL
1369 if (!session) {
1370 dout(0) << " ignoring sessionless msg " << *m << dendl;
9f95a23c 1371 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
92f5a8d4
TL
1372 reply->metadata["error_string"] = "sessionless";
1373 mds->send_message(reply, m->get_connection());
81eedcae 1374 return;
92f5a8d4
TL
1375 }
1376
1377 if (!session->is_open()) {
1378 dout(0) << " ignoring msg from not-open session" << *m << dendl;
9f95a23c 1379 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
92f5a8d4
TL
1380 mds->send_message(reply, m->get_connection());
1381 return;
1382 }
7c673cae
FG
1383
1384 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1385 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1386 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1387 return;
1388 }
1389
f64942e4 1390 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
7c673cae
FG
1391 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1392
1393 bool deny = false;
b32b8144 1394 if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
7c673cae
FG
1395 // XXX maybe in the future we can do better than this?
1396 dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1397 mds->clog->info() << "denied reconnect attempt (mds is "
1398 << ceph_mds_state_name(mds->get_state())
1399 << ") from " << m->get_source_inst()
11fdf7f2 1400 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
7c673cae 1401 deny = true;
11fdf7f2
TL
1402 } else {
1403 std::string error_str;
1404 if (!session->is_open()) {
1405 error_str = "session is closed";
1406 } else if (mdcache->is_readonly()) {
1407 error_str = "mds is readonly";
1408 } else {
1409 if (session->info.client_metadata.features.empty())
1410 infer_supported_features(session, session->info.client_metadata);
1411
1412 feature_bitset_t missing_features = required_client_features;
1413 missing_features -= session->info.client_metadata.features;
1414 if (!missing_features.empty()) {
1415 stringstream ss;
1416 ss << "missing required features '" << missing_features << "'";
1417 error_str = ss.str();
1418 }
1419 }
1420
1421 if (!error_str.empty()) {
1422 deny = true;
1423 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1424 mds->clog->info() << "denied reconnect attempt from "
1425 << m->get_source_inst() << " (" << error_str << ")";
1426 }
7c673cae
FG
1427 }
1428
1429 if (deny) {
9f95a23c 1430 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
11fdf7f2
TL
1431 mds->send_message_client(r, session);
1432 if (session->is_open())
1433 kill_session(session, nullptr);
7c673cae
FG
1434 return;
1435 }
1436
11fdf7f2
TL
1437 if (!m->has_more()) {
1438 // notify client of success with an OPEN
9f95a23c 1439 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
11fdf7f2
TL
1440 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
1441 reply->supported_features = supported_features;
1442 mds->send_message_client(reply, session);
1443 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1444 }
1445
91327a77 1446 session->last_cap_renew = clock::now();
7c673cae
FG
1447
1448 // snaprealms
11fdf7f2
TL
1449 for (const auto &r : m->realms) {
1450 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
7c673cae
FG
1451 if (in && in->state_test(CInode::STATE_PURGING))
1452 continue;
1453 if (in) {
11fdf7f2
TL
1454 if (in->snaprealm) {
1455 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
7c673cae 1456 } else {
11fdf7f2
TL
1457 // this can happen if we are non-auth or we rolled back the snaprealm
1458 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
7c673cae 1459 }
11fdf7f2 1460 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
7c673cae 1461 } else {
11fdf7f2
TL
1462 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1463 << " seq " << r.realm.seq << dendl;
1464 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
7c673cae
FG
1465 }
1466 }
1467
1468 // caps
11fdf7f2 1469 for (const auto &p : m->caps) {
7c673cae 1470 // make sure our last_cap_id is MAX over all issued caps
11fdf7f2
TL
1471 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1472 mdcache->last_cap_id = p.second.capinfo.cap_id;
7c673cae 1473
11fdf7f2 1474 CInode *in = mdcache->get_inode(p.first);
7c673cae
FG
1475 if (in && in->state_test(CInode::STATE_PURGING))
1476 continue;
1477 if (in && in->is_auth()) {
1478 // we recovered it, and it's ours. take note.
11fdf7f2 1479 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
7c673cae 1480 << " on " << *in << dendl;
11fdf7f2
TL
1481 in->reconnect_cap(from, p.second, session);
1482 mdcache->add_reconnected_cap(from, p.first, p.second);
1483 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
7c673cae
FG
1484 continue;
1485 }
1486
1487 if (in && !in->is_auth()) {
1488 // not mine.
1489 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1490 // add to cap export list.
11fdf7f2
TL
1491 mdcache->rejoin_export_caps(p.first, from, p.second,
1492 in->authority().first, true);
7c673cae
FG
1493 } else {
1494 // don't know if the inode is mine
11fdf7f2
TL
1495 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1496 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
7c673cae
FG
1497 }
1498 }
1499
f64942e4
AA
1500 reconnect_last_seen = clock::now();
1501
11fdf7f2
TL
1502 if (!m->has_more()) {
1503 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1504
1505 // remove from gather set
1506 client_reconnect_gather.erase(from);
92f5a8d4 1507 session->set_reconnecting(false);
11fdf7f2
TL
1508 if (client_reconnect_gather.empty())
1509 reconnect_gather_finish();
1510 }
1511}
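// Illustrative sketch (not original code): the deny decision above, condensed.
// Any one of these conditions sends CEPH_SESSION_CLOSE back and kills an open
// session:
static inline bool example_deny_reconnect(bool mds_in_reconnect_state,
                                          bool reconnect_evicting,
                                          bool session_open,
                                          bool fs_readonly,
                                          bool missing_required_features)
{
  return !mds_in_reconnect_state || reconnect_evicting || !session_open ||
         fs_readonly || missing_required_features;
}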
1512
1513void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1514{
1515 int supported = -1;
1516 auto it = client_metadata.find("ceph_version");
1517 if (it != client_metadata.end()) {
1518 // user space client
1519 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1520 supported = CEPHFS_FEATURE_LUMINOUS;
1521 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1522 supported = CEPHFS_FEATURE_KRAKEN;
1523 } else {
1524 it = client_metadata.find("kernel_version");
1525 if (it != client_metadata.end()) {
1526 // kernel client
1527 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1528 supported = CEPHFS_FEATURE_LUMINOUS;
1529 }
1530 }
1531 if (supported == -1 &&
1532 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1533 supported = CEPHFS_FEATURE_JEWEL;
7c673cae 1534
11fdf7f2
TL
1535 if (supported >= 0) {
1536 unsigned long value = (1UL << (supported + 1)) - 1;
1537 client_metadata.features = feature_bitset_t(value);
1538 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1539 }
7c673cae
FG
1540}
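// Worked example (illustrative): CEPHFS feature bits are cumulative, so a
// client pinned at bit N implicitly has every bit below N as well. If, say,
// supported == 7, then (1UL << (7 + 1)) - 1 == 0xff, i.e. bits 0..7 all set.
// The actual bit assignments live in cephfs_features.h.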
1541
11fdf7f2
TL
1542void Server::update_required_client_features()
1543{
1544 vector<size_t> bits = CEPHFS_FEATURES_MDS_REQUIRED;
1545
9f95a23c
TL
1546 /* If this blows up on you, you added a release without adding a new release bit to cephfs_features.h */
1547 static_assert(CEPHFS_CURRENT_RELEASE == CEPH_RELEASE_MAX-1);
1548
1549 ceph_release_t min_compat = mds->mdsmap->get_min_compat_client();
1550 if (min_compat >= ceph_release_t::octopus)
1551 bits.push_back(CEPHFS_FEATURE_OCTOPUS);
1552 else if (min_compat >= ceph_release_t::nautilus)
11fdf7f2 1553 bits.push_back(CEPHFS_FEATURE_NAUTILUS);
9f95a23c 1554 else if (min_compat >= ceph_release_t::mimic)
11fdf7f2 1555 bits.push_back(CEPHFS_FEATURE_MIMIC);
9f95a23c 1556 else if (min_compat >= ceph_release_t::luminous)
11fdf7f2 1557 bits.push_back(CEPHFS_FEATURE_LUMINOUS);
9f95a23c 1558 else if (min_compat >= ceph_release_t::kraken)
11fdf7f2 1559 bits.push_back(CEPHFS_FEATURE_KRAKEN);
9f95a23c 1560 else if (min_compat >= ceph_release_t::jewel)
11fdf7f2
TL
1561 bits.push_back(CEPHFS_FEATURE_JEWEL);
1562
1563 std::sort(bits.begin(), bits.end());
1564 required_client_features = feature_bitset_t(bits);
1565 dout(7) << "required_client_features: " << required_client_features << dendl;
1566
1567 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1568 set<Session*> sessions;
1569 mds->sessionmap.get_client_session_set(sessions);
1570 for (auto session : sessions) {
1571 feature_bitset_t missing_features = required_client_features;
1572 missing_features -= session->info.client_metadata.features;
1573 if (!missing_features.empty()) {
1574 bool blacklisted = mds->objecter->with_osdmap(
1575 [session](const OSDMap &osd_map) -> bool {
1576 return osd_map.is_blacklisted(session->info.inst.addr);
1577 });
1578 if (blacklisted)
1579 continue;
7c673cae 1580
11fdf7f2
TL
1581 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1582 << missing_features << "'";
1583 std::stringstream ss;
1584 mds->evict_client(session->get_client().v, false,
1585 g_conf()->mds_session_blacklist_on_evict, ss);
1586 }
1587 }
1588 }
1589}
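// Illustrative sketch (not original code): "missing = required - client" in
// the eviction loop above is plain bitwise subtraction. Modeled on a simple
// integer mask:
static inline uint64_t example_missing_feature_bits(uint64_t required,
                                                    uint64_t client_has)
{
  return required & ~client_has; // non-zero => the session is denied/evicted
}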
7c673cae
FG
1590
1591void Server::reconnect_gather_finish()
1592{
1593 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
11fdf7f2
TL
1594 ceph_assert(reconnect_done);
1595
1596 if (!mds->snapclient->is_synced()) {
1597 // make sure the snaptable cache is populated. snaprealms will be
1598 // used extensively in the rejoin stage.
1599 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1600 mds->snapclient->wait_for_sync(reconnect_done);
1601 } else {
1602 reconnect_done->complete(0);
1603 }
7c673cae
FG
1604 reconnect_done = NULL;
1605}
1606
1607void Server::reconnect_tick()
1608{
31f18b77 1609 if (reconnect_evicting) {
f64942e4 1610 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
31f18b77
FG
1611 return;
1612 }
1613
f64942e4
AA
1614 if (client_reconnect_gather.empty())
1615 return;
31f18b77 1616
f64942e4
AA
1617 auto now = clock::now();
1618 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
11fdf7f2 1619 if (elapse1 < g_conf()->mds_reconnect_timeout)
f64942e4 1620 return;
31f18b77 1621
f64942e4
AA
1622 vector<Session*> remaining_sessions;
1623 remaining_sessions.reserve(client_reconnect_gather.size());
1624 for (auto c : client_reconnect_gather) {
1625 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1626 ceph_assert(session);
1627 remaining_sessions.push_back(session);
1628 // client re-sends cap flush messages before the reconnect message
1629 if (session->last_seen > reconnect_last_seen)
1630 reconnect_last_seen = session->last_seen;
1631 }
31f18b77 1632
f64942e4 1633 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
11fdf7f2 1634 if (elapse2 < g_conf()->mds_reconnect_timeout / 2) {
f64942e4
AA
1635 dout(7) << "reconnect_tick: last seen " << elapse2
1636 << " seconds ago, extending reconnect interval" << dendl;
1637 return;
1638 }
1639
1640 dout(7) << "reconnect timed out, " << remaining_sessions.size()
1641 << " clients have not reconnected in time" << dendl;
1642
1643 // If we're doing blacklist evictions, use this to wait for them before
1644 // proceeding to reconnect_gather_finish
1645 MDSGatherBuilder gather(g_ceph_context);
1646
1647 for (auto session : remaining_sessions) {
11fdf7f2
TL
1648 // Keep sessions that have specified a timeout. These sessions prevent the
1649 // MDS from going active; it goes active only after they have all been
1650 // killed or reclaimed.
1651 if (session->info.client_metadata.find("timeout") !=
1652 session->info.client_metadata.end()) {
1653 dout(1) << "reconnect keeps " << session->info.inst
1654 << ", need to be reclaimed" << dendl;
1655 client_reclaim_gather.insert(session->get_client());
1656 continue;
1657 }
1658
f64942e4 1659 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
31f18b77 1660
f64942e4
AA
1661 mds->clog->warn() << "evicting unresponsive client " << *session
1662 << ", after waiting " << elapse1
1663 << " seconds during MDS startup";
1664
11fdf7f2 1665 if (g_conf()->mds_session_blacklist_on_timeout) {
f64942e4
AA
1666 std::stringstream ss;
1667 mds->evict_client(session->get_client().v, false, true, ss,
1668 gather.new_sub());
31f18b77 1669 } else {
9f95a23c 1670 kill_session(session, NULL, true);
31f18b77 1671 }
f64942e4
AA
1672
1673 failed_reconnects++;
1674 }
1675 client_reconnect_gather.clear();
1676
1677 if (gather.has_subs()) {
1678 dout(1) << "reconnect will complete once clients are evicted" << dendl;
9f95a23c 1679 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
f64942e4
AA
1680 [this](int r){reconnect_gather_finish();})));
1681 gather.activate();
1682 reconnect_evicting = true;
1683 } else {
1684 reconnect_gather_finish();
7c673cae
FG
1685 }
1686}
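// Illustrative sketch (not original code): reconnect_tick() above runs two
// clocks -- time since reconnect started (elapse1) and time since a straggler
// was last heard from (elapse2) -- and only gives up when both have aged out:
static inline bool example_reconnect_timed_out(double elapse1, double elapse2,
                                               double mds_reconnect_timeout)
{
  if (elapse1 < mds_reconnect_timeout)
    return false; // still inside the base reconnect window
  if (elapse2 < mds_reconnect_timeout / 2)
    return false; // a laggard showed signs of life recently; extend the window
  return true;    // evict/kill whatever is left in the gather set
}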
1687
1688void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1689{
1690 if (!locks.length()) return;
1691 int numlocks;
1692 ceph_filelock lock;
11fdf7f2
TL
1693 auto p = locks.cbegin();
1694 decode(numlocks, p);
7c673cae 1695 for (int i = 0; i < numlocks; ++i) {
11fdf7f2 1696 decode(lock, p);
7c673cae
FG
1697 lock.client = client;
1698 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1699 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1700 }
11fdf7f2 1701 decode(numlocks, p);
7c673cae 1702 for (int i = 0; i < numlocks; ++i) {
11fdf7f2 1703 decode(lock, p);
7c673cae
FG
1704 lock.client = client;
1705 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1706 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1707 }
1708}
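// Illustrative note (not original): the `locks` buffer decoded above holds two
// back-to-back sections, fcntl locks first and then flock locks, each encoded
// as a count followed by that many ceph_filelock records:
//
//   [int numlocks][ceph_filelock x numlocks]   <- fcntl (POSIX) locks
//   [int numlocks][ceph_filelock x numlocks]   <- flock (BSD) locks
//
// The client field is overwritten on decode because the reconnecting client
// owns every lock it resends.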
1709
7c673cae
FG
1710/**
1711 * Call this when the MDCache is oversized, to send requests to the clients
1712 * to trim some caps, and consequently unpin some inodes in the MDCache so
1713 * that it can trim too.
1714 */
a8e16298
TL
1715std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1716{
1717 const auto now = clock::now();
92f5a8d4
TL
1718 const bool steady = !!(flags&RecallFlags::STEADY);
1719 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1720 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1721 const bool trim = !!(flags&RecallFlags::TRIM);
a8e16298 1722
11fdf7f2
TL
1723 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1724 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1725 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1726 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1727 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
92f5a8d4 1728 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
a8e16298
TL
1729
1730 dout(7) << __func__ << ":"
1731 << " min=" << min_caps_per_client
1732 << " max=" << max_caps_per_client
1733 << " total=" << Capability::count()
92f5a8d4 1734 << " flags=" << flags
a8e16298 1735 << dendl;
f64942e4 1736
a8e16298
TL
1737 /* trim caps of sessions with the most caps first */
1738 std::multimap<uint64_t, Session*> caps_session;
92f5a8d4 1739 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
a8e16298 1740 auto num_caps = s->caps.size();
92f5a8d4
TL
1741 auto cache_liveness = s->get_session_cache_liveness();
1742 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
a8e16298
TL
1743 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1744 }
1745 };
1746 mds->sessionmap.get_client_sessions(std::move(f));
1747
1748 std::pair<bool, uint64_t> result = {false, 0};
11fdf7f2 1749 auto& [throttled, caps_recalled] = result;
a8e16298 1750 last_recall_state = now;
11fdf7f2 1751 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
7c673cae 1752 if (!session->is_open() ||
11fdf7f2 1753 !session->get_connection() ||
7c673cae
FG
1754 !session->info.inst.name.is_client())
1755 continue;
1756
a8e16298
TL
1757 dout(10) << __func__ << ":"
1758 << " session " << session->info.inst
1759 << " caps " << num_caps
7c673cae
FG
1760 << ", leases " << session->leases.size()
1761 << dendl;
1762
a8e16298
TL
1763 uint64_t newlim;
1764 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1765 newlim = min_caps_per_client;
1766 } else {
1767 newlim = num_caps-recall_max_caps;
1768 }
1769 if (num_caps > newlim) {
1770 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1771 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1772 newlim = num_caps-recall;
1773 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
11fdf7f2
TL
1774 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1775 const uint64_t global_recall_throttle = recall_throttle.get();
a8e16298
TL
1776 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1777 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1778 throttled = true;
1779 continue;
11fdf7f2
TL
1780 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1781 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1782 throttled = true;
1783 continue;
a8e16298
TL
1784 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1785 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1786 throttled = true;
1787 break;
1788 }
1789
1790 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1791 if (steady) {
1792 const auto session_recall = session->get_recall_caps();
1793 const auto session_release = session->get_release_caps();
1794 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1795 /* The session has released fewer than half the caps we recalled;
1796 * additionally, to avoid penalizing sessions we have only just begun
1797 * recalling from, twice the session_recall counter (decayed count of
1798 * caps recently recalled) must exceed the session's cap recall
1799 * throttle threshold before we skip it.
1800 */
1801 dout(15) << " 2*session_release < session_recall"
11fdf7f2
TL
1802 " (2*" << session_release << " < " << session_recall << ") &&"
1803 " 2*session_recall < recall_max_decay_threshold"
1804 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
a8e16298
TL
1805 " Skipping because we are unlikely to get more released." << dendl;
1806 continue;
1807 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1808 /* The number of caps being recalled is less than one full batch (so
1809 * there isn't much left to recall?) and it is also less than half the
1810 * session's recall_caps counter (decayed count of caps recently
1811 * recalled), so the client is unlikely to release more.
1812 */
1813 dout(15) << " 2*recall < session_recall "
1814 " (2*" << recall << " < " << session_recall << ") &&"
1815 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1816 " Skipping because we are unlikely to get more released." << dendl;
1817 continue;
1818 }
1819 }
1820
1821 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1822
9f95a23c 1823 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
3efd9988
FG
1824 m->head.max_caps = newlim;
1825 mds->send_message_client(m, session);
a8e16298 1826 if (gather) {
f91f0fd5 1827 flush_session(session, *gather);
f64942e4 1828 }
a8e16298 1829 caps_recalled += session->notify_recall_sent(newlim);
11fdf7f2 1830 recall_throttle.hit(recall);
7c673cae
FG
1831 }
1832 }
a8e16298
TL
1833
1834 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1835
1836 return result;
7c673cae
FG
1837}
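// Illustrative sketch (not original code): the per-session recall target
// (`newlim`) computed above, with the throttle checks stripped away:
static inline uint64_t example_new_cap_limit(uint64_t num_caps,
                                             uint64_t recall_max_caps,
                                             uint64_t min_caps_per_client)
{
  if (num_caps < recall_max_caps ||
      num_caps - recall_max_caps < min_caps_per_client)
    return min_caps_per_client;      // small sessions drop straight to the floor
  return num_caps - recall_max_caps; // otherwise recall at most one batch
}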
1838
1839void Server::force_clients_readonly()
1840{
1841 dout(10) << "force_clients_readonly" << dendl;
1842 set<Session*> sessions;
1843 mds->sessionmap.get_client_session_set(sessions);
1844 for (set<Session*>::const_iterator p = sessions.begin();
1845 p != sessions.end();
1846 ++p) {
1847 Session *session = *p;
1848 if (!session->info.inst.name.is_client() ||
1849 !(session->is_open() || session->is_stale()))
1850 continue;
9f95a23c 1851 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
7c673cae
FG
1852 }
1853}
1854
1855/*******
1856 * some generic stuff for finishing off requests
1857 */
1858void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1859{
1860 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
11fdf7f2 1861 ceph_assert(!mdr->has_completed);
7c673cae
FG
1862
1863 // note trace items for eventual reply.
1864 mdr->tracei = in;
1865 if (in)
1866 mdr->pin(in);
1867
1868 mdr->tracedn = dn;
1869 if (dn)
1870 mdr->pin(dn);
1871
1872 early_reply(mdr, in, dn);
1873
1874 mdr->committing = true;
1875 submit_mdlog_entry(le, fin, mdr, __func__);
1876
1877 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1878 if (mds->queue_one_replay()) {
1879 dout(10) << " queued next replay op" << dendl;
1880 } else {
11fdf7f2 1881 dout(10) << " journaled last replay op" << dendl;
7c673cae
FG
1882 }
1883 } else if (mdr->did_early_reply)
b32b8144 1884 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
7c673cae
FG
1885 else
1886 mdlog->flush();
1887}
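// Illustrative note (not original): for a journaled client op the reply flow
// set up above is two-phase --
//   1. early_reply() sends an "unsafe" MClientReply as soon as it is safe to
//      expose the uncommitted change to the requesting client;
//   2. when the log entry commits, `fin` fires and the final "safe" reply is
//      sent via respond_to_request()/reply_client_request(), letting the
//      client drop its resend state.
// Replayed requests skip the early reply and feed queue_one_replay() instead.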
1888
1889void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
11fdf7f2 1890 std::string_view event)
7c673cae
FG
1891{
1892 if (mdr) {
1893 string event_str("submit entry: ");
1894 event_str += event;
11fdf7f2 1895 mdr->mark_event(event_str);
7c673cae
FG
1896 }
1897 mdlog->submit_entry(le, fin);
1898}
1899
1900/*
1901 * send response built from mdr contents and error code; clean up mdr
1902 */
1903void Server::respond_to_request(MDRequestRef& mdr, int r)
1904{
1905 if (mdr->client_request) {
f91f0fd5
TL
1906 if (mdr->is_batch_head()) {
1907 dout(20) << __func__ << " batch head " << *mdr << dendl;
1908 mdr->release_batch_op()->respond(r);
9f95a23c
TL
1909 } else {
1910 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
1911 }
7c673cae
FG
1912 } else if (mdr->internal_op > -1) {
1913 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1914 if (!mdr->internal_op_finish)
11fdf7f2 1915 ceph_abort_msg("trying to respond to internal op without finisher");
7c673cae
FG
1916 mdr->internal_op_finish->complete(r);
1917 mdcache->request_finish(mdr);
1918 }
1919}
1920
91327a77 1921// statistics mds req op number and latency
9f95a23c 1922void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
91327a77
AA
1923{
1924 int code = l_mdss_first;
1925 switch(req->get_op()) {
1926 case CEPH_MDS_OP_LOOKUPHASH:
1927 code = l_mdss_req_lookuphash_latency;
1928 break;
1929 case CEPH_MDS_OP_LOOKUPINO:
1930 code = l_mdss_req_lookupino_latency;
1931 break;
1932 case CEPH_MDS_OP_LOOKUPPARENT:
1933 code = l_mdss_req_lookupparent_latency;
1934 break;
1935 case CEPH_MDS_OP_LOOKUPNAME:
1936 code = l_mdss_req_lookupname_latency;
1937 break;
1938 case CEPH_MDS_OP_LOOKUP:
1939 code = l_mdss_req_lookup_latency;
1940 break;
1941 case CEPH_MDS_OP_LOOKUPSNAP:
1942 code = l_mdss_req_lookupsnap_latency;
1943 break;
1944 case CEPH_MDS_OP_GETATTR:
1945 code = l_mdss_req_getattr_latency;
1946 break;
1947 case CEPH_MDS_OP_SETATTR:
1948 code = l_mdss_req_setattr_latency;
1949 break;
1950 case CEPH_MDS_OP_SETLAYOUT:
1951 code = l_mdss_req_setlayout_latency;
1952 break;
1953 case CEPH_MDS_OP_SETDIRLAYOUT:
1954 code = l_mdss_req_setdirlayout_latency;
1955 break;
1956 case CEPH_MDS_OP_SETXATTR:
1957 code = l_mdss_req_setxattr_latency;
1958 break;
1959 case CEPH_MDS_OP_RMXATTR:
1960 code = l_mdss_req_rmxattr_latency;
1961 break;
1962 case CEPH_MDS_OP_READDIR:
1963 code = l_mdss_req_readdir_latency;
1964 break;
1965 case CEPH_MDS_OP_SETFILELOCK:
1966 code = l_mdss_req_setfilelock_latency;
1967 break;
1968 case CEPH_MDS_OP_GETFILELOCK:
1969 code = l_mdss_req_getfilelock_latency;
1970 break;
1971 case CEPH_MDS_OP_CREATE:
1972 code = l_mdss_req_create_latency;
1973 break;
1974 case CEPH_MDS_OP_OPEN:
1975 code = l_mdss_req_open_latency;
1976 break;
1977 case CEPH_MDS_OP_MKNOD:
1978 code = l_mdss_req_mknod_latency;
1979 break;
1980 case CEPH_MDS_OP_LINK:
1981 code = l_mdss_req_link_latency;
1982 break;
1983 case CEPH_MDS_OP_UNLINK:
1984 code = l_mdss_req_unlink_latency;
1985 break;
1986 case CEPH_MDS_OP_RMDIR:
1987 code = l_mdss_req_rmdir_latency;
1988 break;
1989 case CEPH_MDS_OP_RENAME:
1990 code = l_mdss_req_rename_latency;
1991 break;
1992 case CEPH_MDS_OP_MKDIR:
1993 code = l_mdss_req_mkdir_latency;
1994 break;
1995 case CEPH_MDS_OP_SYMLINK:
1996 code = l_mdss_req_symlink_latency;
1997 break;
1998 case CEPH_MDS_OP_LSSNAP:
1999 code = l_mdss_req_lssnap_latency;
2000 break;
2001 case CEPH_MDS_OP_MKSNAP:
2002 code = l_mdss_req_mksnap_latency;
2003 break;
2004 case CEPH_MDS_OP_RMSNAP:
2005 code = l_mdss_req_rmsnap_latency;
2006 break;
2007 case CEPH_MDS_OP_RENAMESNAP:
2008 code = l_mdss_req_renamesnap_latency;
2009 break;
2010 default: ceph_abort();
2011 }
2012 logger->tinc(code, lat);
2013}
2014
7c673cae
FG
2015void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
2016{
11fdf7f2 2017 if (!g_conf()->mds_early_reply)
7c673cae
FG
2018 return;
2019
b32b8144
FG
2020 if (mdr->no_early_reply) {
2021 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
2022 return;
2023 }
2024
7c673cae
FG
2025 if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
2026 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
2027 return;
2028 }
2029
2030 if (mdr->alloc_ino) {
2031 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2032 return;
2033 }
2034
9f95a23c 2035 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
2036 entity_inst_t client_inst = req->get_source_inst();
2037 if (client_inst.name.is_mds())
2038 return;
2039
2040 if (req->is_replay()) {
2041 dout(10) << " no early reply on replay op" << dendl;
2042 return;
2043 }
2044
2045
9f95a23c 2046 auto reply = make_message<MClientReply>(*req, 0);
7c673cae
FG
2047 reply->set_unsafe();
2048
2049 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2050 //
2051 // _rename_finish() does not send dentry link/unlink messages to replicas,
2052 // so do not mark xlocks on dentries "done"; the xlocks prevent dentries
2053 // that have projected linkages from getting new replicas.
2054 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2055
2056 dout(10) << "early_reply " << reply->get_result()
2057 << " (" << cpp_strerror(reply->get_result())
2058 << ") " << *req << dendl;
2059
2060 if (tracei || tracedn) {
2061 if (tracei)
2062 mdr->cap_releases.erase(tracei->vino());
2063 if (tracedn)
2064 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2065
9f95a23c 2066 set_trace_dist(reply, tracei, tracedn, mdr);
7c673cae
FG
2067 }
2068
2069 reply->set_extra_bl(mdr->reply_extra_bl);
11fdf7f2 2070 mds->send_message_client(reply, mdr->session);
7c673cae
FG
2071
2072 mdr->did_early_reply = true;
2073
2074 mds->logger->inc(l_mds_reply);
2075 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2076 mds->logger->tinc(l_mds_reply_latency, lat);
91327a77
AA
2077 if (client_inst.name.is_client()) {
2078 mds->sessionmap.hit_session(mdr->session);
2079 }
2080 perf_gather_op_latency(req, lat);
7c673cae
FG
2081 dout(20) << "lat " << lat << dendl;
2082
2083 mdr->mark_event("early_replied");
2084}
2085
2086/*
2087 * send given reply
2088 * include a trace to tracei
2089 * Clean up mdr
2090 */
9f95a23c 2091void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
7c673cae 2092{
11fdf7f2 2093 ceph_assert(mdr.get());
9f95a23c 2094 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
2095
2096 dout(7) << "reply_client_request " << reply->get_result()
2097 << " (" << cpp_strerror(reply->get_result())
2098 << ") " << *req << dendl;
2099
2100 mdr->mark_event("replying");
2101
2102 Session *session = mdr->session;
2103
2104 // note successful request in session map?
2105 //
2106 // setfilelock requests are special: they only modify state in MDS memory,
2107 // and that state is lost when the MDS fails. If a client re-sends a completed
2108 // setfilelock request, it means the client did not receive the corresponding
2109 // setfilelock reply, so the MDS should re-execute the request.
2110 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2111 reply->get_result() == 0 && session) {
2112 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2113 session->add_completed_request(mdr->reqid.tid, created);
2114 if (mdr->ls) {
2115 mdr->ls->touched_sessions.insert(session->info.inst.name);
2116 }
2117 }
2118
2119 // give any preallocated inos to the session
2120 apply_allocated_inos(mdr, session);
2121
2122 // get tracei/tracedn from mdr?
7c673cae
FG
2123 CInode *tracei = mdr->tracei;
2124 CDentry *tracedn = mdr->tracedn;
2125
2126 bool is_replay = mdr->client_request->is_replay();
2127 bool did_early_reply = mdr->did_early_reply;
2128 entity_inst_t client_inst = req->get_source_inst();
7c673cae
FG
2129
2130 if (!did_early_reply && !is_replay) {
2131
2132 mds->logger->inc(l_mds_reply);
2133 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2134 mds->logger->tinc(l_mds_reply_latency, lat);
81eedcae 2135 if (session && client_inst.name.is_client()) {
91327a77
AA
2136 mds->sessionmap.hit_session(session);
2137 }
2138 perf_gather_op_latency(req, lat);
7c673cae
FG
2139 dout(20) << "lat " << lat << dendl;
2140
2141 if (tracei)
2142 mdr->cap_releases.erase(tracei->vino());
2143 if (tracedn)
2144 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2145 }
2146
2147 // drop non-rdlocks before replying, so that we can issue leases
2148 mdcache->request_drop_non_rdlocks(mdr);
2149
2150 // reply at all?
81eedcae 2151 if (session && !client_inst.name.is_mds()) {
7c673cae
FG
2152 // send reply.
2153 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2154 (tracei || tracedn)) {
2155 if (is_replay) {
2156 if (tracei)
2157 mdcache->try_reconnect_cap(tracei, session);
2158 } else {
2159 // include metadata in reply
9f95a23c 2160 set_trace_dist(reply, tracei, tracedn, mdr);
7c673cae
FG
2161 }
2162 }
2163
2164 // We can set the extra bl unconditionally: if it's already been sent in the
2165 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2166 reply->set_extra_bl(mdr->reply_extra_bl);
2167
2168 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
11fdf7f2 2169 mds->send_message_client(reply, session);
7c673cae
FG
2170 }
2171
2172 if (req->is_queued_for_replay() &&
2173 (mdr->has_completed || reply->get_result() < 0)) {
2174 if (reply->get_result() < 0) {
2175 int r = reply->get_result();
2176 derr << "reply_client_request: failed to replay " << *req
2177 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2178 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2179 }
2180 mds->queue_one_replay();
2181 }
2182
2183 // clean up request
2184 mdcache->request_finish(mdr);
2185
2186 // take a closer look at tracei, if it happens to be a remote link
2187 if (tracei &&
2188 tracedn &&
2189 tracedn->get_projected_linkage()->is_remote()) {
2190 mdcache->eval_remote(tracedn);
2191 }
2192}
2193
7c673cae
FG
2194/*
2195 * pass inode OR dentry (not both, or we may get confused)
2196 *
2197 * trace is in reverse order (i.e. root inode comes last)
2198 */
9f95a23c 2199void Server::set_trace_dist(const ref_t<MClientReply> &reply,
7c673cae 2200 CInode *in, CDentry *dn,
7c673cae
FG
2201 MDRequestRef& mdr)
2202{
2203 // skip doing this for debugging purposes?
11fdf7f2 2204 if (g_conf()->mds_inject_traceless_reply_probability &&
7c673cae 2205 mdr->ls && !mdr->o_trunc &&
11fdf7f2 2206 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
7c673cae
FG
2207 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2208 return;
2209 }
2210
2211 // inode, dentry, dir, ..., inode
2212 bufferlist bl;
2213 mds_rank_t whoami = mds->get_nodeid();
9f95a23c
TL
2214 Session *session = mdr->session;
2215 snapid_t snapid = mdr->snapid;
7c673cae
FG
2216 utime_t now = ceph_clock_now();
2217
2218 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2219
7c673cae
FG
2220 // realm
2221 if (snapid == CEPH_NOSNAP) {
2222 SnapRealm *realm;
2223 if (in)
2224 realm = in->find_snaprealm();
2225 else
2226 realm = dn->get_dir()->get_inode()->find_snaprealm();
2227 reply->snapbl = realm->get_snap_trace();
2228 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2229 }
2230
2231 // dir + dentry?
2232 if (dn) {
2233 reply->head.is_dentry = 1;
2234 CDir *dir = dn->get_dir();
2235 CInode *diri = dir->get_inode();
2236
2237 diri->encode_inodestat(bl, session, NULL, snapid);
2238 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2239
2240#ifdef MDS_VERIFY_FRAGSTAT
2241 if (dir->is_complete())
2242 dir->verify_fragstat();
2243#endif
11fdf7f2
TL
2244 DirStat ds;
2245 ds.frag = dir->get_frag();
2246 ds.auth = dir->get_dir_auth().first;
f91f0fd5 2247 if (dir->is_auth() && !forward_all_requests_to_auth)
11fdf7f2
TL
2248 dir->get_dist_spec(ds.dist, whoami);
2249
2250 dir->encode_dirstat(bl, session->info, ds);
7c673cae
FG
2251 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2252
11fdf7f2 2253 encode(dn->get_name(), bl);
9f95a23c
TL
2254
2255 int lease_mask = 0;
2256 CDentry::linkage_t *dnl = dn->get_linkage(mdr->get_client(), mdr);
2257 if (dnl->is_primary()) {
2258 ceph_assert(dnl->get_inode() == in);
2259 lease_mask = CEPH_LEASE_PRIMARY_LINK;
2260 } else {
2261 if (dnl->is_remote())
2262 ceph_assert(dnl->get_remote_ino() == in->ino());
2263 else
2264 ceph_assert(!in);
11fdf7f2 2265 }
9f95a23c 2266 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, bl);
7c673cae
FG
2267 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
2268 } else
2269 reply->head.is_dentry = 0;
2270
2271 // inode
2272 if (in) {
2273 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2274 dout(20) << "set_trace_dist added in " << *in << dendl;
2275 reply->head.is_target = 1;
2276 } else
2277 reply->head.is_target = 0;
2278
2279 reply->set_trace(bl);
2280}
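// Illustrative note (not original): set_trace_dist() fills two buffers -- the
// snap realm trace goes into reply->snapbl (CEPH_NOSNAP only), while the main
// trace bl is, in order,
//   [diri inodestat][dirstat][dentry name][dentry lease]  (if dn was passed)
//   [target inodestat]                                    (if in was passed)
// so the client decodes parent-directory state first and the target last.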
2281
9f95a23c 2282void Server::handle_client_request(const cref_t<MClientRequest> &req)
7c673cae
FG
2283{
2284 dout(4) << "handle_client_request " << *req << dendl;
2285
2286 if (mds->logger)
2287 mds->logger->inc(l_mds_request);
2288 if (logger)
2289 logger->inc(l_mdss_handle_client_request);
2290
2291 if (!mdcache->is_open()) {
2292 dout(5) << "waiting for root" << dendl;
2293 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2294 return;
2295 }
2296
92f5a8d4 2297 bool sessionclosed_isok = replay_unsafe_with_closed_session;
7c673cae
FG
2298 // active session?
2299 Session *session = 0;
2300 if (req->get_source().is_client()) {
94b18763 2301 session = mds->get_session(req);
7c673cae
FG
2302 if (!session) {
2303 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
92f5a8d4 2304 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
7c673cae
FG
2305 session->is_closing() ||
2306 session->is_killing()) {
2307 dout(5) << "session closed|closing|killing, dropping" << dendl;
2308 session = NULL;
2309 }
2310 if (!session) {
2311 if (req->is_queued_for_replay())
2312 mds->queue_one_replay();
7c673cae
FG
2313 return;
2314 }
2315 }
2316
2317 // old mdsmap?
2318 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2319 // send it? hrm, this isn't ideal; they may get a lot of copies if
2320 // they have a high request rate.
2321 }
2322
2323 // completed request?
2324 bool has_completed = false;
2325 if (req->is_replay() || req->get_retry_attempt()) {
11fdf7f2 2326 ceph_assert(session);
7c673cae
FG
2327 inodeno_t created;
2328 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2329 has_completed = true;
92f5a8d4
TL
2330 if (!session->is_open())
2331 return;
7c673cae
FG
2332 // Don't send a traceless reply if the completed request created a
2333 // new inode. Treat the request as a lookup request instead.
2334 if (req->is_replay() ||
2335 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2336 req->get_op() != CEPH_MDS_OP_OPEN &&
2337 req->get_op() != CEPH_MDS_OP_CREATE)) {
2338 dout(5) << "already completed " << req->get_reqid() << dendl;
9f95a23c 2339 auto reply = make_message<MClientReply>(*req, 0);
7c673cae
FG
2340 if (created != inodeno_t()) {
2341 bufferlist extra;
11fdf7f2 2342 encode(created, extra);
7c673cae
FG
2343 reply->set_extra_bl(extra);
2344 }
11fdf7f2 2345 mds->send_message_client(reply, session);
7c673cae
FG
2346
2347 if (req->is_queued_for_replay())
2348 mds->queue_one_replay();
2349
7c673cae
FG
2350 return;
2351 }
2352 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2353 req->get_op() != CEPH_MDS_OP_CREATE) {
2354 dout(10) << " completed request which created new inode " << created
2355 << ", convert it to lookup request" << dendl;
2356 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2357 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2358 }
2359 }
2360 }
2361
2362 // trim completed_request list
2363 if (req->get_oldest_client_tid() > 0) {
2364 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
11fdf7f2 2365 ceph_assert(session);
7c673cae
FG
2366 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2367 // Session's 'completed_requests' was dirtied; mark it to be
2368 // potentially flushed at segment expiry.
2369 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2370
2371 if (session->get_num_trim_requests_warnings() > 0 &&
11fdf7f2 2372 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
7c673cae
FG
2373 session->reset_num_trim_requests_warnings();
2374 } else {
2375 if (session->get_num_completed_requests() >=
11fdf7f2 2376 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
7c673cae
FG
2377 session->inc_num_trim_requests_warnings();
2378 stringstream ss;
2379 ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2380 << req->get_oldest_client_tid() << "), "
2381 << session->get_num_completed_requests()
2382 << " completed requests recorded in session\n";
2383 mds->clog->warn() << ss.str();
2384 dout(20) << __func__ << " " << ss.str() << dendl;
2385 }
2386 }
2387 }
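// Worked example (illustrative): the warning threshold above left-shifts by
// the number of warnings already issued, so with, e.g.,
// mds_max_completed_requests = 100000, a client that never advances its
// oldest_client_tid is warned at 100000, then 200000, then 400000, ...
// untrimmed completed requests, i.e. the log nags progressively less often.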
2388
2389 // register + dispatch
2390 MDRequestRef mdr = mdcache->request_start(req);
2391 if (!mdr.get())
2392 return;
2393
2394 if (session) {
2395 mdr->session = session;
2396 session->requests.push_back(&mdr->item_session_request);
2397 }
2398
2399 if (has_completed)
2400 mdr->has_completed = true;
2401
2402 // process embedded cap releases?
2403 // (only if NOT replay!)
2404 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2405 client_t client = req->get_source().num();
11fdf7f2
TL
2406 for (const auto &r : req->releases) {
2407 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2408 }
7c673cae
FG
2409 req->releases.clear();
2410 }
2411
2412 dispatch_client_request(mdr);
2413 return;
2414}
2415
2416void Server::handle_osd_map()
2417{
2418 /* Note that we check the OSDMAP_FULL flag directly rather than
2419 * using osdmap_full_flag(), because we want to know "is the flag set"
2420 * rather than "does the flag apply to us?" */
2421 mds->objecter->with_osdmap([this](const OSDMap& o) {
b32b8144
FG
2422 auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool());
2423 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
7c673cae
FG
2424 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2425 << o.get_epoch() << dendl;
2426 });
2427}
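// Illustrative note (not original): is_full latches the pool's raw FULL flag
// here so that dispatch_client_request() can pre-emptively fail
// space-consuming ops (its ENOSPC branch) even in configurations where the
// flag would not otherwise apply to this MDS's own writes.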
2428
2429void Server::dispatch_client_request(MDRequestRef& mdr)
2430{
2431 // we shouldn't be waiting on anyone.
11fdf7f2 2432 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
7c673cae
FG
2433
2434 if (mdr->killed) {
2435 dout(10) << "request " << *mdr << " was killed" << dendl;
9f95a23c
TL
2436 // if the mdr is a "batch_op" and it has followers, pick a follower as
2437 // the new "head of the batch ops" and go on processing the new one.
f91f0fd5
TL
2438 if (mdr->is_batch_head()) {
2439 int mask = mdr->client_request->head.args.getattr.mask;
2440 auto it = mdr->batch_op_map->find(mask);
2441 auto new_batch_head = it->second->find_new_head();
2442 if (!new_batch_head) {
2443 mdr->batch_op_map->erase(it);
9f95a23c
TL
2444 return;
2445 }
f91f0fd5 2446 mdr = std::move(new_batch_head);
9f95a23c
TL
2447 } else {
2448 return;
2449 }
94b18763
FG
2450 } else if (mdr->aborted) {
2451 mdr->aborted = false;
2452 mdcache->request_kill(mdr);
2453 return;
7c673cae
FG
2454 }
2455
9f95a23c 2456 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
2457
2458 if (logger) logger->inc(l_mdss_dispatch_client_request);
2459
2460 dout(7) << "dispatch_client_request " << *req << dendl;
2461
9f95a23c
TL
2462 if (req->may_write() && mdcache->is_readonly()) {
2463 dout(10) << " read-only FS" << dendl;
2464 respond_to_request(mdr, -EROFS);
2465 return;
2466 }
2467 if (mdr->has_more() && mdr->more()->slave_error) {
2468 dout(10) << " got error from slaves" << dendl;
2469 respond_to_request(mdr, mdr->more()->slave_error);
2470 return;
7c673cae
FG
2471 }
2472
2473 if (is_full) {
2474 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2475 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2476 req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2477 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2478 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2479 req->get_op() == CEPH_MDS_OP_CREATE ||
2480 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2481 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2482 ((req->get_op() == CEPH_MDS_OP_LINK ||
2483 req->get_op() == CEPH_MDS_OP_RENAME) &&
2484 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
2485 ) {
2486
2487 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2488 respond_to_request(mdr, -ENOSPC);
2489 return;
2490 } else {
2491 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2492 }
2493 }
2494
2495 switch (req->get_op()) {
2496 case CEPH_MDS_OP_LOOKUPHASH:
2497 case CEPH_MDS_OP_LOOKUPINO:
2498 handle_client_lookup_ino(mdr, false, false);
2499 break;
2500 case CEPH_MDS_OP_LOOKUPPARENT:
2501 handle_client_lookup_ino(mdr, true, false);
2502 break;
2503 case CEPH_MDS_OP_LOOKUPNAME:
2504 handle_client_lookup_ino(mdr, false, true);
2505 break;
2506
2507 // inodes ops.
2508 case CEPH_MDS_OP_LOOKUP:
2509 handle_client_getattr(mdr, true);
2510 break;
2511
2512 case CEPH_MDS_OP_LOOKUPSNAP:
2513 // lookupsnap does not reference a CDentry; treat it as a getattr
2514 case CEPH_MDS_OP_GETATTR:
2515 handle_client_getattr(mdr, false);
2516 break;
2517
2518 case CEPH_MDS_OP_SETATTR:
2519 handle_client_setattr(mdr);
2520 break;
2521 case CEPH_MDS_OP_SETLAYOUT:
2522 handle_client_setlayout(mdr);
2523 break;
2524 case CEPH_MDS_OP_SETDIRLAYOUT:
2525 handle_client_setdirlayout(mdr);
2526 break;
2527 case CEPH_MDS_OP_SETXATTR:
2528 handle_client_setxattr(mdr);
2529 break;
2530 case CEPH_MDS_OP_RMXATTR:
2531 handle_client_removexattr(mdr);
2532 break;
2533
2534 case CEPH_MDS_OP_READDIR:
2535 handle_client_readdir(mdr);
2536 break;
2537
2538 case CEPH_MDS_OP_SETFILELOCK:
2539 handle_client_file_setlock(mdr);
2540 break;
2541
2542 case CEPH_MDS_OP_GETFILELOCK:
2543 handle_client_file_readlock(mdr);
2544 break;
2545
2546 // funky.
2547 case CEPH_MDS_OP_CREATE:
2548 if (mdr->has_completed)
2549 handle_client_open(mdr); // already created.. just open
2550 else
2551 handle_client_openc(mdr);
2552 break;
2553
2554 case CEPH_MDS_OP_OPEN:
2555 handle_client_open(mdr);
2556 break;
2557
2558 // namespace.
2559 // no prior locks.
2560 case CEPH_MDS_OP_MKNOD:
2561 handle_client_mknod(mdr);
2562 break;
2563 case CEPH_MDS_OP_LINK:
2564 handle_client_link(mdr);
2565 break;
2566 case CEPH_MDS_OP_UNLINK:
2567 case CEPH_MDS_OP_RMDIR:
2568 handle_client_unlink(mdr);
2569 break;
2570 case CEPH_MDS_OP_RENAME:
2571 handle_client_rename(mdr);
2572 break;
2573 case CEPH_MDS_OP_MKDIR:
2574 handle_client_mkdir(mdr);
2575 break;
2576 case CEPH_MDS_OP_SYMLINK:
2577 handle_client_symlink(mdr);
2578 break;
2579
2580
2581 // snaps
2582 case CEPH_MDS_OP_LSSNAP:
2583 handle_client_lssnap(mdr);
2584 break;
2585 case CEPH_MDS_OP_MKSNAP:
2586 handle_client_mksnap(mdr);
2587 break;
2588 case CEPH_MDS_OP_RMSNAP:
2589 handle_client_rmsnap(mdr);
2590 break;
2591 case CEPH_MDS_OP_RENAMESNAP:
2592 handle_client_renamesnap(mdr);
2593 break;
2594
2595 default:
2596 dout(1) << " unknown client op " << req->get_op() << dendl;
2597 respond_to_request(mdr, -EOPNOTSUPP);
2598 }
2599}
2600
2601
2602// ---------------------------------------
2603// SLAVE REQUESTS
2604
9f95a23c 2605void Server::handle_slave_request(const cref_t<MMDSSlaveRequest> &m)
7c673cae
FG
2606{
2607 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2608 mds_rank_t from = mds_rank_t(m->get_source().num());
2609
2610 if (logger) logger->inc(l_mdss_handle_slave_request);
2611
2612 // reply?
2613 if (m->is_reply())
2614 return handle_slave_request_reply(m);
2615
2616 // the purpose of rename notify is to enforce causal message ordering, making
2617 // sure bystanders have received all messages from the rename srcdn's auth MDS.
2618 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
9f95a23c 2619 auto reply = make_message<MMDSSlaveRequest>(m->get_reqid(), m->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK);
7c673cae 2620 mds->send_message(reply, m->get_connection());
7c673cae
FG
2621 return;
2622 }
2623
2624 CDentry *straydn = NULL;
11fdf7f2 2625 if (m->straybl.length() > 0) {
9f95a23c 2626 mdcache->decode_replica_stray(straydn, m->straybl, from);
11fdf7f2
TL
2627 ceph_assert(straydn);
2628 m->straybl.clear();
7c673cae
FG
2629 }
2630
9f95a23c
TL
2631 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2632 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2633 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2634 return;
2635 }
2636
7c673cae
FG
2637 // am i a new slave?
2638 MDRequestRef mdr;
2639 if (mdcache->have_request(m->get_reqid())) {
2640 // existing?
2641 mdr = mdcache->request_get(m->get_reqid());
2642
2643 // is my request newer?
2644 if (mdr->attempt > m->get_attempt()) {
2645 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2646 << ", dropping " << *m << dendl;
7c673cae
FG
2647 return;
2648 }
2649
7c673cae
FG
2650 if (mdr->attempt < m->get_attempt()) {
2651 // mine is old, close it out
2652 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2653 << ", closing out" << dendl;
2654 mdcache->request_finish(mdr);
2655 mdr.reset();
2656 } else if (mdr->slave_to_mds != from) {
2657 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
7c673cae
FG
2658 return;
2659 }
2660
9f95a23c
TL
2661 // may get these while mdr->slave_request is non-null
2662 if (m->get_op() == MMDSSlaveRequest::OP_DROPLOCKS) {
2663 mds->locker->drop_locks(mdr.get());
2664 return;
2665 }
2666 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
2667 if (m->is_abort()) {
2668 mdr->aborted = true;
2669 if (mdr->slave_request) {
2670 // only abort on-going xlock, wrlock and auth pin
2671 ceph_assert(!mdr->slave_did_prepare());
2672 } else {
2673 mdcache->request_finish(mdr);
2674 }
7c673cae 2675 } else {
9f95a23c
TL
2676 if (m->inode_export.length() > 0)
2677 mdr->more()->inode_import = m->inode_export;
2678 // finish off request.
7c673cae
FG
2679 mdcache->request_finish(mdr);
2680 }
2681 return;
2682 }
2683 }
2684 if (!mdr.get()) {
2685 // new?
2686 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
2687 dout(10) << "missing slave request for " << m->get_reqid()
2688 << " OP_FINISH, must have lost race with a forward" << dendl;
7c673cae
FG
2689 return;
2690 }
2691 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
2692 mdr->set_op_stamp(m->op_stamp);
2693 }
11fdf7f2 2694 ceph_assert(mdr->slave_request == 0); // only one at a time, please!
7c673cae
FG
2695
2696 if (straydn) {
2697 mdr->pin(straydn);
2698 mdr->straydn = straydn;
2699 }
2700
9f95a23c
TL
2701 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2702 mdr->locks.empty()) {
7c673cae
FG
2703 dout(3) << "not active yet, waiting" << dendl;
2704 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2705 return;
2706 }
2707
91327a77 2708 mdr->reset_slave_request(m);
7c673cae
FG
2709
2710 dispatch_slave_request(mdr);
2711}
2712
9f95a23c 2713void Server::handle_slave_request_reply(const cref_t<MMDSSlaveRequest> &m)
7c673cae
FG
2714{
2715 mds_rank_t from = mds_rank_t(m->get_source().num());
2716
2717 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2718 metareqid_t r = m->get_reqid();
2719 if (!mdcache->have_uncommitted_master(r, from)) {
2720 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2721 << from << " reqid " << r << dendl;
7c673cae
FG
2722 return;
2723 }
2724 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2725 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2726 return;
2727 }
2728
2729 if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
2730 metareqid_t r = m->get_reqid();
2731 mdcache->committed_master_slave(r, from);
7c673cae
FG
2732 return;
2733 }
2734
2735 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2736 if (m->get_attempt() != mdr->attempt) {
2737 dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
2738 << m->get_attempt() << dendl;
7c673cae
FG
2739 return;
2740 }
2741
2742 switch (m->get_op()) {
2743 case MMDSSlaveRequest::OP_XLOCKACK:
2744 {
2745 // identify lock, master request
2746 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2747 m->get_object_info());
2748 mdr->more()->slaves.insert(from);
11fdf7f2 2749 lock->decode_locked_state(m->get_lock_data());
7c673cae 2750 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
9f95a23c 2751 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
7c673cae
FG
2752 mdr->finish_locking(lock);
2753 lock->get_xlock(mdr, mdr->get_client());
2754
11fdf7f2 2755 ceph_assert(mdr->more()->waiting_on_slave.count(from));
7c673cae 2756 mdr->more()->waiting_on_slave.erase(from);
11fdf7f2 2757 ceph_assert(mdr->more()->waiting_on_slave.empty());
7c673cae
FG
2758 mdcache->dispatch_request(mdr);
2759 }
2760 break;
2761
2762 case MMDSSlaveRequest::OP_WRLOCKACK:
2763 {
2764 // identify lock, master request
2765 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2766 m->get_object_info());
2767 mdr->more()->slaves.insert(from);
2768 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
9f95a23c 2769 auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
11fdf7f2
TL
2770 ceph_assert(it->is_remote_wrlock());
2771 ceph_assert(it->wrlock_target == from);
2772
7c673cae
FG
2773 mdr->finish_locking(lock);
2774
11fdf7f2 2775 ceph_assert(mdr->more()->waiting_on_slave.count(from));
7c673cae 2776 mdr->more()->waiting_on_slave.erase(from);
11fdf7f2 2777 ceph_assert(mdr->more()->waiting_on_slave.empty());
7c673cae
FG
2778 mdcache->dispatch_request(mdr);
2779 }
2780 break;
2781
2782 case MMDSSlaveRequest::OP_AUTHPINACK:
2783 handle_slave_auth_pin_ack(mdr, m);
2784 break;
2785
2786 case MMDSSlaveRequest::OP_LINKPREPACK:
2787 handle_slave_link_prep_ack(mdr, m);
2788 break;
2789
2790 case MMDSSlaveRequest::OP_RMDIRPREPACK:
2791 handle_slave_rmdir_prep_ack(mdr, m);
2792 break;
2793
2794 case MMDSSlaveRequest::OP_RENAMEPREPACK:
2795 handle_slave_rename_prep_ack(mdr, m);
2796 break;
2797
2798 case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
2799 handle_slave_rename_notify_ack(mdr, m);
2800 break;
2801
2802 default:
2803 ceph_abort();
2804 }
7c673cae
FG
2805}
2806
7c673cae
FG
2807void Server::dispatch_slave_request(MDRequestRef& mdr)
2808{
2809 dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
2810
2811 if (mdr->aborted) {
2812 dout(7) << " abort flag set, finishing" << dendl;
2813 mdcache->request_finish(mdr);
2814 return;
2815 }
2816
2817 if (logger) logger->inc(l_mdss_dispatch_slave_request);
2818
2819 int op = mdr->slave_request->get_op();
2820 switch (op) {
2821 case MMDSSlaveRequest::OP_XLOCK:
2822 case MMDSSlaveRequest::OP_WRLOCK:
2823 {
2824 // identify object
2825 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2826 mdr->slave_request->get_object_info());
2827
2828 if (!lock) {
2829 dout(10) << "don't have object, dropping" << dendl;
2830 ceph_abort(); // can this happen if we auth pinned properly?
2831 }
2832 if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2833 dout(10) << "not auth for remote xlock attempt, dropping on "
2834 << *lock << " on " << *lock->get_parent() << dendl;
2835 } else {
2836 // use acquire_locks so that we get auth_pinning.
11fdf7f2
TL
2837 MutationImpl::LockOpVec lov;
2838 for (const auto& p : mdr->locks) {
2839 if (p.is_xlock())
2840 lov.add_xlock(p.lock);
2841 else if (p.is_wrlock())
2842 lov.add_wrlock(p.lock);
2843 }
7c673cae
FG
2844
2845 int replycode = 0;
2846 switch (op) {
2847 case MMDSSlaveRequest::OP_XLOCK:
11fdf7f2 2848 lov.add_xlock(lock);
7c673cae
FG
2849 replycode = MMDSSlaveRequest::OP_XLOCKACK;
2850 break;
2851 case MMDSSlaveRequest::OP_WRLOCK:
11fdf7f2 2852 lov.add_wrlock(lock);
7c673cae
FG
2853 replycode = MMDSSlaveRequest::OP_WRLOCKACK;
2854 break;
2855 }
2856
11fdf7f2 2857 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
2858 return;
2859
2860 // ack
9f95a23c 2861 auto r = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, replycode);
7c673cae
FG
2862 r->set_lock_type(lock->get_type());
2863 lock->get_parent()->set_object_info(r->get_object_info());
11fdf7f2
TL
2864 if (replycode == MMDSSlaveRequest::OP_XLOCKACK)
2865 lock->encode_locked_state(r->get_lock_data());
7c673cae
FG
2866 mds->send_message(r, mdr->slave_request->get_connection());
2867 }
2868
2869 // done.
91327a77 2870 mdr->reset_slave_request();
7c673cae
FG
2871 }
2872 break;
2873
2874 case MMDSSlaveRequest::OP_UNXLOCK:
2875 case MMDSSlaveRequest::OP_UNWRLOCK:
2876 {
2877 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2878 mdr->slave_request->get_object_info());
11fdf7f2
TL
2879 ceph_assert(lock);
2880 auto it = mdr->locks.find(lock);
2881 ceph_assert(it != mdr->locks.end());
7c673cae
FG
2882 bool need_issue = false;
2883 switch (op) {
2884 case MMDSSlaveRequest::OP_UNXLOCK:
11fdf7f2 2885 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
7c673cae
FG
2886 break;
2887 case MMDSSlaveRequest::OP_UNWRLOCK:
11fdf7f2 2888 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
7c673cae
FG
2889 break;
2890 }
2891 if (need_issue)
2892 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2893
2894 // done. no ack necessary.
91327a77 2895 mdr->reset_slave_request();
7c673cae
FG
2896 }
2897 break;
2898
7c673cae
FG
2899 case MMDSSlaveRequest::OP_AUTHPIN:
2900 handle_slave_auth_pin(mdr);
2901 break;
2902
2903 case MMDSSlaveRequest::OP_LINKPREP:
2904 case MMDSSlaveRequest::OP_UNLINKPREP:
2905 handle_slave_link_prep(mdr);
2906 break;
2907
2908 case MMDSSlaveRequest::OP_RMDIRPREP:
2909 handle_slave_rmdir_prep(mdr);
2910 break;
2911
2912 case MMDSSlaveRequest::OP_RENAMEPREP:
2913 handle_slave_rename_prep(mdr);
2914 break;
2915
7c673cae
FG
2916 default:
2917 ceph_abort();
2918 }
2919}
2920
7c673cae
FG
2921void Server::handle_slave_auth_pin(MDRequestRef& mdr)
2922{
2923 dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
2924
2925 // build list of objects
2926 list<MDSCacheObject*> objects;
2927 CInode *auth_pin_freeze = NULL;
9f95a23c 2928 bool nonblocking = mdr->slave_request->is_nonblocking();
7c673cae 2929 bool fail = false, wouldblock = false, readonly = false;
9f95a23c 2930 ref_t<MMDSSlaveRequest> reply;
7c673cae
FG
2931
2932 if (mdcache->is_readonly()) {
2933 dout(10) << " read-only FS" << dendl;
2934 readonly = true;
2935 fail = true;
2936 }
2937
2938 if (!fail) {
11fdf7f2
TL
2939 for (const auto &oi : mdr->slave_request->get_authpins()) {
2940 MDSCacheObject *object = mdcache->get_object(oi);
7c673cae 2941 if (!object) {
11fdf7f2 2942 dout(10) << " don't have " << oi << dendl;
7c673cae
FG
2943 fail = true;
2944 break;
2945 }
2946
2947 objects.push_back(object);
11fdf7f2 2948 if (oi == mdr->slave_request->get_authpin_freeze())
7c673cae
FG
2949 auth_pin_freeze = static_cast<CInode*>(object);
2950 }
2951 }
2952
2953 // can we auth pin them?
2954 if (!fail) {
9f95a23c
TL
2955 for (const auto& obj : objects) {
2956 if (!obj->is_auth()) {
2957 dout(10) << " not auth for " << *obj << dendl;
7c673cae
FG
2958 fail = true;
2959 break;
2960 }
9f95a23c 2961 if (mdr->is_auth_pinned(obj))
7c673cae 2962 continue;
9f95a23c
TL
2963 if (!mdr->can_auth_pin(obj)) {
2964 if (nonblocking) {
2965 dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
7c673cae
FG
2966 fail = true;
2967 wouldblock = true;
2968 break;
2969 }
2970 // wait
9f95a23c
TL
2971 dout(10) << " waiting for authpinnable on " << *obj << dendl;
2972 obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
7c673cae
FG
2973 mdr->drop_local_auth_pins();
2974
9f95a23c
TL
2975 mds->locker->notify_freeze_waiter(obj);
2976 goto blocked;
7c673cae
FG
2977 }
2978 }
2979 }
2980
9f95a23c 2981 if (!fail) {
7c673cae
FG
2982 /* freeze authpin wrong inode */
2983 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
2984 mdr->more()->rename_inode != auth_pin_freeze)
2985 mdr->unfreeze_auth_pin(true);
2986
2987 /* handle_slave_rename_prep() calls freeze_inode() to wait for all other operations
2988 * on the source inode to complete. This happens after all locks for the rename
2989 * operation are acquired. But to acquire locks, we must first auth pin the locks'
2990 * parent objects. So there is an ABBA deadlock if someone auth pins the source inode
2991 * after locks are acquired and before Server::handle_slave_rename_prep() is called.
2992 * The solution is to freeze the inode and prevent other MDRequests from getting new
2993 * auth pins.
2994 */
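 /* Illustrative timeline of the ABBA interleaving this freeze prevents
  * (hypothetical requests R1/R2, not part of the build):
  *
  *   R1 (rename master): acquire_locks() -> auth pins lock parents
  *   R2 (other op):      auth_pin(srci)        <- new pin slips in
  *   R1: handle_slave_rename_prep() -> freeze_inode(srci) waits on R2
  *   R2: blocks on a lock R1 already holds     <- deadlock
  *
  * Freezing srci while handling OP_AUTHPIN makes R2's can_auth_pin()
  * fail, so R2 waits and retries instead of taking a pin.
  */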
2995 if (auth_pin_freeze) {
2996 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
2997 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
2998 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
2999 mds->mdlog->flush();
3000 goto blocked;
3001 }
3002 }
3003 }
3004
3005 reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
3006
3007 if (fail) {
3008 mdr->drop_local_auth_pins(); // just in case
3009 if (readonly)
3010 reply->mark_error_rofs();
3011 if (wouldblock)
3012 reply->mark_error_wouldblock();
3013 } else {
3014 // auth pin!
3015 for (const auto& obj : objects) {
3016 dout(10) << "auth_pinning " << *obj << dendl;
3017 mdr->auth_pin(obj);
3018 }
3019 // return list of my auth_pins (if any)
3020 for (const auto &p : mdr->object_states) {
3021 if (!p.second.auth_pinned)
3022 continue;
3023 MDSCacheObjectInfo info;
3024 p.first->set_object_info(info);
3025 reply->get_authpins().push_back(info);
3026 if (p.first == (MDSCacheObject*)auth_pin_freeze)
3027 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
3028 }
3029 }
3030
3031 mds->send_message_mds(reply, mdr->slave_to_mds);
3032
3033 // clean up this request
3034 mdr->reset_slave_request();
3035 return;
3036
3037blocked:
3038 if (mdr->slave_request->should_notify_blocking()) {
3039 reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
3040 reply->mark_req_blocked();
3041 mds->send_message_mds(reply, mdr->slave_to_mds);
3042 mdr->slave_request->clear_notify_blocking();
3043 }
3044 return;
3045}
3046
3047void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
3048{
3049 dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
3050 mds_rank_t from = mds_rank_t(ack->get_source().num());
3051
3052 if (ack->is_req_blocked()) {
3053 mdr->disable_lock_cache();
3054 // slave auth pin is blocked, drop locks to avoid deadlock
3055 mds->locker->drop_locks(mdr.get(), nullptr);
3056 return;
3057 }
3058
3059 // added auth pins?
3060 set<MDSCacheObject*> pinned;
3061 for (const auto &oi : ack->get_authpins()) {
3062 MDSCacheObject *object = mdcache->get_object(oi);
3063 ceph_assert(object); // we pinned it
3064 dout(10) << " remote has pinned " << *object << dendl;
3065 mdr->set_remote_auth_pinned(object, from);
3066 if (oi == ack->get_authpin_freeze())
3067 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
3068 pinned.insert(object);
3069 }
3070
3071 // removed frozen auth pin ?
3072 if (mdr->more()->is_remote_frozen_authpin &&
3073 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
3074 auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
3075 ceph_assert(stat_p);
3076 if (stat_p->remote_auth_pinned == from) {
3077 mdr->more()->is_remote_frozen_authpin = false;
3078 }
3079 }
3080
3081 // removed auth pins?
3082 for (auto& p : mdr->object_states) {
3083 if (p.second.remote_auth_pinned == MDS_RANK_NONE)
3084 continue;
3085 MDSCacheObject* object = p.first;
3086 if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
3087 dout(10) << " remote has unpinned " << *object << dendl;
3088 mdr->_clear_remote_auth_pinned(p.second);
3089 }
3090 }
3091
3092 // note slave
3093 mdr->more()->slaves.insert(from);
3094
3095 // clear from waiting list
3096 auto ret = mdr->more()->waiting_on_slave.erase(from);
3097 ceph_assert(ret);
3098
3099 if (ack->is_error_rofs()) {
3100 mdr->more()->slave_error = -EROFS;
3101 } else if (ack->is_error_wouldblock()) {
3102 mdr->more()->slave_error = -EWOULDBLOCK;
3103 }
3104
3105 // go again?
3106 if (mdr->more()->waiting_on_slave.empty())
3107 mdcache->dispatch_request(mdr);
3108 else
3109 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
3110}
3111
3112
3113// ---------------------------------------
3114// HELPERS
3115
3116
3117/**
3118 * check whether we are permitted to complete a request
3119 *
3120 * Check whether we have permission to perform the operation specified
3121 * by mask on the given inode, based on the capability in the mdr's
3122 * session.
3123 */
3124bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
3125{
3126 if (mdr->session) {
3127 int r = mdr->session->check_access(
3128 in, mask,
3129 mdr->client_request->get_caller_uid(),
3130 mdr->client_request->get_caller_gid(),
3131 &mdr->client_request->get_caller_gid_list(),
3132 mdr->client_request->head.args.setattr.uid,
3133 mdr->client_request->head.args.setattr.gid);
3134 if (r < 0) {
3135 respond_to_request(mdr, r);
3136 return false;
3137 }
3138 }
3139 return true;
3140}
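/* Minimal caller sketch (illustrative): check_access() sends the error
 * reply itself, so call sites only need to bail out on false:
 *
 *   if (!check_access(mdr, cur, MAY_READ | MAY_WRITE))
 *     return;  // respond_to_request() was already called
 */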
3141
3142/**
3143 * check whether fragment has reached maximum size
3144 *
3145 */
3146bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
3147{
3148 const auto size = in->get_frag_size();
3149 if (size >= g_conf()->mds_bal_fragment_size_max) {
3150 dout(10) << "fragment " << *in << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
3151 respond_to_request(mdr, -ENOSPC);
3152 return false;
3153 }
3154
3155 return true;
3156}
3157
3158CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3159{
3160 CDentry *straydn = mdr->straydn;
3161 if (straydn) {
3162 string straydname;
3163 in->name_stray_dentry(straydname);
3164 ceph_assert(straydn->get_name() == straydname);
3165 return straydn;
3166 }
3167
3168 CDir *straydir = mdcache->get_stray_dir(in);
3169
3170 if (!mdr->client_request->is_replay() &&
3171 !check_fragment_space(mdr, straydir))
3172 return NULL;
3173
3174 straydn = mdcache->get_or_create_stray_dentry(in);
3175 mdr->straydn = straydn;
3176 mdr->pin(straydn);
3177 return straydn;
3178}
3179
3180/** prepare_new_inode
3181 *
3182 * create a new inode. set c/m/atime. hit dir pop.
3183 */
3184CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3185 file_layout_t *layout)
3186{
3187 CInode *in = new CInode(mdcache);
3188
3189 // Server::prepare_force_open_sessions() can re-open session in closing
3190 // state. In that corner case, session's prealloc_inos are being freed.
3191 // To simplify the code, we disallow using/refilling session's prealloc_ino
3192 // while session is opening.
3193 bool allow_prealloc_inos = mdr->session->is_open();
3194
3195 // assign ino
3196 if (allow_prealloc_inos && (mdr->used_prealloc_ino = in->inode.ino = mdr->session->take_ino(useino))) {
3197 mds->sessionmap.mark_projected(mdr->session);
3198 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3199 << " (" << mdr->session->info.prealloc_inos
3200 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
3201 << dendl;
3202 } else {
3203 mdr->alloc_ino =
3204 in->inode.ino = mds->inotable->project_alloc_id(useino);
3205 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3206 }
3207
3208 if (useino && useino != in->inode.ino) {
3209 dout(0) << "WARNING: client specified " << useino << " and i allocated " << in->inode.ino << dendl;
3210 mds->clog->error() << mdr->client_request->get_source()
3211 << " specified ino " << useino
3212 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
3213 //ceph_abort(); // just for now.
3214 }
3215
3216 if (allow_prealloc_inos &&
3217 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3218 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3219 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3220 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3221 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3222 mds->sessionmap.mark_projected(mdr->session);
3223 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3224 }
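 /* Worked example (assuming the default mds_client_prealloc_inos = 1000):
  * a refill triggers once the projected pool drops below 1000 / 2 = 500,
  * and "need" tops it back up to exactly 1000, e.g. with 437 inos left a
  * single batch of 1000 - 437 = 563 new inos is projected. */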
3225
3226 in->inode.version = 1;
3227 in->inode.xattr_version = 1;
3228 in->inode.nlink = 1; // FIXME
3229
3230 in->inode.mode = mode;
3231
3232 // FIPS zeroization audit 20191117: this memset is not security related.
3233 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
3234 if (in->inode.is_dir()) {
3235 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3236 } else if (layout) {
3237 in->inode.layout = *layout;
3238 } else {
3239 in->inode.layout = mdcache->default_file_layout;
3240 }
3241
3242 in->inode.truncate_size = -1ull; // not truncated, yet!
3243 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3244
3245 CInode *diri = dir->get_inode();
3246
3247 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
3248
3249 if (diri->inode.mode & S_ISGID) {
3250 dout(10) << " dir is sticky" << dendl;
3251 in->inode.gid = diri->inode.gid;
3252 if (S_ISDIR(mode)) {
3253 dout(10) << " new dir also sticky" << dendl;
3254 in->inode.mode |= S_ISGID;
3255 }
3256 } else
3257 in->inode.gid = mdr->client_request->get_caller_gid();
3258
3259 in->inode.uid = mdr->client_request->get_caller_uid();
3260
3261 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
3262 mdr->get_op_stamp();
3263
3264 in->inode.change_attr = 0;
3265
3266 const cref_t<MClientRequest> &req = mdr->client_request;
3267 if (req->get_data().length()) {
3268 auto p = req->get_data().cbegin();
3269
3270 // xattrs on new inode?
3271 CInode::mempool_xattr_map xattrs;
3272 decode_noshare(xattrs, p);
3273 for (const auto &p : xattrs) {
3274 dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
3275 auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
3276 if (!em.second)
3277 em.first->second = p.second;
3278 }
3279 }
3280
3281 if (!mds->mdsmap->get_inline_data_enabled() ||
3282 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3283 in->inode.inline_data.version = CEPH_INLINE_NONE;
3284
3285 mdcache->add_inode(in); // add
3286 dout(10) << "prepare_new_inode " << *in << dendl;
3287 return in;
3288}
3289
3290void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3291{
3292 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3293 << " inotablev " << mds->inotable->get_projected_version()
3294 << dendl;
3295 blob->set_ino_alloc(mdr->alloc_ino,
3296 mdr->used_prealloc_ino,
3297 mdr->prealloc_inos,
3298 mdr->client_request->get_source(),
3299 mds->sessionmap.get_projected(),
3300 mds->inotable->get_projected_version());
3301}
3302
3303void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3304{
3305 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3306 << " / " << mdr->prealloc_inos
3307 << " / " << mdr->used_prealloc_ino << dendl;
3308
3309 if (mdr->alloc_ino) {
3310 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3311 }
3312 if (mdr->prealloc_inos.size()) {
3313 ceph_assert(session);
3314 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3315 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3316 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
3317 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3318 }
3319 if (mdr->used_prealloc_ino) {
3320 ceph_assert(session);
3321 session->info.used_inos.erase(mdr->used_prealloc_ino);
3322 mds->sessionmap.mark_dirty(session);
3323 }
3324}
3325
3326class C_MDS_TryFindInode : public ServerContext {
3327 MDRequestRef mdr;
3328public:
3329 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3330 void finish(int r) override {
3331 if (r == -ESTALE) // :( find_ino_peers failed
3332 server->respond_to_request(mdr, r);
3333 else
3334 server->dispatch_client_request(mdr);
3335 }
3336};
3337
3338class CF_MDS_MDRContextFactory : public MDSContextFactory {
3339public:
3340 CF_MDS_MDRContextFactory(MDCache *cache, MDRequestRef &mdr, bool dl) :
3341 mdcache(cache), mdr(mdr), drop_locks(dl) {}
3342 MDSContext *build() {
3343 if (drop_locks) {
3344 mdcache->mds->locker->drop_locks(mdr.get(), nullptr);
3345 mdr->drop_local_auth_pins();
3346 }
3347 return new C_MDS_RetryRequest(mdcache, mdr);
3348 }
3349private:
3350 MDCache *mdcache;
3351 MDRequestRef mdr;
3352 bool drop_locks;
3353};
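/* Typical use (a sketch): the factory is handed to path_traverse(), which
 * only calls build() when it must block. With drop_locks=true the waiter
 * first releases this request's locks and auth pins, so a blocked
 * traversal never sleeps while holding them:
 *
 *   CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
 *   int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
 */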
3354
3355/* If this returns null, the request has been handled
3356 * as appropriate: forwarded on, or the client's been replied to */
3357CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
3358 bool want_auth,
3359 bool no_want_auth)
3360{
3361 const filepath& refpath = mdr->get_filepath();
3362 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3363
3364 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3365 return mdr->in[0];
3366
3367 // traverse
3368 CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
3369 int flags = 0;
3370 if (refpath.is_last_snap()) {
3371 if (!no_want_auth)
3372 want_auth = true;
3373 } else {
3374 if (!no_want_auth && forward_all_requests_to_auth)
3375 want_auth = true;
3376 flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
3377 }
3378 if (want_auth)
3379 flags |= MDS_TRAVERSE_WANT_AUTH;
3380 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
3381 if (r > 0)
3382 return nullptr; // delayed
3383 if (r < 0) { // error
3384 if (r == -ENOENT && !mdr->dn[0].empty()) {
3385 if (mdr->client_request &&
3386 mdr->client_request->get_dentry_wanted())
3387 mdr->tracedn = mdr->dn[0].back();
3388 respond_to_request(mdr, r);
3389 } else if (r == -ESTALE) {
3390 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3391 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
3392 mdcache->find_ino_peers(refpath.get_ino(), c);
3393 } else {
3394 dout(10) << "FAIL on error " << r << dendl;
3395 respond_to_request(mdr, r);
3396 }
3397 return nullptr;
3398 }
3399 CInode *ref = mdr->in[0];
3400 dout(10) << "ref is " << *ref << dendl;
3401
3402 if (want_auth) {
3403 // auth_pin?
3404 // do NOT proceed if freezing, as cap release may defer in that case, and
3405 // we could deadlock when we try to lock @ref.
3406 // if we're already auth_pinned, continue; the release has already been processed.
3407 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3408 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3409 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3410 ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
3411 if (mdr->is_any_remote_auth_pin())
3412 mds->locker->notify_freeze_waiter(ref);
3413 return 0;
3414 }
3415 mdr->auth_pin(ref);
3416 }
3417
3418 // set and pin ref
3419 mdr->pin(ref);
3420 return ref;
3421}
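/* Caller pattern (sketch): a null return always means "already handled" --
 * forwarded to another rank, queued for retry, or answered with an error:
 *
 *   CInode *cur = rdlock_path_pin_ref(mdr, true);
 *   if (!cur)
 *     return;  // no further action needed here
 */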
3422
3423
3424/** rdlock_path_xlock_dentry
3425 * traverse path to the directory that could/would contain dentry.
3426 * make sure i am auth for that dentry, forward as necessary.
3427 * create null dentry in place (or use existing if okexist).
3428 * get rdlocks on traversed dentries, xlock on new dentry.
3429 */
3430CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
3431 bool create, bool okexist, bool want_layout)
3432{
3433 const filepath& refpath = mdr->get_filepath();
3434 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3435
3436 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3437 return mdr->dn[0].back();
3438
3439 // figure parent dir vs dname
3440 if (refpath.depth() == 0) {
3441 dout(7) << "invalid path (zero length)" << dendl;
3442 respond_to_request(mdr, -EINVAL);
3443 return nullptr;
3444 }
3445
3446 if (refpath.is_last_snap()) {
3447 respond_to_request(mdr, -EROFS);
3448 return nullptr;
3449 }
3450
3451 if (refpath.is_last_dot_or_dotdot()) {
3452 dout(7) << "invalid path (last dot or dot_dot)" << dendl;
3453 if (create)
3454 respond_to_request(mdr, -EEXIST);
3455 else
3456 respond_to_request(mdr, -ENOTEMPTY);
3457 return nullptr;
3458 }
3459
3460 // traverse to parent dir
3461 CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
3462 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
3463 MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
3464 MDS_TRAVERSE_WANT_AUTH;
3465 if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
3466 flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
3467 if (create)
3468 flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
3469 if (want_layout)
3470 flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
3471 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3472 if (r > 0)
3473 return nullptr; // delayed
3474 if (r < 0) {
3475 if (r == -ESTALE) {
3476 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3477 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3478 return nullptr;
3479 }
3480 respond_to_request(mdr, r);
3481 return nullptr;
3482 }
3483
3484 CDentry *dn = mdr->dn[0].back();
3485 CDir *dir = dn->get_dir();
3486 CInode *diri = dir->get_inode();
3487
3488 if (!mdr->reqid.name.is_mds()) {
3489 if (diri->is_system() && !diri->is_root()) {
3490 respond_to_request(mdr, -EROFS);
3491 return nullptr;
3492 }
3493 }
3494
3495 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3496 respond_to_request(mdr, -ENOENT);
3497 return nullptr;
3498 }
3499
3500 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3501 if (dnl->is_null()) {
3502 if (!create && okexist) {
3503 respond_to_request(mdr, -ENOENT);
3504 return nullptr;
3505 }
3506
3507 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3508 dn->first = std::max(dn->first, next_snap);
3509 } else {
3510 if (!okexist) {
3511 respond_to_request(mdr, -EEXIST);
3512 return nullptr;
3513 }
3514 mdr->in[0] = dnl->get_inode();
3515 }
3516
3517 return dn;
3518}
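/* Outcome summary for the branch above (restating the code, not new logic):
 *
 *   null dentry, !create && okexist  -> -ENOENT reply, nullptr
 *   null dentry, otherwise           -> dn->first snap-adjusted, dn returned
 *   existing dentry, !okexist        -> -EEXIST reply, nullptr
 *   existing dentry, okexist         -> mdr->in[0] set, dn returned
 */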
3519
3520/** rdlock_two_paths_xlock_destdn
3521 * traverse two paths and lock the two paths in proper order.
3522 * The order of taking locks is:
3523 * 1. Lock directory inodes or dentries according to which trees they
3524 * are under. Lock objects under fs root before objects under mdsdir.
3525 * 2. Lock directory inodes or dentries according to their depth, in
3526 * ascending order.
3527 * 3. Lock directory inodes or dentries according to inode numbers or
3528 * dentries' parent inode numbers, in ascending order.
3529 * 4. Lock dentries in the same directory in order of their keys.
3530 * 5. Lock non-directory inodes according to inode numbers, in ascending
3531 * order.
3532 */
3533std::pair<CDentry*, CDentry*>
3534Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
3535{
3536
3537 const filepath& refpath = mdr->get_filepath();
3538 const filepath& refpath2 = mdr->get_filepath2();
3539
3540 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
3541
3542 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3543 return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
3544
3545 if (refpath.depth() != 1 || refpath2.depth() != 1) {
3546 respond_to_request(mdr, -EINVAL);
3547 return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
3548 }
3549
3550 if (refpath.is_last_snap() || refpath2.is_last_snap()) {
3551 respond_to_request(mdr, -EROFS);
3552 return std::make_pair(nullptr, nullptr);
3553 }
3554
3555 // traverse to parent dir
3556 CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
3557 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
3558 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3559 if (r != 0) {
3560 if (r == -ESTALE) {
3561 dout(10) << "ESTALE on path, attempting recovery" << dendl;
3562 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3563 } else if (r < 0) {
3564 respond_to_request(mdr, r);
3565 }
3566 return std::make_pair(nullptr, nullptr);
3567 }
3568
3569 flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
3570 r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
3571 if (r != 0) {
3572 if (r == -ESTALE) {
3573 dout(10) << "ESTALE on path2, attempting recovery" << dendl;
3574 mdcache->find_ino_peers(refpath2.get_ino(), new C_MDS_TryFindInode(this, mdr));
3575 } else if (r < 0) {
3576 respond_to_request(mdr, r);
3577 }
3578 return std::make_pair(nullptr, nullptr);
3579 }
3580
3581 CDentry *srcdn = mdr->dn[1].back();
3582 CDir *srcdir = srcdn->get_dir();
3583 CDentry *destdn = mdr->dn[0].back();
3584 CDir *destdir = destdn->get_dir();
3585
3586 if (!mdr->reqid.name.is_mds()) {
3587 if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
3588 (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
3589 respond_to_request(mdr, -EROFS);
3590 return std::make_pair(nullptr, nullptr);
3591 }
3592 }
3593
3594 if (!destdir->get_inode()->is_base() &&
3595 destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
3596 respond_to_request(mdr, -ENOENT);
3597 return std::make_pair(nullptr, nullptr);
3598 }
3599
3600 MutationImpl::LockOpVec lov;
3601 if (srcdir->get_inode() == destdir->get_inode()) {
3602 lov.add_wrlock(&destdir->inode->filelock);
3603 lov.add_wrlock(&destdir->inode->nestlock);
3604 if (xlock_srcdn && srcdir != destdir) {
3605 mds_rank_t srcdir_auth = srcdir->authority().first;
3606 if (srcdir_auth != mds->get_nodeid()) {
3607 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3608 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3609 }
3610 }
3611
3612 if (srcdn->get_name() > destdn->get_name())
3613 lov.add_xlock(&destdn->lock);
3614
3615 if (xlock_srcdn)
3616 lov.add_xlock(&srcdn->lock);
3617 else
3618 lov.add_rdlock(&srcdn->lock);
3619
3620 if (srcdn->get_name() < destdn->get_name())
3621 lov.add_xlock(&destdn->lock);
3622 } else {
3623 int cmp = mdr->compare_paths();
3624 bool lock_destdir_first =
3625 (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
3626
3627 if (lock_destdir_first) {
3628 lov.add_wrlock(&destdir->inode->filelock);
3629 lov.add_wrlock(&destdir->inode->nestlock);
3630 lov.add_xlock(&destdn->lock);
3631 }
3632
3633 if (xlock_srcdn) {
3634 mds_rank_t srcdir_auth = srcdir->authority().first;
3635 if (srcdir_auth == mds->get_nodeid()) {
3636 lov.add_wrlock(&srcdir->inode->filelock);
3637 lov.add_wrlock(&srcdir->inode->nestlock);
3638 } else {
3639 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3640 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3641 }
3642 lov.add_xlock(&srcdn->lock);
3643 } else {
3644 lov.add_rdlock(&srcdn->lock);
3645 }
3646
3647 if (!lock_destdir_first) {
3648 lov.add_wrlock(&destdir->inode->filelock);
3649 lov.add_wrlock(&destdir->inode->nestlock);
3650 lov.add_xlock(&destdn->lock);
3651 }
3652 }
3653
3654 CInode *auth_pin_freeze = nullptr;
3655 // XXX any better way to do this?
3656 if (xlock_srcdn && !srcdn->is_auth()) {
3657 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
3658 auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
3659 }
3660 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
3661 return std::make_pair(nullptr, nullptr);
3662
3663 if (srcdn->get_projected_linkage()->is_null()) {
3664 respond_to_request(mdr, -ENOENT);
3665 return std::make_pair(nullptr, nullptr);
3666 }
3667
3668 if (destdn->get_projected_linkage()->is_null()) {
3669 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3670 destdn->first = std::max(destdn->first, next_snap);
3671 }
3672
3673 mdr->locking_state |= MutationImpl::PATH_LOCKED;
3674
3675 return std::make_pair(destdn, srcdn);
3676}
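/* A condensed sketch of the ordering rules as a "lock destdir first?"
 * predicate (illustrative; the real decision is inlined above):
 *
 *   int cmp = mdr->compare_paths();    // rules 1-2: tree, then depth
 *   bool destdir_first =
 *     (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));  // rule 3
 *
 * Rule 4 is why the equal-parent branch orders the destdn xlock by
 * comparing dentry names against srcdn.
 */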
3677
3678/**
3679 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3680 *
3681 * @param diri base inode
3682 * @param fg the exact frag we want
3683 * @param mdr request
3684 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3685 */
3686CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3687{
3688 CDir *dir = diri->get_dirfrag(fg);
3689
3690 if (dir) {
3691 // am i auth for the dirfrag?
3692 if (!dir->is_auth()) {
3693 mds_rank_t auth = dir->authority().first;
3694 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3695 << ", fw to mds." << auth << dendl;
3696 mdcache->request_forward(mdr, auth);
3697 return nullptr;
3698 }
3699 } else {
3700 // not open and inode not mine?
3701 if (!diri->is_auth()) {
3702 mds_rank_t inauth = diri->authority().first;
3703 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3704 mdcache->request_forward(mdr, inauth);
3705 return nullptr;
3706 }
3707
3708 // not open and inode frozen?
3709 if (diri->is_frozen()) {
3710 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3711 ceph_assert(diri->get_parent_dir());
3712 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3713 return nullptr;
3714 }
3715
3716 // invent?
3717 dir = diri->get_or_open_dirfrag(mdcache, fg);
3718 }
3719
3720 return dir;
3721}
3722
3723
3724// ===============================================================================
3725// STAT
3726
3727void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3728{
3729 const cref_t<MClientRequest> &req = mdr->client_request;
3730
3731 if (req->get_filepath().depth() == 0 && is_lookup) {
3732 // refpath can't be empty for lookup but it can for
3733 // getattr (we do getattr with empty refpath for mount of '/')
3734 respond_to_request(mdr, -EINVAL);
3735 return;
3736 }
3737
3738 bool want_auth = false;
3739 int mask = req->head.args.getattr.mask;
3740 if (mask & CEPH_STAT_RSTAT)
3741 want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
3742
3743 if (!mdr->is_batch_head() && mdr->can_batch()) {
3744 CF_MDS_MDRContextFactory cf(mdcache, mdr, false);
3745 int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
3746 (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
3747 &mdr->dn[0], &mdr->in[0]);
3748 if (r > 0)
3749 return; // delayed
3750
3751 if (r < 0) {
3752 // fall-thru. let rdlock_path_pin_ref() check again.
3753 } else if (is_lookup) {
3754 CDentry* dn = mdr->dn[0].back();
3755 mdr->pin(dn);
3756 auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3757 if (em.second) {
3758 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3759 } else {
3760 dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3761 em.first->second->add_request(mdr);
3762 return;
3763 }
3764 } else {
3765 CInode *in = mdr->in[0];
3766 mdr->pin(in);
3767 auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3768 if (em.second) {
3769 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3770 } else {
3771 dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3772 em.first->second->add_request(mdr);
3773 return;
3774 }
3775 }
3776 }
3777
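 /* Batching sketch (illustrative): concurrent getattr/lookup requests on
  * the same inode or dentry with the same mask collapse into a single
  * Batch_Getattr_Lookup. The first request becomes the batch head and
  * does the actual locking; later arrivals just add_request() themselves
  * and are answered when the head's reply is built. */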
3778 CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
3779 if (!ref)
3780 return;
3781
3782 mdr->getattr_caps = mask;
3783
3784 /*
3785 * if client currently holds the EXCL cap on a field, do not rdlock
3786 * it; client's stat() will result in valid info if _either_ EXCL
3787 * cap is held or MDS rdlocks and reads the value here.
3788 *
3789 * handling this case here is easier than weakening rdlock
3790 * semantics... that would cause problems elsewhere.
3791 */
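 /* Sketch of the rule above for one field (illustrative):
  *
  *   wants CEPH_CAP_LINK_SHARED, client holds CEPH_CAP_LINK_EXCL
  *     -> skip the rdlock; the client's cached value is authoritative
  *   wants CEPH_CAP_LINK_SHARED, no EXCL issued
  *     -> rdlock linklock so the MDS reads a stable value
  */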
3792 client_t client = mdr->get_client();
3793 int issued = 0;
3794 Capability *cap = ref->get_client_cap(client);
3795 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3796 mdr->snapid <= cap->client_follows))
3797 issued = cap->issued();
3798
3799 // FIXME
3800 MutationImpl::LockOpVec lov;
3801 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3802 lov.add_rdlock(&ref->linklock);
3803 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3804 lov.add_rdlock(&ref->authlock);
3805 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3806 lov.add_rdlock(&ref->xattrlock);
3807 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3808 // Don't wait on unstable filelock if client is allowed to read file size.
3809 // This can reduce the response time of getattr in the case that multiple
3810 // clients do stat(2) and there are writers.
3811 // The downside of this optimization is that mds may not issue Fs caps along
3812 // with getattr reply. Client may need to send more getattr requests.
3813 if (mdr->is_rdlocked(&ref->filelock)) {
3814 lov.add_rdlock(&ref->filelock);
3815 } else if (ref->filelock.is_stable() ||
3816 ref->filelock.get_num_wrlocks() > 0 ||
3817 !ref->filelock.can_read(mdr->get_client())) {
3818 lov.add_rdlock(&ref->filelock);
3819 mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
3820 }
3821 }
7c673cae 3822
3823 if (!mds->locker->acquire_locks(mdr, lov))
3824 return;
3825
3826 if (!check_access(mdr, ref, MAY_READ))
3827 return;
3828
3829 utime_t now = ceph_clock_now();
3830 mdr->set_mds_stamp(now);
3831
3832 // note which caps are requested, so we return at least a snapshot
3833 // value for them. (currently this matters for xattrs and inline data)
3834 mdr->getattr_caps = mask;
3835
3836 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
3837
3838 // reply
3839 dout(10) << "reply to stat on " << *req << dendl;
3840 mdr->tracei = ref;
3841 if (is_lookup)
3842 mdr->tracedn = mdr->dn[0].back();
3843 respond_to_request(mdr, 0);
3844}
3845
3846struct C_MDS_LookupIno2 : public ServerContext {
3847 MDRequestRef mdr;
3848 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3849 void finish(int r) override {
3850 server->_lookup_ino_2(mdr, r);
3851 }
3852};
3853
3854/*
3855 * filepath: ino
3856 */
3857void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3858 bool want_parent, bool want_dentry)
3859{
3860 const cref_t<MClientRequest> &req = mdr->client_request;
3861
3862 if ((uint64_t)req->head.args.lookupino.snapid > 0)
3863 return _lookup_snap_ino(mdr);
3864
3865 inodeno_t ino = req->get_filepath().get_ino();
3866 CInode *in = mdcache->get_inode(ino);
3867 if (in && in->state_test(CInode::STATE_PURGING)) {
3868 respond_to_request(mdr, -ESTALE);
3869 return;
3870 }
3871 if (!in) {
3872 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3873 return;
3874 }
3875
3876 if (mdr && in->snaprealm && !in->snaprealm->have_past_parents_open() &&
3877 !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
3878 return;
3879 }
3880
3881 // check for nothing (not read or write); this still applies the
3882 // path check.
3883 if (!check_access(mdr, in, 0))
3884 return;
3885
3886 CDentry *dn = in->get_projected_parent_dn();
3887 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3888
3889 MutationImpl::LockOpVec lov;
3890 if (dn && (want_parent || want_dentry)) {
3891 mdr->pin(dn);
3892 lov.add_rdlock(&dn->lock);
3893 }
3894
3895 unsigned mask = req->head.args.lookupino.mask;
3896 if (mask) {
3897 Capability *cap = in->get_client_cap(mdr->get_client());
3898 int issued = 0;
3899 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3900 issued = cap->issued();
3901 // FIXME
3902 // permission bits, ACL/security xattrs
3903 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3904 lov.add_rdlock(&in->authlock);
3905 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3906 lov.add_rdlock(&in->xattrlock);
3907
3908 mdr->getattr_caps = mask;
3909 }
3910
3911 if (!lov.empty()) {
3912 if (!mds->locker->acquire_locks(mdr, lov))
3913 return;
3914
3915 if (diri != NULL) {
3916 // need read access to directory inode
3917 if (!check_access(mdr, diri, MAY_READ))
3918 return;
3919 }
3920 }
3921
3922 if (want_parent) {
3923 if (in->is_base()) {
3924 respond_to_request(mdr, -EINVAL);
3925 return;
3926 }
3927 if (!diri || diri->is_stray()) {
3928 respond_to_request(mdr, -ESTALE);
3929 return;
3930 }
3931 dout(10) << "reply to lookup_parent " << *in << dendl;
3932 mdr->tracei = diri;
3933 respond_to_request(mdr, 0);
3934 } else {
3935 if (want_dentry) {
3936 inodeno_t dirino = req->get_filepath2().get_ino();
3937 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3938 respond_to_request(mdr, -ENOENT);
3939 return;
3940 }
3941 dout(10) << "reply to lookup_name " << *in << dendl;
3942 } else
3943 dout(10) << "reply to lookup_ino " << *in << dendl;
3944
3945 mdr->tracei = in;
3946 if (want_dentry)
3947 mdr->tracedn = dn;
3948 respond_to_request(mdr, 0);
3949 }
3950}
3951
3952void Server::_lookup_snap_ino(MDRequestRef& mdr)
3953{
3954 const cref_t<MClientRequest> &req = mdr->client_request;
3955
3956 vinodeno_t vino;
3957 vino.ino = req->get_filepath().get_ino();
3958 vino.snapid = (__u64)req->head.args.lookupino.snapid;
3959 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
3960 __u32 hash = req->head.args.lookupino.hash;
3961
3962 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
3963
3964 CInode *in = mdcache->lookup_snap_inode(vino);
3965 if (!in) {
3966 in = mdcache->get_inode(vino.ino);
3967 if (in) {
3968 if (in->state_test(CInode::STATE_PURGING) ||
3969 !in->has_snap_data(vino.snapid)) {
3970 if (in->is_dir() || !parent_ino) {
3971 respond_to_request(mdr, -ESTALE);
3972 return;
3973 }
3974 in = NULL;
3975 }
3976 }
3977 }
3978
3979 if (in) {
3980 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
3981 mdr->snapid = vino.snapid;
3982 mdr->tracei = in;
3983 respond_to_request(mdr, 0);
3984 return;
3985 }
3986
3987 CInode *diri = NULL;
3988 if (parent_ino) {
3989 diri = mdcache->get_inode(parent_ino);
3990 if (!diri) {
3991 mdcache->open_ino(parent_ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
3992 return;
3993 }
3994
3995 if (!diri->is_dir()) {
3996 respond_to_request(mdr, -EINVAL);
3997 return;
3998 }
3999
4000 MutationImpl::LockOpVec lov;
4001 lov.add_rdlock(&diri->dirfragtreelock);
4002 if (!mds->locker->acquire_locks(mdr, lov))
4003 return;
4004
4005 frag_t frag = diri->dirfragtree[hash];
4006 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
4007 if (!dir)
4008 return;
4009
4010 if (!dir->is_complete()) {
4011 if (dir->is_frozen()) {
4012 mds->locker->drop_locks(mdr.get());
4013 mdr->drop_local_auth_pins();
4014 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4015 return;
4016 }
4017 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4018 return;
4019 }
4020
4021 respond_to_request(mdr, -ESTALE);
4022 } else {
4023 mdcache->open_ino(vino.ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
4024 }
4025}
4026
4027void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4028{
4029 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4030 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4031
4032 // `r` is a rank if >=0, else an error code
4033 if (r >= 0) {
4034 mds_rank_t dest_rank(r);
4035 if (dest_rank == mds->get_nodeid())
4036 dispatch_client_request(mdr);
4037 else
4038 mdcache->request_forward(mdr, dest_rank);
4039 return;
4040 }
4041
4042 // give up
4043 if (r == -ENOENT || r == -ENODATA)
4044 r = -ESTALE;
4045 respond_to_request(mdr, r);
4046}
4047
4048
4049/* This function takes responsibility for the passed mdr */
4050void Server::handle_client_open(MDRequestRef& mdr)
4051{
4052 const cref_t<MClientRequest> &req = mdr->client_request;
4053 dout(7) << "open on " << req->get_filepath() << dendl;
4054
4055 int flags = req->head.args.open.flags;
4056 int cmode = ceph_flags_to_mode(flags);
4057 if (cmode < 0) {
4058 respond_to_request(mdr, -EINVAL);
4059 return;
4060 }
4061
4062 bool need_auth = !file_mode_is_readonly(cmode) ||
4063 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
4064
4065 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4066 dout(7) << "read-only FS" << dendl;
4067 respond_to_request(mdr, -EROFS);
4068 return;
4069 }
4070
4071 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
4072 if (!cur)
4073 return;
4074
4075 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
4076 ceph_assert(!need_auth);
4077 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4078 CInode *cur = rdlock_path_pin_ref(mdr, true);
4079 if (!cur)
4080 return;
4081 }
4082
4083 if (!cur->inode.is_file()) {
4084 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4085 cmode = CEPH_FILE_MODE_PIN;
4086 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
4087 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
4088 flags &= ~CEPH_O_TRUNC;
4089 }
4090
4091 dout(10) << "open flags = " << flags
4092 << ", filemode = " << cmode
4093 << ", need_auth = " << need_auth
4094 << dendl;
4095
4096 // regular file?
4097 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4098 dout(7) << "not a file or dir " << *cur << dendl;
4099 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
4100 return;
4101 }*/
4102 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
4103 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
4104 respond_to_request(mdr, -EINVAL);
4105 return;
4106 }
4107
4108 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
4109 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
4110 // we should return -EISDIR for directory, return -EINVAL for other non-regular
4111 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
4112 return;
4113 }
4114
4115 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
4116 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4117 dout(7) << "old client cannot open inline data file " << *cur << dendl;
4118 respond_to_request(mdr, -EPERM);
4119 return;
4120 }
4121
4122 // snapped data is read only
4123 if (mdr->snapid != CEPH_NOSNAP &&
4124 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4125 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
4126 respond_to_request(mdr, -EROFS);
4127 return;
4128 }
4129
4130 MutationImpl::LockOpVec lov;
4131
4132 unsigned mask = req->head.args.open.mask;
4133 if (mask) {
4134 Capability *cap = cur->get_client_cap(mdr->get_client());
4135 int issued = 0;
4136 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4137 issued = cap->issued();
4138 // permission bits, ACL/security xattrs
4139 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4140 lov.add_rdlock(&cur->authlock);
4141 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4142 lov.add_rdlock(&cur->xattrlock);
4143
4144 mdr->getattr_caps = mask;
4145 }
4146
4147 // O_TRUNC
4148 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
4149 ceph_assert(cur->is_auth());
4150
4151 lov.add_xlock(&cur->filelock);
4152 if (!mds->locker->acquire_locks(mdr, lov))
4153 return;
4154
4155 if (!check_access(mdr, cur, MAY_WRITE))
4156 return;
4157
4158 // wait for pending truncate?
4159 const auto pi = cur->get_projected_inode();
4160 if (pi->is_truncating()) {
4161 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4162 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4163 mds->locker->drop_locks(mdr.get());
4164 mdr->drop_local_auth_pins();
4165 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4166 return;
4167 }
4168
4169 do_open_truncate(mdr, cmode);
4170 return;
4171 }
4172
4173 // sync filelock if snapped.
4174 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4175 // and that data itself is flushed so that we can read the snapped data off disk.
4176 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
4177 lov.add_rdlock(&cur->filelock);
4178 }
4179
4180 if (!mds->locker->acquire_locks(mdr, lov))
4181 return;
4182
4183 mask = MAY_READ;
4184 if (cmode & CEPH_FILE_MODE_WR)
4185 mask |= MAY_WRITE;
4186 if (!check_access(mdr, cur, mask))
4187 return;
4188
4189 utime_t now = ceph_clock_now();
4190 mdr->set_mds_stamp(now);
4191
4192 if (cur->is_file() || cur->is_dir()) {
4193 if (mdr->snapid == CEPH_NOSNAP) {
4194 // register new cap
4195 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
4196 if (cap)
4197 dout(12) << "open issued caps " << ccap_string(cap->pending())
4198 << " for " << req->get_source()
4199 << " on " << *cur << dendl;
4200 } else {
4201 int caps = ceph_caps_for_mode(cmode);
4202 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4203 << " for " << req->get_source()
4204 << " snapid " << mdr->snapid
4205 << " on " << *cur << dendl;
4206 mdr->snap_caps = caps;
4207 }
4208 }
4209
4210 // increase max_size?
4211 if (cmode & CEPH_FILE_MODE_WR)
4212 mds->locker->check_inode_max_size(cur);
4213
4214 // make sure this inode gets into the journal
4215 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
4216 mdcache->open_file_table.should_log_open(cur)) {
4217 EOpen *le = new EOpen(mds->mdlog);
4218 mdlog->start_entry(le);
4219 le->add_clean_inode(cur);
4220 mdlog->submit_entry(le);
4221 }
4222
4223 // hit pop
4224 if (cmode & CEPH_FILE_MODE_WR)
4225 mds->balancer->hit_inode(cur, META_POP_IWR);
4226 else
4227 mds->balancer->hit_inode(cur, META_POP_IRD,
4228 mdr->client_request->get_source().num());
4229
4230 CDentry *dn = 0;
4231 if (req->get_dentry_wanted()) {
4232 ceph_assert(mdr->dn[0].size());
4233 dn = mdr->dn[0].back();
4234 }
4235
4236 mdr->tracei = cur;
4237 mdr->tracedn = dn;
4238 respond_to_request(mdr, 0);
4239}
4240
4241class C_MDS_openc_finish : public ServerLogContext {
4242 CDentry *dn;
4243 CInode *newi;
4244public:
4245 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4246 ServerLogContext(s, r), dn(d), newi(ni) {}
4247 void finish(int r) override {
4248 ceph_assert(r == 0);
4249
4250 dn->pop_projected_linkage();
4251
4252 // dirty inode, dn, dir
4253 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
4254 newi->mark_dirty(newi->inode.version+1, mdr->ls);
4255 newi->mark_dirty_parent(mdr->ls, true);
4256
4257 mdr->apply();
4258
4259 get_mds()->locker->share_inode_max_size(newi);
4260
4261 MDRequestRef null_ref;
4262 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4263
4264 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4265
4266 server->respond_to_request(mdr, 0);
4267
4268 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4269 }
4270};
4271
4272/* This function takes responsibility for the passed mdr */
4273void Server::handle_client_openc(MDRequestRef& mdr)
4274{
4275 const cref_t<MClientRequest> &req = mdr->client_request;
4276 client_t client = mdr->get_client();
4277
4278 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4279
4280 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4281 if (cmode < 0) {
4282 respond_to_request(mdr, -EINVAL);
4283 return;
4284 }
4285
4286 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4287 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
4288 if (!dn)
4289 return;
4290
4291 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4292 if (!excl && !dnl->is_null()) {
4293 // it existed.
4294 mds->locker->xlock_downgrade(&dn->lock, mdr.get());
4295
4296 MutationImpl::LockOpVec lov;
4297 lov.add_rdlock(&dnl->get_inode()->snaplock);
4298 if (!mds->locker->acquire_locks(mdr, lov))
4299 return;
4300
4301 handle_client_open(mdr);
4302 return;
4303 }
4304
4305 ceph_assert(dnl->is_null());
4306
4307 // set layout
4308 file_layout_t layout;
4309 if (mdr->dir_layout != file_layout_t())
4310 layout = mdr->dir_layout;
4311 else
4312 layout = mdcache->default_file_layout;
4313
4314 // What kind of client caps are required to complete this operation
4315 uint64_t access = MAY_WRITE;
4316
4317 const auto default_layout = layout;
4318
4319 // fill in any special params from client
4320 if (req->head.args.open.stripe_unit)
4321 layout.stripe_unit = req->head.args.open.stripe_unit;
4322 if (req->head.args.open.stripe_count)
4323 layout.stripe_count = req->head.args.open.stripe_count;
4324 if (req->head.args.open.object_size)
4325 layout.object_size = req->head.args.open.object_size;
4326 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4327 (__s32)req->head.args.open.pool >= 0) {
4328 layout.pool_id = req->head.args.open.pool;
4329
4330 // make sure we have as new a map as the client
4331 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4332 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4333 return;
4334 }
4335 }
4336
4337 // If client doesn't have capability to modify layout pools, then
4338 // only permit this request if the requested pool matches what the
4339 // file would have inherited anyway from its parent.
4340 if (default_layout != layout) {
4341 access |= MAY_SET_VXATTR;
4342 }
4343
4344 if (!layout.is_valid()) {
4345 dout(10) << " invalid initial file layout" << dendl;
4346 respond_to_request(mdr, -EINVAL);
4347 return;
4348 }
4349 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4350 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4351 respond_to_request(mdr, -EINVAL);
4352 return;
4353 }
4354
4355 // created null dn.
4356 CDir *dir = dn->get_dir();
4357 CInode *diri = dir->get_inode();
4358 if (!check_access(mdr, diri, access))
4359 return;
4360 if (!check_fragment_space(mdr, dir))
4361 return;
4362
4363 if (mdr->dn[0].size() == 1)
4364 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
4365
4366 // create inode.
4367 CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4368 req->head.args.open.mode | S_IFREG, &layout);
4369 ceph_assert(in);
4370
4371 // it's a file.
4372 dn->push_projected_linkage(in);
4373
4374 in->inode.version = dn->pre_dirty();
4375 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4376 in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
4377 in->inode.update_backtrace();
4378 in->inode.rstat.rfiles = 1;
4379
4380 SnapRealm *realm = diri->find_snaprealm();
4381 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4382 ceph_assert(follows >= realm->get_newest_seq());
4383
4384 ceph_assert(dn->first == follows+1);
4385 in->first = dn->first;
4386
4387 // do the open
4388 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
4389 in->authlock.set_state(LOCK_EXCL);
4390 in->xattrlock.set_state(LOCK_EXCL);
4391
4392 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4393 in->inode.client_ranges[client].range.first = 0;
4394 in->inode.client_ranges[client].range.last = in->inode.layout.stripe_unit;
4395 in->inode.client_ranges[client].follows = follows;
4396 in->mark_clientwriteable();
4397 cap->mark_clientwriteable();
4398 }
4399
4400 // prepare finisher
4401 mdr->ls = mdlog->get_current_segment();
4402 EUpdate *le = new EUpdate(mdlog, "openc");
4403 mdlog->start_entry(le);
4404 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4405 journal_allocated_inos(mdr, &le->metablob);
4406 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4407 le->metablob.add_primary_dentry(dn, in, true, true, true);
4408
4409 // make sure this inode gets into the journal
4410 le->metablob.add_opened_ino(in->ino());
4411
4412 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in);
4413
4414 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4415 openc_response_t ocresp;
4416
4417 dout(10) << "adding created_ino and delegated_inos" << dendl;
4418 ocresp.created_ino = in->inode.ino;
4419
4420 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4421 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4422 unsigned frac = 100 / delegate_inos_pct;
4423 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4424 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4425 }
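 /* Worked example (hypothetical values): with delegate_inos_pct = 20 and
  * mds_client_prealloc_inos = 1000, frac = 100 / 20 = 5; once the session
  * holds fewer than 1000 / 5 / 2 = 100 delegated inos, another
  * 1000 / 5 = 200 are delegated in this reply. */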
4426
4427 encode(ocresp, mdr->reply_extra_bl);
4428 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4429 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4430 // add the file created flag onto the reply if create_flags features is supported
4431 encode(in->inode.ino, mdr->reply_extra_bl);
4432 }
4433
4434 journal_and_reply(mdr, in, dn, le, fin);
4435
4436 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4437 // have overshot the split size (multiple opencs in flight), so here is
4438 // an early chance to split the dir if this openc makes it oversized.
4439 mds->balancer->maybe_fragment(dir, false);
4440}
4441
4442
4443
4444void Server::handle_client_readdir(MDRequestRef& mdr)
4445{
4446 const cref_t<MClientRequest> &req = mdr->client_request;
4447 Session *session = mds->get_session(req);
4448 client_t client = req->get_source().num();
4449 MutationImpl::LockOpVec lov;
4450 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
4451 if (!diri) return;
4452
4453 // it's a directory, right?
4454 if (!diri->is_dir()) {
4455 // not a dir
4456 dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
4457 respond_to_request(mdr, -ENOTDIR);
4458 return;
4459 }
4460
adb31ebb
TL
4461 auto num_caps = session->get_num_caps();
4462 auto session_cap_acquisition = session->get_cap_acquisition();
4463
4464 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
4465 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
4466 << " session_cap_acquisition: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
4467 if (logger)
4468 logger->inc(l_mdss_cap_acquisition_throttle);
4469
4470 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
4471 return;
4472 }
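 /* Worked example (hypothetical config): with max_caps_per_client = 1M,
  * max_caps_throttle_ratio = 0.5 and cap_acquisition_throttle = 500k, a
  * session already holding more than 500k caps that also acquired 500k+
  * caps in the current window gets this readdir requeued after
  * caps_throttle_retry_request_timeout instead of growing further. */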
4473
4474 lov.add_rdlock(&diri->filelock);
4475 lov.add_rdlock(&diri->dirfragtreelock);
4476
4477 if (!mds->locker->acquire_locks(mdr, lov))
4478 return;
4479
4480 if (!check_access(mdr, diri, MAY_READ))
4481 return;
4482
4483 // which frag?
4484 frag_t fg = (__u32)req->head.args.readdir.frag;
4485 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4486 string offset_str = req->get_path2();
4487
4488 __u32 offset_hash = 0;
4489 if (!offset_str.empty())
4490 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4491 else
4492 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4493
4494 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4495 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4496
4497 // does the frag exist?
4498 if (diri->dirfragtree[fg.value()] != fg) {
4499 frag_t newfg;
4500 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4501 if (fg.contains((unsigned)offset_hash)) {
4502 newfg = diri->dirfragtree[offset_hash];
4503 } else {
4504 // client actually wants next frag
4505 newfg = diri->dirfragtree[fg.value()];
4506 }
4507 } else {
4508 offset_str.clear();
4509 newfg = diri->dirfragtree[fg.value()];
4510 }
4511 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4512 fg = newfg;
4513 }
4514
4515 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4516 if (!dir) return;
4517
4518 // ok!
4519 dout(10) << "handle_client_readdir on " << *dir << dendl;
4520 ceph_assert(dir->is_auth());
4521
4522 if (!dir->is_complete()) {
4523 if (dir->is_frozen()) {
4524 dout(7) << "dir is frozen " << *dir << dendl;
4525 mds->locker->drop_locks(mdr.get());
4526 mdr->drop_local_auth_pins();
4527 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4528 return;
4529 }
4530 // fetch
4531 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4532 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4533 return;
4534 }
4535
4536#ifdef MDS_VERIFY_FRAGSTAT
4537 dir->verify_fragstat();
4538#endif
4539
4540 utime_t now = ceph_clock_now();
4541 mdr->set_mds_stamp(now);
4542
4543 snapid_t snapid = mdr->snapid;
4544 dout(10) << "snapid " << snapid << dendl;
4545
4546 SnapRealm *realm = diri->find_snaprealm();
4547
4548 unsigned max = req->head.args.readdir.max_entries;
4549 if (!max)
4550 max = dir->get_num_any(); // whatever, something big.
4551 unsigned max_bytes = req->head.args.readdir.max_bytes;
4552 if (!max_bytes)
4553 // make sure at least one item can be encoded
4554 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4555
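 // Arithmetic note: (512 << 10) is 512 KiB; the mds_max_xattr_pairs_size
 // allowance on top should let even a dentry with maximal xattrs encode,
 // so a reply can always make progress.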
4556 // start final blob
4557 bufferlist dirbl;
4558 DirStat ds;
4559 ds.frag = dir->get_frag();
4560 ds.auth = dir->get_dir_auth().first;
4561 if (dir->is_auth() && !forward_all_requests_to_auth)
4562 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4563
4564 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4565
4566 // count bytes available.
4567 // this isn't perfect, but we should capture the main variable/unbounded size items!
4568 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4569 int bytes_left = max_bytes - front_bytes;
4570 bytes_left -= realm->get_snap_trace().length();
4571
4572 // build dir contents
4573 bufferlist dnbl;
4574 __u32 numfiles = 0;
4575 bool start = !offset_hash && offset_str.empty();
7c673cae
FG
4576 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4577 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
181888fb
FG
4578 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4579 bool end = (it == dir->end());
4580 for (; !end && numfiles < max; end = (it == dir->end())) {
7c673cae
FG
4581 CDentry *dn = it->second;
4582 ++it;
4583
4584 if (dn->state_test(CDentry::STATE_PURGING))
4585 continue;
4586
4587 bool dnp = dn->use_projected(client, mdr);
4588 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4589
4590 if (dnl->is_null())
4591 continue;
4592
4593 if (dn->last < snapid || dn->first > snapid) {
4594 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4595 continue;
4596 }
4597
4598 if (!start) {
4599 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4600 if (!(offset_key < dn->key()))
4601 continue;
4602 }
4603
4604 CInode *in = dnl->get_inode();
4605
4606 if (in && in->ino() == CEPH_INO_CEPH)
4607 continue;
4608
4609 // remote link?
4610 // better for the MDS to do the work, if we think the client will stat any of these files.
4611 if (dnl->is_remote() && !in) {
4612 in = mdcache->get_inode(dnl->get_remote_ino());
4613 if (in) {
4614 dn->link_remote(dnl, in);
4615 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4616 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4617 continue;
4618 } else {
4619 // touch everything i _do_ have
94b18763
FG
4620 for (auto &p : *dir) {
4621 if (!p.second->get_linkage()->is_null())
4622 mdcache->lru.lru_touch(p.second);
4623 }
7c673cae
FG
4624
4625 // already issued caps and leases, reply immediately.
4626 if (dnbl.length() > 0) {
4627 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4628 dout(10) << " open remote dentry after caps were issued, stopping at "
4629 << dnbl.length() << " < " << bytes_left << dendl;
4630 break;
4631 }
4632
4633 mds->locker->drop_locks(mdr.get());
4634 mdr->drop_local_auth_pins();
4635 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4636 return;
4637 }
4638 }
11fdf7f2 4639 ceph_assert(in);
7c673cae 4640
94b18763 4641 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
7c673cae
FG
4642 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4643 break;
4644 }
4645
4646 unsigned start_len = dnbl.length();
4647
4648 // dentry
4649 dout(12) << "including dn " << *dn << dendl;
11fdf7f2 4650 encode(dn->get_name(), dnbl);
9f95a23c
TL
4651 int lease_mask = dnl->is_primary() ? CEPH_LEASE_PRIMARY_LINK : 0;
4652 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, dnbl);
7c673cae
FG
4653
4654 // inode
4655 dout(12) << "including inode " << *in << dendl;
4656 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4657 if (r < 0) {
4658 // chop off dn->name, lease
4659 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4660 bufferlist keep;
4661 keep.substr_of(dnbl, 0, start_len);
4662 dnbl.swap(keep);
4663 break;
4664 }
11fdf7f2 4665 ceph_assert(r >= 0);
7c673cae
FG
4666 numfiles++;
4667
4668 // touch dn
4669 mdcache->lru.lru_touch(dn);
4670 }
4671
adb31ebb
TL
4672 session->touch_readdir_cap(numfiles);
4673
7c673cae
FG
4674 __u16 flags = 0;
4675 if (end) {
4676 flags = CEPH_READDIR_FRAG_END;
4677 if (start)
4678 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4679 }
4680 // do clients only understand the END and COMPLETE flags?
4681 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4682 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4683 }
4684
4685 // finish final blob
11fdf7f2
TL
4686 encode(numfiles, dirbl);
4687 encode(flags, dirbl);
7c673cae
FG
4688 dirbl.claim_append(dnbl);
4689
4690 // yay, reply
4691 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4692 << " bytes=" << dirbl.length()
4693 << " start=" << (int)start
4694 << " end=" << (int)end
4695 << dendl;
4696 mdr->reply_extra_bl = dirbl;
4697
4698 // bump popularity. NOTE: this doesn't quite capture it.
11fdf7f2 4699 mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles);
7c673cae
FG
4700
4701 // reply
4702 mdr->tracei = diri;
4703 respond_to_request(mdr, 0);
4704}
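// For illustration, the readdir reply blob assembled above is laid out
// roughly as:
//
//   DirStat | __u32 numfiles | __u16 flags | numfiles x { name, LeaseStat, InodeStat }
//
// which is why front_bytes reserves dirbl.length() + sizeof(__u32) +
// 2*sizeof(__u8) up front, and why each candidate entry is measured
// against bytes_left before it is encoded into dnbl.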
4705
4706
4707
4708// ===============================================================================
4709// INODE UPDATES
4710
4711
4712/*
4713 * finisher for basic inode updates
4714 */
4715class C_MDS_inode_update_finish : public ServerLogContext {
4716 CInode *in;
adb31ebb 4717 bool truncating_smaller, changed_ranges, adjust_realm;
7c673cae
FG
4718public:
4719 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
adb31ebb 4720 bool sm=false, bool cr=false, bool ar=false) :
11fdf7f2 4721 ServerLogContext(s, r), in(i),
adb31ebb 4722 truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
7c673cae 4723 void finish(int r) override {
11fdf7f2 4724 ceph_assert(r == 0);
7c673cae 4725
adb31ebb
TL
4726 int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);
4727
7c673cae
FG
4728 // apply
4729 in->pop_and_dirty_projected_inode(mdr->ls);
4730 mdr->apply();
4731
11fdf7f2
TL
4732 MDSRank *mds = get_mds();
4733
7c673cae
FG
4734 // notify any clients
4735 if (truncating_smaller && in->inode.is_truncating()) {
11fdf7f2
TL
4736 mds->locker->issue_truncate(in);
4737 mds->mdcache->truncate_inode(in, mdr->ls);
4738 }
4739
adb31ebb
TL
4740 if (adjust_realm) {
4741 mds->mdcache->send_snap_update(in, 0, snap_op);
4742 mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
7c673cae
FG
4743 }
4744
11fdf7f2 4745 get_mds()->balancer->hit_inode(in, META_POP_IWR);
7c673cae
FG
4746
4747 server->respond_to_request(mdr, 0);
4748
4749 if (changed_ranges)
4750 get_mds()->locker->share_inode_max_size(in);
4751 }
4752};
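// The finisher above completes the common projected-update pattern used by
// the handlers below; a minimal sketch of that flow (names as used in this
// file) is:
//
//   auto &pi = cur->project_inode();        // stage a new inode version
//   pi.inode.version = cur->pre_dirty();
//   // ...mutate pi.inode...
//   journal_and_reply(mdr, cur, 0, le,
//                     new C_MDS_inode_update_finish(this, mdr, cur));
//
// Only once the EUpdate is durable does finish() pop the projection,
// apply the mutation, and send the reply.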
4753
4754void Server::handle_client_file_setlock(MDRequestRef& mdr)
4755{
9f95a23c 4756 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 4757 MutationImpl::LockOpVec lov;
7c673cae
FG
4758
4759 // get the inode to operate on, and set up any locks needed for that
9f95a23c 4760 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4761 if (!cur)
4762 return;
4763
11fdf7f2 4764 lov.add_xlock(&cur->flocklock);
7c673cae
FG
4765 /* acquire_locks will return true if it gets the locks. If it fails,
4766 it will redeliver this request at a later date, so drop the request.
4767 */
11fdf7f2 4768 if (!mds->locker->acquire_locks(mdr, lov)) {
7c673cae
FG
4769 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4770 return;
4771 }
4772
4773 // copy the lock change into a ceph_filelock so we can store/apply it
4774 ceph_filelock set_lock;
4775 set_lock.start = req->head.args.filelock_change.start;
4776 set_lock.length = req->head.args.filelock_change.length;
4777 set_lock.client = req->get_orig_source().num();
4778 set_lock.owner = req->head.args.filelock_change.owner;
4779 set_lock.pid = req->head.args.filelock_change.pid;
4780 set_lock.type = req->head.args.filelock_change.type;
4781 bool will_wait = req->head.args.filelock_change.wait;
4782
4783 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4784
4785 ceph_lock_state_t *lock_state = NULL;
4786 bool interrupt = false;
4787
4788 // get the appropriate lock state
4789 switch (req->head.args.filelock_change.rule) {
4790 case CEPH_LOCK_FLOCK_INTR:
4791 interrupt = true;
4792 // fall-thru
4793 case CEPH_LOCK_FLOCK:
4794 lock_state = cur->get_flock_lock_state();
4795 break;
4796
4797 case CEPH_LOCK_FCNTL_INTR:
4798 interrupt = true;
4799 // fall-thru
4800 case CEPH_LOCK_FCNTL:
4801 lock_state = cur->get_fcntl_lock_state();
4802 break;
4803
4804 default:
4805 dout(10) << "got unknown lock type " << set_lock.type
4806 << ", dropping request!" << dendl;
4807 respond_to_request(mdr, -EOPNOTSUPP);
4808 return;
4809 }
4810
4811 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4812 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4813 list<ceph_filelock> activated_locks;
11fdf7f2 4814 MDSContext::vec waiters;
7c673cae
FG
4815 if (lock_state->is_waiting(set_lock)) {
4816 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4817 lock_state->remove_waiting(set_lock);
4818 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4819 } else if (!interrupt) {
4820 dout(10) << " unlock attempt on " << set_lock << dendl;
4821 lock_state->remove_lock(set_lock, activated_locks);
4822 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4823 }
4824 mds->queue_waiters(waiters);
4825
4826 respond_to_request(mdr, 0);
4827 } else {
4828 dout(10) << " lock attempt on " << set_lock << dendl;
4829 bool deadlock = false;
4830 if (mdr->more()->flock_was_waiting &&
4831 !lock_state->is_waiting(set_lock)) {
4832 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
4833 respond_to_request(mdr, -EINTR);
4834 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4835 dout(10) << " it failed on this attempt" << dendl;
4836 // couldn't set lock right now
4837 if (deadlock) {
4838 respond_to_request(mdr, -EDEADLK);
4839 } else if (!will_wait) {
4840 respond_to_request(mdr, -EWOULDBLOCK);
4841 } else {
4842 dout(10) << " added to waiting list" << dendl;
11fdf7f2 4843 ceph_assert(lock_state->is_waiting(set_lock));
7c673cae
FG
4844 mdr->more()->flock_was_waiting = true;
4845 mds->locker->drop_locks(mdr.get());
4846 mdr->drop_local_auth_pins();
1adf2230
AA
4847 mdr->mark_event("failed to add lock, waiting");
4848 mdr->mark_nowarn();
7c673cae
FG
4849 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4850 }
4851 } else
4852 respond_to_request(mdr, 0);
4853 }
4854 dout(10) << " state after lock change: " << *lock_state << dendl;
4855}
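// For reference, the filelock_change.rule values handled above select the
// per-inode lock table as follows (FLOCK corresponds to flock(2)-style
// locks, FCNTL to POSIX record locks; the *_INTR variants cancel a
// blocked waiter rather than releasing a held lock):
//
//   CEPH_LOCK_FLOCK / CEPH_LOCK_FLOCK_INTR -> cur->get_flock_lock_state()
//   CEPH_LOCK_FCNTL / CEPH_LOCK_FCNTL_INTR -> cur->get_fcntl_lock_state()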
4856
4857void Server::handle_client_file_readlock(MDRequestRef& mdr)
4858{
9f95a23c 4859 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 4860 MutationImpl::LockOpVec lov;
7c673cae
FG
4861
4862 // get the inode to operate on, and set up any locks needed for that
9f95a23c 4863 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4864 if (!cur)
4865 return;
4866
4867 /* acquire_locks will return true if it gets the locks. If it fails,
4868 it will redeliver this request at a later date, so drop the request.
4869 */
11fdf7f2
TL
4870 lov.add_rdlock(&cur->flocklock);
4871 if (!mds->locker->acquire_locks(mdr, lov)) {
7c673cae
FG
4872 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4873 return;
4874 }
4875
4876 // copy the lock change into a ceph_filelock so we can store/apply it
4877 ceph_filelock checking_lock;
4878 checking_lock.start = req->head.args.filelock_change.start;
4879 checking_lock.length = req->head.args.filelock_change.length;
4880 checking_lock.client = req->get_orig_source().num();
4881 checking_lock.owner = req->head.args.filelock_change.owner;
4882 checking_lock.pid = req->head.args.filelock_change.pid;
4883 checking_lock.type = req->head.args.filelock_change.type;
4884
4885 // get the appropriate lock state
4886 ceph_lock_state_t *lock_state = NULL;
4887 switch (req->head.args.filelock_change.rule) {
4888 case CEPH_LOCK_FLOCK:
4889 lock_state = cur->get_flock_lock_state();
4890 break;
4891
4892 case CEPH_LOCK_FCNTL:
4893 lock_state = cur->get_fcntl_lock_state();
4894 break;
4895
4896 default:
4897 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
4898 respond_to_request(mdr, -EINVAL);
4899 return;
4900 }
4901 lock_state->look_for_lock(checking_lock);
4902
4903 bufferlist lock_bl;
11fdf7f2 4904 encode(checking_lock, lock_bl);
7c673cae
FG
4905
4906 mdr->reply_extra_bl = lock_bl;
4907 respond_to_request(mdr, 0);
4908}
4909
4910void Server::handle_client_setattr(MDRequestRef& mdr)
4911{
9f95a23c 4912 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2 4913 MutationImpl::LockOpVec lov;
9f95a23c 4914 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
4915 if (!cur) return;
4916
4917 if (mdr->snapid != CEPH_NOSNAP) {
4918 respond_to_request(mdr, -EROFS);
4919 return;
4920 }
4921 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4922 respond_to_request(mdr, -EPERM);
4923 return;
4924 }
4925
4926 __u32 mask = req->head.args.setattr.mask;
4927 __u32 access_mask = MAY_WRITE;
4928
4929 // xlock inode
4930 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
11fdf7f2 4931 lov.add_xlock(&cur->authlock);
7c673cae 4932 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
11fdf7f2 4933 lov.add_xlock(&cur->filelock);
7c673cae 4934 if (mask & CEPH_SETATTR_CTIME)
11fdf7f2 4935 lov.add_wrlock(&cur->versionlock);
7c673cae 4936
11fdf7f2 4937 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
4938 return;
4939
4940 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
4941 access_mask |= MAY_CHOWN;
4942
4943 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
4944 access_mask |= MAY_CHGRP;
4945
4946 if (!check_access(mdr, cur, access_mask))
4947 return;
4948
4949 // trunc from bigger -> smaller?
94b18763 4950 auto pip = cur->get_projected_inode();
7c673cae 4951
94b18763 4952 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
7c673cae
FG
4953
4954 // ENOSPC on growing file while full, but allow shrinks
4955 if (is_full && req->head.args.setattr.size > old_size) {
4956 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
4957 respond_to_request(mdr, -ENOSPC);
4958 return;
4959 }
4960
4961 bool truncating_smaller = false;
4962 if (mask & CEPH_SETATTR_SIZE) {
4963 truncating_smaller = req->head.args.setattr.size < old_size;
94b18763
FG
4964 if (truncating_smaller && pip->is_truncating()) {
4965 dout(10) << " waiting for pending truncate from " << pip->truncate_from
4966 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
7c673cae
FG
4967 mds->locker->drop_locks(mdr.get());
4968 mdr->drop_local_auth_pins();
4969 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4970 return;
4971 }
4972 }
4973
4974 bool changed_ranges = false;
4975
4976 // project update
4977 mdr->ls = mdlog->get_current_segment();
4978 EUpdate *le = new EUpdate(mdlog, "setattr");
4979 mdlog->start_entry(le);
4980
94b18763 4981 auto &pi = cur->project_inode();
7c673cae
FG
4982
4983 if (mask & CEPH_SETATTR_UID)
94b18763 4984 pi.inode.uid = req->head.args.setattr.uid;
7c673cae 4985 if (mask & CEPH_SETATTR_GID)
94b18763 4986 pi.inode.gid = req->head.args.setattr.gid;
7c673cae
FG
4987
4988 if (mask & CEPH_SETATTR_MODE)
94b18763 4989 pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777);
7c673cae 4990 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
94b18763
FG
4991 S_ISREG(pi.inode.mode) &&
4992 (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
4993 pi.inode.mode &= ~(S_ISUID|S_ISGID);
7c673cae
FG
4994 }
4995
4996 if (mask & CEPH_SETATTR_MTIME)
94b18763 4997 pi.inode.mtime = req->head.args.setattr.mtime;
7c673cae 4998 if (mask & CEPH_SETATTR_ATIME)
94b18763 4999 pi.inode.atime = req->head.args.setattr.atime;
7c673cae 5000 if (mask & CEPH_SETATTR_BTIME)
94b18763 5001 pi.inode.btime = req->head.args.setattr.btime;
7c673cae 5002 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
94b18763 5003 pi.inode.time_warp_seq++; // maybe not a timewarp, but still a serialization point.
7c673cae
FG
5004 if (mask & CEPH_SETATTR_SIZE) {
5005 if (truncating_smaller) {
94b18763 5006 pi.inode.truncate(old_size, req->head.args.setattr.size);
7c673cae
FG
5007 le->metablob.add_truncate_start(cur->ino());
5008 } else {
94b18763
FG
5009 pi.inode.size = req->head.args.setattr.size;
5010 pi.inode.rstat.rbytes = pi.inode.size;
7c673cae 5011 }
94b18763 5012 pi.inode.mtime = mdr->get_op_stamp();
7c673cae
FG
5013
5014 // adjust client's max_size?
f91f0fd5
TL
5015 if (mds->locker->calc_new_client_ranges(cur, pi.inode.size)) {
5016 dout(10) << " client_ranges " << cur->get_previous_projected_inode()->client_ranges
5017 << " -> " << pi.inode.client_ranges << dendl;
7c673cae
FG
5018 changed_ranges = true;
5019 }
5020 }
5021
94b18763 5022 pi.inode.version = cur->pre_dirty();
91327a77
AA
5023 pi.inode.ctime = mdr->get_op_stamp();
5024 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5025 pi.inode.rstat.rctime = mdr->get_op_stamp();
94b18763 5026 pi.inode.change_attr++;
7c673cae
FG
5027
5028 // log + wait
5029 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5030 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5031 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5032
5033 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5034 truncating_smaller, changed_ranges));
5035
5036 // flush immediately if there are readers/writers waiting
11fdf7f2 5037 if (mdr->is_xlocked(&cur->filelock) &&
7c673cae
FG
5038 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5039 mds->mdlog->flush();
5040}
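// Summary of the lock choices above: mode/uid/gid/btime/kill-sguid changes
// xlock the authlock, mtime/atime/size changes xlock the filelock, and a
// ctime-only change merely wrlocks the versionlock. A combined
// chmod+truncate therefore takes both xlocks before anything is projected.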
5041
5042/* Takes responsibility for mdr */
5043void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5044{
5045 CInode *in = mdr->in[0];
5046 client_t client = mdr->get_client();
11fdf7f2 5047 ceph_assert(in);
7c673cae
FG
5048
5049 dout(10) << "do_open_truncate " << *in << dendl;
5050
5051 SnapRealm *realm = in->find_snaprealm();
9f95a23c 5052 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
7c673cae
FG
5053
5054 mdr->ls = mdlog->get_current_segment();
5055 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5056 mdlog->start_entry(le);
5057
5058 // prepare
94b18763
FG
5059 auto &pi = in->project_inode();
5060 pi.inode.version = in->pre_dirty();
91327a77
AA
5061 pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
5062 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5063 pi.inode.rstat.rctime = mdr->get_op_stamp();
94b18763 5064 pi.inode.change_attr++;
7c673cae 5065
94b18763 5066 uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
7c673cae 5067 if (old_size > 0) {
94b18763 5068 pi.inode.truncate(old_size, 0);
7c673cae
FG
5069 le->metablob.add_truncate_start(in->ino());
5070 }
5071
5072 bool changed_ranges = false;
a8e16298 5073 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
94b18763
FG
5074 pi.inode.client_ranges[client].range.first = 0;
5075 pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
11fdf7f2 5076 pi.inode.client_ranges[client].follows = realm->get_newest_seq();
7c673cae 5077 changed_ranges = true;
f91f0fd5 5078 in->mark_clientwriteable();
a8e16298 5079 cap->mark_clientwriteable();
7c673cae
FG
5080 }
5081
5082 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5083
5084 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5085 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5086
5087 // make sure ino gets into the journal
5088 le->metablob.add_opened_ino(in->ino());
7c673cae
FG
5089
5090 mdr->o_trunc = true;
5091
5092 CDentry *dn = 0;
5093 if (mdr->client_request->get_dentry_wanted()) {
11fdf7f2 5094 ceph_assert(mdr->dn[0].size());
7c673cae
FG
5095 dn = mdr->dn[0].back();
5096 }
5097
5098 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5099 changed_ranges));
5100 // Although the `open` part can give an early reply, the truncation won't
5101 // happen until our EUpdate is persistent; to give the client a prompt
5102 // response we must also flush that event.
5103 mdlog->flush();
5104}
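// Note on the client_ranges update above: a writable open grants the
// client an initial max_size range of [0, one layout size increment),
// tracked per client; if the range changed, the finisher later calls
// share_inode_max_size() to propagate the new limit.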
5105
5106
5107/* This function cleans up the passed mdr */
5108void Server::handle_client_setlayout(MDRequestRef& mdr)
5109{
9f95a23c
TL
5110 const cref_t<MClientRequest> &req = mdr->client_request;
5111 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
5112 if (!cur) return;
5113
5114 if (mdr->snapid != CEPH_NOSNAP) {
5115 respond_to_request(mdr, -EROFS);
5116 return;
5117 }
5118 if (!cur->is_file()) {
5119 respond_to_request(mdr, -EINVAL);
5120 return;
5121 }
5122 if (cur->get_projected_inode()->size ||
5123 cur->get_projected_inode()->truncate_seq > 1) {
5124 respond_to_request(mdr, -ENOTEMPTY);
5125 return;
5126 }
5127
5128 // validate layout
5129 file_layout_t layout = cur->get_projected_inode()->layout;
5130 // save existing layout for later
5131 const auto old_layout = layout;
5132
5133 int access = MAY_WRITE;
5134
5135 if (req->head.args.setlayout.layout.fl_object_size > 0)
5136 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5137 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5138 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5139 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5140 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5141 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5142 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5143
5144 // make sure we have as new a map as the client
5145 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5146 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5147 return;
5148 }
5149 }
5150
5151 // Don't permit layout modifications without 'p' caps
5152 if (layout != old_layout) {
5153 access |= MAY_SET_VXATTR;
5154 }
5155
5156 if (!layout.is_valid()) {
5157 dout(10) << "bad layout" << dendl;
5158 respond_to_request(mdr, -EINVAL);
5159 return;
5160 }
5161 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5162 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5163 respond_to_request(mdr, -EINVAL);
5164 return;
5165 }
5166
9f95a23c 5167 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5168 lov.add_xlock(&cur->filelock);
5169 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5170 return;
5171
5172 if (!check_access(mdr, cur, access))
5173 return;
5174
5175 // project update
94b18763
FG
5176 auto &pi = cur->project_inode();
5177 pi.inode.layout = layout;
7c673cae 5178 // add the old pool to the inode
94b18763
FG
5179 pi.inode.add_old_pool(old_layout.pool_id);
5180 pi.inode.version = cur->pre_dirty();
91327a77
AA
5181 pi.inode.ctime = mdr->get_op_stamp();
5182 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5183 pi.inode.rstat.rctime = mdr->get_op_stamp();
94b18763 5184 pi.inode.change_attr++;
7c673cae
FG
5185
5186 // log + wait
5187 mdr->ls = mdlog->get_current_segment();
5188 EUpdate *le = new EUpdate(mdlog, "setlayout");
5189 mdlog->start_entry(le);
5190 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5191 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5192 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5193
5194 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5195}
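// Two properties of the setlayout path worth noting: the layout may only
// be changed while the file has never held data (size == 0 and
// truncate_seq <= 1, else -ENOTEMPTY above), and each fl_* field is
// applied only when the client passed a value > 0, so unset fields keep
// their old values.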
5196
9f95a23c 5197bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
7c673cae 5198{
9f95a23c
TL
5199 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5200 return true;
5201
11fdf7f2 5202 MutationImpl::LockOpVec lov;
9f95a23c
TL
5203 lov.add_xlock(&in->policylock);
5204 if (xlock_snaplock)
5205 lov.add_xlock(&in->snaplock);
5206 else
5207 lov.add_rdlock(&in->snaplock);
5208 if (!mds->locker->acquire_locks(mdr, lov))
5209 return false;
7c673cae 5210
9f95a23c
TL
5211 if (want_layout && in->get_projected_inode()->has_layout()) {
5212 mdr->dir_layout = in->get_projected_inode()->layout;
5213 want_layout = false;
5214 }
5215 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5216 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5217 return false;
7c673cae
FG
5218 }
5219
9f95a23c
TL
5220 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5221 return true;
5222}
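// xlock_policylock() is the shared lock helper for the vxattr handlers
// below: it xlocks the inode's policylock, takes snaplock (xlock or
// rdlock as requested), optionally captures the nearest ancestor layout
// into mdr->dir_layout, and records ALL_LOCKED so retries of the same
// request skip straight through.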
5223
5224CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5225{
5226 CInode *in = mdcache->get_inode(ino);
5227 if (!in || in->state_test(CInode::STATE_PURGING)) {
5228 respond_to_request(mdr, -ESTALE);
5229 return nullptr;
5230 }
5231 if (!in->is_auth()) {
5232 mdcache->request_forward(mdr, in->authority().first);
5233 return nullptr;
5234 }
5235
5236 return in;
5237}
5238
5239void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5240{
5241 const cref_t<MClientRequest> &req = mdr->client_request;
5242
5243 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5244 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5245 if (!cur)
5246 return;
5247
7c673cae
FG
5248 if (!cur->is_dir()) {
5249 respond_to_request(mdr, -ENOTDIR);
5250 return;
5251 }
5252
9f95a23c 5253 if (!xlock_policylock(mdr, cur, true))
7c673cae
FG
5254 return;
5255
5256 // validate layout
94b18763 5257 const auto old_pi = cur->get_projected_inode();
7c673cae
FG
5258 file_layout_t layout;
5259 if (old_pi->has_layout())
5260 layout = old_pi->layout;
9f95a23c
TL
5261 else if (mdr->dir_layout != file_layout_t())
5262 layout = mdr->dir_layout;
7c673cae
FG
5263 else
5264 layout = mdcache->default_file_layout;
5265
5266 // Level of access required to complete
5267 int access = MAY_WRITE;
5268
5269 const auto old_layout = layout;
5270
5271 if (req->head.args.setlayout.layout.fl_object_size > 0)
5272 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5273 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5274 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5275 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5276 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5277 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5278 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5279 // make sure we have as new a map as the client
5280 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5281 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5282 return;
5283 }
5284 }
5285
5286 if (layout != old_layout) {
5287 access |= MAY_SET_VXATTR;
5288 }
5289
5290 if (!layout.is_valid()) {
5291 dout(10) << "bad layout" << dendl;
5292 respond_to_request(mdr, -EINVAL);
5293 return;
5294 }
5295 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5296 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5297 respond_to_request(mdr, -EINVAL);
5298 return;
5299 }
5300
5301 if (!check_access(mdr, cur, access))
5302 return;
5303
94b18763
FG
5304 auto &pi = cur->project_inode();
5305 pi.inode.layout = layout;
5306 pi.inode.version = cur->pre_dirty();
7c673cae
FG
5307
5308 // log + wait
5309 mdr->ls = mdlog->get_current_segment();
5310 EUpdate *le = new EUpdate(mdlog, "setlayout");
5311 mdlog->start_entry(le);
5312 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5313 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5314 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5315
b32b8144 5316 mdr->no_early_reply = true;
7c673cae
FG
5317 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5318}
5319
5320// XATTRS
5321
5322int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5323 file_layout_t *layout, bool validate)
5324{
5325 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
5326 try {
5327 if (name == "layout") {
5328 string::iterator begin = value.begin();
5329 string::iterator end = value.end();
5330 keys_and_values<string::iterator> p; // create instance of parser
5331 std::map<string, string> m; // map to receive results
5332 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5333 return -EINVAL;
5334 }
5335 string left(begin, end);
5336 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5337 if (begin != end)
5338 return -EINVAL;
5339 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5340 // Skip validation on each attr; we do it once at the end (to avoid
5341 // rejecting intermediate states if the overall result is ok)
5342 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
5343 osdmap, layout, false);
5344 if (r < 0)
5345 return r;
5346 }
5347 } else if (name == "layout.object_size") {
5348 layout->object_size = boost::lexical_cast<unsigned>(value);
5349 } else if (name == "layout.stripe_unit") {
5350 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5351 } else if (name == "layout.stripe_count") {
5352 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5353 } else if (name == "layout.pool") {
5354 try {
5355 layout->pool_id = boost::lexical_cast<unsigned>(value);
5356 } catch (boost::bad_lexical_cast const&) {
5357 int64_t pool = osdmap.lookup_pg_pool_name(value);
5358 if (pool < 0) {
5359 dout(10) << " unknown pool " << value << dendl;
5360 return -ENOENT;
5361 }
5362 layout->pool_id = pool;
5363 }
5364 } else if (name == "layout.pool_namespace") {
5365 layout->pool_ns = value;
5366 } else {
5367 dout(10) << " unknown layout vxattr " << name << dendl;
5368 return -EINVAL;
5369 }
5370 } catch (boost::bad_lexical_cast const&) {
5371 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5372 return -EINVAL;
5373 }
5374
5375 if (validate && !layout->is_valid()) {
5376 dout(10) << "bad layout" << dendl;
5377 return -EINVAL;
5378 }
5379 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5380 dout(10) << " invalid data pool " << layout->pool_id << dendl;
5381 return -EINVAL;
5382 }
5383 return 0;
5384}
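// From a client these fields are usually driven through virtual xattrs,
// e.g. (standard CephFS usage; the path and pool name are placeholders):
//
//   setfattr -n ceph.file.layout.stripe_unit -v 1048576 somefile
//   setfattr -n ceph.dir.layout.pool -v cephfs_data somedir
//
// Note that "layout.pool" accepts either a numeric pool id or a pool
// name; the name is resolved against the OSDMap above.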
5385
5386int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5387{
5388 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5389 try {
5390 if (name == "quota") {
5391 string::iterator begin = value.begin();
5392 string::iterator end = value.end();
11fdf7f2
TL
5393 if (begin == end) {
5394 // keep quota unchanged. (for create_quota_realm())
5395 return 0;
5396 }
7c673cae
FG
5397 keys_and_values<string::iterator> p; // create instance of parser
5398 std::map<string, string> m; // map to receive results
5399 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5400 return -EINVAL;
5401 }
5402 string left(begin, end);
5403 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5404 if (begin != end)
5405 return -EINVAL;
5406 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5407 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5408 if (r < 0)
5409 return r;
5410 }
5411 } else if (name == "quota.max_bytes") {
5412 int64_t q = boost::lexical_cast<int64_t>(value);
5413 if (q < 0)
5414 return -EINVAL;
5415 quota->max_bytes = q;
5416 } else if (name == "quota.max_files") {
5417 int64_t q = boost::lexical_cast<int64_t>(value);
5418 if (q < 0)
5419 return -EINVAL;
5420 quota->max_files = q;
5421 } else {
5422 dout(10) << " unknown quota vxattr " << name << dendl;
5423 return -EINVAL;
5424 }
5425 } catch (boost::bad_lexical_cast const&) {
5426 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5427 return -EINVAL;
5428 }
5429
5430 if (!quota->is_valid()) {
5431 dout(10) << "bad quota" << dendl;
5432 return -EINVAL;
5433 }
5434 return 0;
5435}
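// Quota vxattrs follow the same key=value scheme; only negative values
// are rejected above, and 0 is the "no limit" value. Example usage (the
// path is a placeholder):
//
//   setfattr -n ceph.quota.max_bytes -v 100000000 somedir
//   setfattr -n ceph.quota.max_files -v 10000 somedir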
5436
11fdf7f2
TL
5437void Server::create_quota_realm(CInode *in)
5438{
5439 dout(10) << __func__ << " " << *in << dendl;
5440
9f95a23c 5441 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
11fdf7f2
TL
5442 req->set_filepath(filepath(in->ino()));
5443 req->set_string2("ceph.quota");
5444 // empty vxattr value
5445 req->set_tid(mds->issue_tid());
5446
5447 mds->send_message_mds(req, in->authority().first);
5448}
5449
7c673cae
FG
5450/*
5451 * Verify that the file layout attribute carried by the client
5452 * is well-formatted.
5453 * Return 0 on success; otherwise this function takes
5454 * responsibility for the passed mdr.
5455 */
5456int Server::check_layout_vxattr(MDRequestRef& mdr,
5457 string name,
5458 string value,
5459 file_layout_t *layout)
5460{
9f95a23c 5461 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
5462 epoch_t epoch;
5463 int r;
5464
5465 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5466 r = parse_layout_vxattr(name, value, osdmap, layout);
5467 epoch = osdmap.get_epoch();
5468 });
5469
5470 if (r == -ENOENT) {
5471
5472 // we don't have the specified pool, make sure our map
5473 // is newer than or as new as the client.
5474 epoch_t req_epoch = req->get_osdmap_epoch();
5475
5476 if (req_epoch > epoch) {
5477
5478 // well, our map is older. consult mds.
5479 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5480
5481 if (!mds->objecter->wait_for_map(req_epoch, fin))
5482 return r; // wait, fin will retry this request later
5483
5484 delete fin;
5485
5486 // now we have at least as new a map as the client, try again.
5487 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5488 r = parse_layout_vxattr(name, value, osdmap, layout);
5489 epoch = osdmap.get_epoch();
5490 });
5491
11fdf7f2 5492 ceph_assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
7c673cae
FG
5493
5494 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5495
5496 // For compatibility with clients running old code, we still need to get
5497 // the latest map. One day, once COMPACT_VERSION of MClientRequest is >= 3,
5498 // this code can be removed.
5499 mdr->waited_for_osdmap = true;
5500 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
5501 mds, new C_MDS_RetryRequest(mdcache, mdr)));
5502 return r;
5503 }
5504 }
5505
5506 if (r < 0) {
5507
5508 if (r == -ENOENT)
5509 r = -EINVAL;
5510
5511 respond_to_request(mdr, r);
5512 return r;
5513 }
5514
5515 // all is well
5516 return 0;
5517}
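// The retry logic above boils down to: if the pool is unknown but the
// client has seen a newer OSDMap epoch, wait until we catch up to that
// epoch and re-parse; if the client predates osdmap epochs in requests
// (req_epoch == 0), wait once for the latest map. Only after that is
// -ENOENT treated as a real -EINVAL.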
5518
9f95a23c 5519void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
7c673cae 5520{
9f95a23c 5521 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
5522 string name(req->get_path2());
5523 bufferlist bl = req->get_data();
5524 string value (bl.c_str(), bl.length());
5525 dout(10) << "handle_set_vxattr " << name
5526 << " val " << value.length()
5527 << " bytes on " << *cur
5528 << dendl;
5529
94b18763 5530 CInode::mempool_inode *pip = nullptr;
7c673cae
FG
5531 string rest;
5532
5533 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5534 return;
5535 }
5536
adb31ebb 5537 bool adjust_realm = false;
7c673cae
FG
5538 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5539 if (!cur->is_dir()) {
5540 respond_to_request(mdr, -EINVAL);
5541 return;
5542 }
5543
9f95a23c
TL
5544 if (!xlock_policylock(mdr, cur, true))
5545 return;
5546
7c673cae
FG
5547 file_layout_t layout;
5548 if (cur->get_projected_inode()->has_layout())
5549 layout = cur->get_projected_inode()->layout;
9f95a23c
TL
5550 else if (mdr->dir_layout != file_layout_t())
5551 layout = mdr->dir_layout;
7c673cae
FG
5552 else
5553 layout = mdcache->default_file_layout;
5554
5555 rest = name.substr(name.find("layout"));
5556 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5557 return;
5558
94b18763
FG
5559 auto &pi = cur->project_inode();
5560 pi.inode.layout = layout;
b32b8144 5561 mdr->no_early_reply = true;
94b18763 5562 pip = &pi.inode;
7c673cae
FG
5563 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5564 if (!cur->is_file()) {
5565 respond_to_request(mdr, -EINVAL);
5566 return;
5567 }
5568 if (cur->get_projected_inode()->size ||
5569 cur->get_projected_inode()->truncate_seq > 1) {
5570 respond_to_request(mdr, -ENOTEMPTY);
5571 return;
5572 }
5573 file_layout_t layout = cur->get_projected_inode()->layout;
5574 rest = name.substr(name.find("layout"));
5575 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5576 return;
5577
9f95a23c 5578 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5579 lov.add_xlock(&cur->filelock);
5580 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5581 return;
5582
94b18763
FG
5583 auto &pi = cur->project_inode();
5584 int64_t old_pool = pi.inode.layout.pool_id;
5585 pi.inode.add_old_pool(old_pool);
5586 pi.inode.layout = layout;
94b18763 5587 pip = &pi.inode;
7c673cae
FG
5588 } else if (name.compare(0, 10, "ceph.quota") == 0) {
5589 if (!cur->is_dir() || cur->is_root()) {
5590 respond_to_request(mdr, -EINVAL);
5591 return;
5592 }
5593
5594 quota_info_t quota = cur->get_projected_inode()->quota;
5595
5596 rest = name.substr(name.find("quota"));
5597 int r = parse_quota_vxattr(rest, value, &quota);
5598 if (r < 0) {
5599 respond_to_request(mdr, r);
5600 return;
5601 }
5602
9f95a23c 5603 if (quota.is_enable() && !cur->get_projected_srnode())
adb31ebb
TL
5604 adjust_realm = true;
5605
5606 if (!xlock_policylock(mdr, cur, false, adjust_realm))
5607 return;
11fdf7f2 5608
adb31ebb
TL
5609 if (cur->get_projected_inode()->quota == quota) {
5610 respond_to_request(mdr, 0);
7c673cae 5611 return;
adb31ebb 5612 }
7c673cae 5613
adb31ebb 5614 auto &pi = cur->project_inode(false, adjust_realm);
94b18763
FG
5615 pi.inode.quota = quota;
5616
adb31ebb
TL
5617 if (adjust_realm)
5618 pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
5619
b32b8144 5620 mdr->no_early_reply = true;
94b18763 5621 pip = &pi.inode;
28e407b8
AA
5622
5623 client_t exclude_ct = mdr->get_client();
a8e16298 5624 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
adb31ebb
TL
5625 } else if (name == "ceph.dir.subvolume"sv) {
5626 if (!cur->is_dir()) {
5627 respond_to_request(mdr, -EINVAL);
5628 return;
5629 }
5630
5631 bool val;
5632 try {
5633 val = boost::lexical_cast<bool>(value);
5634 } catch (boost::bad_lexical_cast const&) {
5635 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
5636 respond_to_request(mdr, -EINVAL);
5637 return;
5638 }
5639
5640 if (!xlock_policylock(mdr, cur, false, true))
5641 return;
5642
5643 SnapRealm *realm = cur->find_snaprealm();
5644 if (val) {
5645 inodeno_t subvol_ino = realm->get_subvolume_ino();
5646 // can't create subvolume inside another subvolume
5647 if (subvol_ino && subvol_ino != cur->ino()) {
5648 respond_to_request(mdr, -EINVAL);
5649 return;
5650 }
5651 }
5652
5653 const auto srnode = cur->get_projected_srnode();
5654 if (val == (srnode && srnode->is_subvolume())) {
5655 respond_to_request(mdr, 0);
5656 return;
5657 }
5658
5659 auto& pi = cur->project_inode(false, true);
5660 if (!srnode)
5661 pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
5662 if (val)
5663 pi.snapnode->mark_subvolume();
5664 else
5665 pi.snapnode->clear_subvolume();
5666
5667 mdr->no_early_reply = true;
5668 pip = &pi.inode;
5669 adjust_realm = true;
f6b5b4d7 5670 } else if (name == "ceph.dir.pin"sv) {
7c673cae
FG
5671 if (!cur->is_dir() || cur->is_root()) {
5672 respond_to_request(mdr, -EINVAL);
5673 return;
5674 }
5675
5676 mds_rank_t rank;
5677 try {
5678 rank = boost::lexical_cast<mds_rank_t>(value);
5679 if (rank < 0) rank = MDS_RANK_NONE;
5680 } catch (boost::bad_lexical_cast const&) {
5681 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5682 respond_to_request(mdr, -EINVAL);
5683 return;
5684 }
5685
9f95a23c 5686 if (!xlock_policylock(mdr, cur))
7c673cae
FG
5687 return;
5688
94b18763 5689 auto &pi = cur->project_inode();
7c673cae 5690 cur->set_export_pin(rank);
94b18763 5691 pip = &pi.inode;
f6b5b4d7
TL
5692 } else if (name == "ceph.dir.pin.random"sv) {
5693 if (!cur->is_dir() || cur->is_root()) {
5694 respond_to_request(mdr, -EINVAL);
5695 return;
5696 }
5697
5698 double val;
5699 try {
5700 val = boost::lexical_cast<double>(value);
5701 } catch (boost::bad_lexical_cast const&) {
5702 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
5703 respond_to_request(mdr, -EINVAL);
5704 return;
5705 }
5706
5707 if (val < 0.0 || 1.0 < val) {
5708 respond_to_request(mdr, -EDOM);
5709 return;
5710 } else if (mdcache->export_ephemeral_random_max < val) {
5711 respond_to_request(mdr, -EINVAL);
5712 return;
5713 }
5714
5715 if (!xlock_policylock(mdr, cur))
5716 return;
5717
5718 auto &pi = cur->project_inode();
5719 cur->setxattr_ephemeral_rand(val);
5720 pip = &pi.inode;
5721 } else if (name == "ceph.dir.pin.distributed"sv) {
5722 if (!cur->is_dir() || cur->is_root()) {
5723 respond_to_request(mdr, -EINVAL);
5724 return;
5725 }
5726
5727 bool val;
5728 try {
5729 val = boost::lexical_cast<bool>(value);
5730 } catch (boost::bad_lexical_cast const&) {
5731 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
5732 respond_to_request(mdr, -EINVAL);
5733 return;
5734 }
5735
5736 if (!xlock_policylock(mdr, cur))
5737 return;
5738
5739 auto &pi = cur->project_inode();
5740 cur->setxattr_ephemeral_dist(val);
5741 pip = &pi.inode;
7c673cae
FG
5742 } else {
5743 dout(10) << " unknown vxattr " << name << dendl;
5744 respond_to_request(mdr, -EINVAL);
5745 return;
5746 }
5747
94b18763 5748 pip->change_attr++;
91327a77
AA
5749 pip->ctime = mdr->get_op_stamp();
5750 if (mdr->get_op_stamp() > pip->rstat.rctime)
5751 pip->rstat.rctime = mdr->get_op_stamp();
94b18763 5752 pip->version = cur->pre_dirty();
7c673cae 5753 if (cur->is_file())
94b18763 5754 pip->update_backtrace();
7c673cae
FG
5755
5756 // log + wait
5757 mdr->ls = mdlog->get_current_segment();
5758 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
5759 mdlog->start_entry(le);
5760 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5761 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5762 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5763
11fdf7f2 5764 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
adb31ebb 5765 false, false, adjust_realm));
7c673cae
FG
5766 return;
5767}
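// The vxattrs recognized by handle_set_vxattr above, for reference:
//
//   ceph.dir.layout[.*], ceph.file.layout[.*]     - file layout policy
//   ceph.quota[.max_bytes|.max_files]             - directory quotas
//   ceph.dir.subvolume                            - mark a subvolume root
//   ceph.dir.pin                                  - export pin to an MDS rank
//   ceph.dir.pin.random, ceph.dir.pin.distributed - ephemeral pin policies
//
// Anything else falls through to -EINVAL.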
5768
9f95a23c 5769void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
7c673cae 5770{
9f95a23c 5771 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae
FG
5772 string name(req->get_path2());
5773
5774 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
5775
5776 if (name == "ceph.dir.layout") {
5777 if (!cur->is_dir()) {
5778 respond_to_request(mdr, -ENODATA);
5779 return;
5780 }
5781 if (cur->is_root()) {
5782 dout(10) << "can't remove layout policy on the root directory" << dendl;
5783 respond_to_request(mdr, -EINVAL);
5784 return;
5785 }
5786
5787 if (!cur->get_projected_inode()->has_layout()) {
5788 respond_to_request(mdr, -ENODATA);
5789 return;
5790 }
5791
9f95a23c 5792 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5793 lov.add_xlock(&cur->policylock);
5794 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5795 return;
5796
94b18763
FG
5797 auto &pi = cur->project_inode();
5798 pi.inode.clear_layout();
5799 pi.inode.version = cur->pre_dirty();
7c673cae
FG
5800
5801 // log + wait
5802 mdr->ls = mdlog->get_current_segment();
5803 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
5804 mdlog->start_entry(le);
5805 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5806 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5807 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5808
b32b8144 5809 mdr->no_early_reply = true;
7c673cae
FG
5810 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5811 return;
5812 } else if (name == "ceph.dir.layout.pool_namespace"
5813 || name == "ceph.file.layout.pool_namespace") {
5814 // Namespace is the only layout field that has a meaningful
5815 // null/none value (an empty string means the default layout). Removing it
5816 // is equivalent to a setxattr with an empty string: pass through the empty
5817 // payload of the rmxattr request to do this.
9f95a23c 5818 handle_set_vxattr(mdr, cur);
7c673cae
FG
5819 return;
5820 }
5821
5822 respond_to_request(mdr, -ENODATA);
5823}
5824
5825class C_MDS_inode_xattr_update_finish : public ServerLogContext {
5826 CInode *in;
5827public:
5828
5829 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
5830 ServerLogContext(s, r), in(i) { }
5831 void finish(int r) override {
11fdf7f2 5832 ceph_assert(r == 0);
7c673cae
FG
5833
5834 // apply
5835 in->pop_and_dirty_projected_inode(mdr->ls);
5836
5837 mdr->apply();
5838
11fdf7f2 5839 get_mds()->balancer->hit_inode(in, META_POP_IWR);
7c673cae
FG
5840
5841 server->respond_to_request(mdr, 0);
5842 }
5843};
5844
5845void Server::handle_client_setxattr(MDRequestRef& mdr)
5846{
9f95a23c 5847 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 5848 string name(req->get_path2());
7c673cae 5849
9f95a23c
TL
5850 // magic ceph.* namespace?
5851 if (name.compare(0, 5, "ceph.") == 0) {
5852 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5853 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5854 if (!cur)
5855 return;
5856
5857 handle_set_vxattr(mdr, cur);
5858 return;
5859 }
5860
5861 CInode *cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
5862 if (!cur)
5863 return;
5864
5865 if (mdr->snapid != CEPH_NOSNAP) {
5866 respond_to_request(mdr, -EROFS);
5867 return;
5868 }
5869
5870 int flags = req->head.args.setxattr.flags;
5871
9f95a23c 5872 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5873 lov.add_xlock(&cur->xattrlock);
5874 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5875 return;
5876
5877 if (!check_access(mdr, cur, MAY_WRITE))
5878 return;
5879
94b18763 5880 auto pxattrs = cur->get_projected_xattrs();
7c673cae
FG
5881 size_t len = req->get_data().length();
5882 size_t inc = len + name.length();
5883
5884 // check xattrs kv pairs size
5885 size_t cur_xattrs_size = 0;
5886 for (const auto& p : *pxattrs) {
11fdf7f2 5887 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
7c673cae
FG
5888 continue;
5889 }
5890 cur_xattrs_size += p.first.length() + p.second.length();
5891 }
5892
11fdf7f2 5893 if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
7c673cae
FG
5894 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
5895 << cur_xattrs_size << ", inc " << inc << dendl;
5896 respond_to_request(mdr, -ENOSPC);
5897 return;
5898 }
5899
11fdf7f2 5900 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(name))) {
7c673cae
FG
5901 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
5902 respond_to_request(mdr, -EEXIST);
5903 return;
5904 }
11fdf7f2 5905 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(name))) {
7c673cae
FG
5906 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
5907 respond_to_request(mdr, -ENODATA);
5908 return;
5909 }
5910
5911 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
5912
5913 // project update
94b18763
FG
5914 auto &pi = cur->project_inode(true);
5915 pi.inode.version = cur->pre_dirty();
91327a77
AA
5916 pi.inode.ctime = mdr->get_op_stamp();
5917 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5918 pi.inode.rstat.rctime = mdr->get_op_stamp();
94b18763
FG
5919 pi.inode.change_attr++;
5920 pi.inode.xattr_version++;
5921 auto &px = *pi.xattrs;
5922 if ((flags & CEPH_XATTR_REMOVE)) {
11fdf7f2 5923 px.erase(mempool::mds_co::string(name));
94b18763
FG
5924 } else {
5925 bufferptr b = buffer::create(len);
7c673cae 5926 if (len)
9f95a23c 5927 req->get_data().begin().copy(len, b.c_str());
11fdf7f2 5928 auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(name)), std::forward_as_tuple(b));
94b18763
FG
5929 if (!em.second)
5930 em.first->second = b;
7c673cae
FG
5931 }
5932
5933 // log + wait
5934 mdr->ls = mdlog->get_current_segment();
5935 EUpdate *le = new EUpdate(mdlog, "setxattr");
5936 mdlog->start_entry(le);
5937 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5938 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5939 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5940
5941 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5942}
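// The setxattr flag handling above mirrors setxattr(2):
//
//   CEPH_XATTR_CREATE  -> -EEXIST if the name is already present
//   CEPH_XATTR_REPLACE -> -ENODATA if the name is absent
//   CEPH_XATTR_REMOVE  -> erase the key instead of storing a value
//
// and the size check caps the total key+value bytes per inode at
// mds_max_xattr_pairs_size.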
5943
5944void Server::handle_client_removexattr(MDRequestRef& mdr)
5945{
9f95a23c 5946 const cref_t<MClientRequest> &req = mdr->client_request;
94b18763 5947 std::string name(req->get_path2());
11fdf7f2 5948
9f95a23c
TL
5949 if (name.compare(0, 5, "ceph.") == 0) {
5950 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5951 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5952 if (!cur)
5953 return;
5954
5955 handle_remove_vxattr(mdr, cur);
5956 return;
5957 }
5958
5959 CInode* cur = rdlock_path_pin_ref(mdr, true);
7c673cae
FG
5960 if (!cur)
5961 return;
5962
5963 if (mdr->snapid != CEPH_NOSNAP) {
5964 respond_to_request(mdr, -EROFS);
5965 return;
5966 }
5967
9f95a23c 5968 MutationImpl::LockOpVec lov;
11fdf7f2
TL
5969 lov.add_xlock(&cur->xattrlock);
5970 if (!mds->locker->acquire_locks(mdr, lov))
7c673cae
FG
5971 return;
5972
94b18763 5973 auto pxattrs = cur->get_projected_xattrs();
11fdf7f2 5974 if (pxattrs->count(mempool::mds_co::string(name)) == 0) {
7c673cae
FG
5975 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
5976 respond_to_request(mdr, -ENODATA);
5977 return;
5978 }
5979
5980 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
5981
5982 // project update
94b18763
FG
5983 auto &pi = cur->project_inode(true);
5984 auto &px = *pi.xattrs;
5985 pi.inode.version = cur->pre_dirty();
91327a77
AA
5986 pi.inode.ctime = mdr->get_op_stamp();
5987 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5988 pi.inode.rstat.rctime = mdr->get_op_stamp();
94b18763
FG
5989 pi.inode.change_attr++;
5990 pi.inode.xattr_version++;
11fdf7f2 5991 px.erase(mempool::mds_co::string(name));
7c673cae
FG
5992
5993 // log + wait
5994 mdr->ls = mdlog->get_current_segment();
5995 EUpdate *le = new EUpdate(mdlog, "removexattr");
5996 mdlog->start_entry(le);
5997 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5998 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5999 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6000
6001 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6002}
6003
6004
6005// =================================================================
6006// DIRECTORY and NAMESPACE OPS
6007
6008
6009// ------------------------------------------------
6010
6011// MKNOD
6012
6013class C_MDS_mknod_finish : public ServerLogContext {
6014 CDentry *dn;
6015 CInode *newi;
6016public:
6017 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
6018 ServerLogContext(s, r), dn(d), newi(ni) {}
6019 void finish(int r) override {
11fdf7f2 6020 ceph_assert(r == 0);
7c673cae
FG
6021
6022 // link the inode
6023 dn->pop_projected_linkage();
6024
6025 // be a bit hacky with the inode version, here.. we decrement it
6026 // just to keep mark_dirty() happy. (we didn't bother projecting
6027 // a new version of the inode since it's just been created)
6028 newi->inode.version--;
6029 newi->mark_dirty(newi->inode.version + 1, mdr->ls);
28e407b8 6030 newi->mark_dirty_parent(mdr->ls, true);
7c673cae
FG
6031
6032 // mkdir?
6033 if (newi->inode.is_dir()) {
6034 CDir *dir = newi->get_dirfrag(frag_t());
11fdf7f2 6035 ceph_assert(dir);
7c673cae
FG
6036 dir->fnode.version--;
6037 dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
6038 dir->mark_new(mdr->ls);
6039 }
6040
6041 mdr->apply();
6042
6043 MDRequestRef null_ref;
6044 get_mds()->mdcache->send_dentry_link(dn, null_ref);
6045
f6b5b4d7 6046 if (newi->inode.is_file()) {
7c673cae 6047 get_mds()->locker->share_inode_max_size(newi);
f6b5b4d7
TL
6048 } else if (newi->inode.is_dir()) {
6049 // We do this now so that the linkages on the new directory are stable.
6050 newi->maybe_ephemeral_dist();
6051 newi->maybe_ephemeral_rand(true);
6052 }
7c673cae
FG
6053
6054 // hit pop
11fdf7f2 6055 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
7c673cae
FG
6056
6057 // reply
6058 server->respond_to_request(mdr, 0);
6059 }
6060};
6061
6062
6063void Server::handle_client_mknod(MDRequestRef& mdr)
6064{
9f95a23c 6065 const cref_t<MClientRequest> &req = mdr->client_request;
7c673cae 6066 client_t client = mdr->get_client();
9f95a23c
TL
6067
6068 unsigned mode = req->head.args.mknod.mode;
6069 if ((mode & S_IFMT) == 0)
6070 mode |= S_IFREG;
6071
6072 mdr->disable_lock_cache();
6073 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
6074 if (!dn)
7c673cae
FG
6075 return;
6076
9f95a23c
TL
6077 CDir *dir = dn->get_dir();
6078 CInode *diri = dir->get_inode();
7c673cae
FG
6079 if (!check_access(mdr, diri, MAY_WRITE))
6080 return;
7c673cae
FG
6081 if (!check_fragment_space(mdr, dn->get_dir()))
6082 return;
6083
7c673cae
FG
6084 // set layout
6085 file_layout_t layout;
9f95a23c
TL
6086 if (mdr->dir_layout != file_layout_t())
6087 layout = mdr->dir_layout;
7c673cae
FG
6088 else
6089 layout = mdcache->default_file_layout;
6090
11fdf7f2
TL
6091 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6092 ceph_assert(newi);
7c673cae
FG
6093
6094 dn->push_projected_linkage(newi);
6095
6096 newi->inode.rdev = req->head.args.mknod.rdev;
6097 newi->inode.version = dn->pre_dirty();
6098 newi->inode.rstat.rfiles = 1;
6099 if (layout.pool_id != mdcache->default_file_layout.pool_id)
6100 newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
6101 newi->inode.update_backtrace();
6102
11fdf7f2
TL
6103 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6104 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6105 ceph_assert(follows >= realm->get_newest_seq());
6106
7c673cae
FG
6107 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6108 // want to write to it (e.g., if they are reexporting NFS)
6109 if (S_ISREG(newi->inode.mode)) {
7c673cae
FG
6110 // issue a cap on the file
6111 int cmode = CEPH_FILE_MODE_RDWR;
9f95a23c 6112 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
7c673cae
FG
6113 if (cap) {
6114 cap->set_wanted(0);
6115
6116 // put locks in excl mode
6117 newi->filelock.set_state(LOCK_EXCL);
6118 newi->authlock.set_state(LOCK_EXCL);
6119 newi->xattrlock.set_state(LOCK_EXCL);
a8e16298
TL
6120
6121 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
6122 newi->inode.client_ranges[client].range.first = 0;
9f95a23c 6123 newi->inode.client_ranges[client].range.last = newi->inode.layout.stripe_unit;
a8e16298 6124 newi->inode.client_ranges[client].follows = follows;
f91f0fd5 6125 newi->mark_clientwriteable();
a8e16298 6126 cap->mark_clientwriteable();
7c673cae
FG
6127 }
6128 }
6129
11fdf7f2 6130 ceph_assert(dn->first == follows + 1);
7c673cae
FG
6131 newi->first = dn->first;
6132
6133 dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
6134
6135 // prepare finisher
6136 mdr->ls = mdlog->get_current_segment();
6137 EUpdate *le = new EUpdate(mdlog, "mknod");
6138 mdlog->start_entry(le);
6139 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6140 journal_allocated_inos(mdr, &le->metablob);
6141
6142 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6143 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6144 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6145
6146 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
92f5a8d4 6147 mds->balancer->maybe_fragment(dn->get_dir(), false);
7c673cae
FG
6148}
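// Rationale for the cap issued above: a regular file created via mknod is
// assumed to be written next (e.g. an NFS reexport), so the client gets
// RDWR caps immediately, with filelock/authlock/xattrlock in LOCK_EXCL
// and an initial writable client_range, avoiding an extra round trip.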
6149
6150
6151
6152// MKDIR
6153/* This function takes responsibility for the passed mdr*/
6154void Server::handle_client_mkdir(MDRequestRef& mdr)
6155{
9f95a23c 6156 const cref_t<MClientRequest> &req = mdr->client_request;
91327a77 6157
9f95a23c
TL
6158 mdr->disable_lock_cache();
6159 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6160 if (!dn)
7c673cae 6161 return;
9f95a23c 6162
7c673cae
FG
6163 CDir *dir = dn->get_dir();
6164 CInode *diri = dir->get_inode();
7c673cae
FG
6165
6166 // mkdir check access
6167 if (!check_access(mdr, diri, MAY_WRITE))
6168 return;
6169
6170 if (!check_fragment_space(mdr, dir))
6171 return;
6172
6173 // new inode
7c673cae
FG
6174 unsigned mode = req->head.args.mkdir.mode;
6175 mode &= ~S_IFMT;
6176 mode |= S_IFDIR;
9f95a23c 6177 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
11fdf7f2 6178 ceph_assert(newi);
7c673cae
FG
6179
6180 // it's a directory.
6181 dn->push_projected_linkage(newi);
6182
6183 newi->inode.version = dn->pre_dirty();
6184 newi->inode.rstat.rsubdirs = 1;
6185 newi->inode.update_backtrace();
6186
11fdf7f2
TL
6187 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6188 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6189 ceph_assert(follows >= realm->get_newest_seq());
6190
7c673cae 6191 dout(12) << " follows " << follows << dendl;
11fdf7f2 6192 ceph_assert(dn->first == follows + 1);
7c673cae
FG
6193 newi->first = dn->first;
6194
6195 // ...and that new dir is empty.
6196 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
6197 newdir->state_set(CDir::STATE_CREATING);
6198 newdir->mark_complete();
6199 newdir->fnode.version = newdir->pre_dirty();
6200
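  // informal note: frag_t() is the root fragment spanning the whole directory;
  // pre-marking it CREATING + complete lets add_new_dir() below journal the
  // new dir as dirty AND complete AND new in a single event.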
  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "mkdir");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true);
  le->metablob.add_new_dir(newdir); // dirty AND complete AND new

  // issue a cap on the directory
  int cmode = CEPH_FILE_MODE_RDWR;
  Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
  if (cap) {
    cap->set_wanted(0);

    // put locks in excl mode
    newi->filelock.set_state(LOCK_EXCL);
    newi->authlock.set_state(LOCK_EXCL);
    newi->xattrlock.set_state(LOCK_EXCL);
  }

  // make sure this inode gets into the journal
  le->metablob.add_opened_ino(newi->ino());

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));

  // We hit_dir (via hit_inode) in our finish callback, but by then we might
  // have overshot the split size (multiple mkdir in flight), so here is
  // an early chance to split the dir if this mkdir makes it oversized.
  mds->balancer->maybe_fragment(dir, false);
}


// SYMLINK

void Server::handle_client_symlink(MDRequestRef& mdr)
{
  mdr->disable_lock_cache();
  CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
  if (!dn)
    return;

  CDir *dir = dn->get_dir();
  CInode *diri = dir->get_inode();

  if (!check_access(mdr, diri, MAY_WRITE))
    return;
  if (!check_fragment_space(mdr, dir))
    return;

  const cref_t<MClientRequest> &req = mdr->client_request;

  unsigned mode = S_IFLNK | 0777;
  CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
  ceph_assert(newi);

  // it's a symlink
  dn->push_projected_linkage(newi);

  newi->symlink = req->get_path2();
  newi->inode.size = newi->symlink.length();
  newi->inode.rstat.rbytes = newi->inode.size;
  newi->inode.rstat.rfiles = 1;
  newi->inode.version = dn->pre_dirty();
  newi->inode.update_backtrace();

  newi->first = dn->first;

  // prepare finisher
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "symlink");
  mdlog->start_entry(le);
  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  journal_allocated_inos(mdr, &le->metablob);
  mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
  le->metablob.add_primary_dentry(dn, newi, true, true);

  journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
  mds->balancer->maybe_fragment(dir, false);
}




// LINK

void Server::handle_client_link(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  dout(7) << "handle_client_link " << req->get_filepath()
          << " to " << req->get_filepath2()
          << dendl;

  mdr->disable_lock_cache();

  CDentry *destdn;
  CInode *targeti;

  if (req->get_filepath2().depth() == 0) {
    targeti = mdcache->get_inode(req->get_filepath2().get_ino());
    if (!targeti) {
      dout(10) << "ESTALE on path2, attempting recovery" << dendl;
      mdcache->find_ino_peers(req->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr));
      return;
    }
    mdr->pin(targeti);

    if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
      CDentry *pdn = targeti->get_projected_parent_dn();
      if (!pdn) {
        dout(7) << "target has no parent dn, failing..." << dendl;
        respond_to_request(mdr, -EINVAL);
        return;
      }
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
        return;
      mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
    }

    destdn = rdlock_path_xlock_dentry(mdr, false);
    if (!destdn)
      return;

  } else {
    auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
    destdn = ret.first;
    if (!destdn)
      return;

    if (!destdn->get_projected_linkage()->is_null()) {
      respond_to_request(mdr, -EEXIST);
      return;
    }

    targeti = ret.second->get_projected_linkage()->get_inode();
  }

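  // either way, we now have destdn (the new, still-null dentry) and targeti
  // (the existing inode it should point at).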
  if (targeti->is_dir()) {
    dout(7) << "target is a dir, failing..." << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  CDir *dir = destdn->get_dir();
  dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
  dout(7) << "target is " << *targeti << dendl;

  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&targeti->snaplock);
    lov.add_xlock(&targeti->linklock);

    if (!mds->locker->acquire_locks(mdr, lov))
      return;

    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (targeti->get_projected_inode()->nlink == 0) {
    dout(7) << "target has no link, failing..." << dendl;
    respond_to_request(mdr, -ENOENT);
    return;
  }

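  // access and space checks run only before any slave has witnessed the op;
  // once witnessed, bailing out would require a rollback, so (presumably for
  // that reason) the checks are not repeated on later passes.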
  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, targeti, MAY_WRITE))
      return;

    if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
      return;

    if (!check_fragment_space(mdr, dir))
      return;
  }

  CInode* target_pin = targeti->get_projected_parent_dir()->inode;
  SnapRealm *target_realm = target_pin->find_snaprealm();
  if (target_pin != dir->inode &&
      target_realm->get_subvolume_ino() !=
      dir->inode->find_snaprealm()->get_subvolume_ino()) {
    dout(7) << "target is in different subvolume, failing..." << dendl;
    respond_to_request(mdr, -EXDEV);
    return;
  }

  // go!
  ceph_assert(g_conf()->mds_kill_link_at != 1);

  // local or remote?
  if (targeti->is_auth())
    _link_local(mdr, destdn, targeti, target_realm);
  else
    _link_remote(mdr, true, destdn, targeti);
  mds->balancer->maybe_fragment(dir, false);
}


class C_MDS_link_local_finish : public ServerLogContext {
  CDentry *dn;
  CInode *targeti;
  version_t dnpv;
  version_t tipv;
  bool adjust_realm;
public:
  C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
                          version_t dnpv_, version_t tipv_, bool ar) :
    ServerLogContext(s, r), dn(d), targeti(ti),
    dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
  }
};


void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
{
  dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;

  mdr->ls = mdlog->get_current_segment();

  // predirty NEW dentry
  version_t dnpv = dn->pre_dirty();
  version_t tipv = targeti->pre_dirty();

  // project inode update
  auto &pi = targeti->project_inode();
  pi.inode.nlink++;
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.change_attr++;
  pi.inode.version = tipv;

  bool adjust_realm = false;
  if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
    sr_t *newsnap = targeti->project_snaprealm();
    targeti->mark_snaprealm_global(newsnap);
    targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
    adjust_realm = true;
  }

  // log + wait
  EUpdate *le = new EUpdate(mdlog, "link_local");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
  mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY);           // targeti
  le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type());                   // new remote
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);

  // do this after predirty_*, to avoid funky extra dnl arg
  dn->push_projected_linkage(targeti->ino(), targeti->d_type());

  journal_and_reply(mdr, targeti, dn, le,
                    new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
}

void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
                                version_t dnpv, version_t tipv, bool adjust_realm)
{
  dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;

  // link and unlock the NEW dentry
  CDentry::linkage_t *dnl = dn->pop_projected_linkage();
  if (!dnl->get_inode())
    dn->link_remote(dnl, targeti);
  dn->mark_dirty(dnpv, mdr->ls);

  // target inode
  targeti->pop_and_dirty_projected_inode(mdr->ls);

  mdr->apply();

  MDRequestRef null_ref;
  mdcache->send_dentry_link(dn, null_ref);

  if (adjust_realm) {
    int op = CEPH_SNAP_OP_SPLIT;
    mds->mdcache->send_snap_update(targeti, 0, op);
    mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
  }

  // bump target popularity
  mds->balancer->hit_inode(targeti, META_POP_IWR);
  mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);

  // reply
  respond_to_request(mdr, 0);
}


// link / unlink remote

class C_MDS_link_remote_finish : public ServerLogContext {
  bool inc;
  CDentry *dn;
  CInode *targeti;
  version_t dpv;
public:
  C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
    ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
    dpv(d->get_projected_version()) {}
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
  }
};

void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
{
  dout(10) << "_link_remote "
           << (inc ? "link ":"unlink ")
           << *dn << " to " << *targeti << dendl;

  // 1. send LinkPrepare to dest (journal nlink++ prepare)
  mds_rank_t linkauth = targeti->authority().first;
  if (mdr->more()->witnessed.count(linkauth) == 0) {
    if (mds->is_cluster_degraded() &&
        !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
      dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
      if (mdr->more()->waiting_on_slave.empty())
        mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
      return;
    }

    dout(10) << " targeti auth must prepare nlink++/--" << dendl;
    int op;
    if (inc)
      op = MMDSSlaveRequest::OP_LINKPREP;
    else
      op = MMDSSlaveRequest::OP_UNLINKPREP;
    auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, op);
    targeti->set_object_info(req->get_object_info());
    req->op_stamp = mdr->get_op_stamp();
    if (auto& desti_srnode = mdr->more()->desti_srnode)
      encode(*desti_srnode, req->desti_snapbl);
    mds->send_message_mds(req, linkauth);

    ceph_assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
    mdr->more()->waiting_on_slave.insert(linkauth);
    return;
  }
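  // past this point the slave (targeti's auth) has journaled its PREPARE and
  // acked; we are the master of what is loosely a two-phase commit over the
  // mdlog and must eventually hand the slave either a commit or a rollback.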
  dout(10) << " targeti auth has prepared nlink++/--" << dendl;

  ceph_assert(g_conf()->mds_kill_link_at != 2);

  if (auto& desti_srnode = mdr->more()->desti_srnode) {
    delete desti_srnode;
    desti_srnode = NULL;
  }

  mdr->set_mds_stamp(ceph_clock_now());

  // add to event
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
  mdlog->start_entry(le);
  le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
  if (!mdr->more()->witnessed.empty()) {
    dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
    le->reqid = mdr->reqid;
    le->had_slaves = true;
    mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
  }

  if (inc) {
    dn->pre_dirty();
    mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
    le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
    dn->push_projected_linkage(targeti->ino(), targeti->d_type());
  } else {
    dn->pre_dirty();
    mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
    mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
    le->metablob.add_null_dentry(dn, true);
    dn->push_projected_linkage();
  }

  journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
                    new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
}

void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
                                 CDentry *dn, CInode *targeti,
                                 version_t dpv)
{
  dout(10) << "_link_remote_finish "
           << (inc ? "link ":"unlink ")
           << *dn << " to " << *targeti << dendl;

  ceph_assert(g_conf()->mds_kill_link_at != 3);

  if (!mdr->more()->witnessed.empty())
    mdcache->logged_master_update(mdr->reqid);

  if (inc) {
    // link the new dentry
    CDentry::linkage_t *dnl = dn->pop_projected_linkage();
    if (!dnl->get_inode())
      dn->link_remote(dnl, targeti);
    dn->mark_dirty(dpv, mdr->ls);
  } else {
    // unlink main dentry
    dn->get_dir()->unlink_inode(dn);
    dn->pop_projected_linkage();
    dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
  }

  mdr->apply();

  MDRequestRef null_ref;
  if (inc)
    mdcache->send_dentry_link(dn, null_ref);
  else
    mdcache->send_dentry_unlink(dn, NULL, null_ref);

  // bump target popularity
  mds->balancer->hit_inode(targeti, META_POP_IWR);
  mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);

  // reply
  respond_to_request(mdr, 0);

  if (!inc)
    // removing a new dn?
    dn->get_dir()->try_remove_unlinked_dn(dn);
}


// remote linking/unlinking

class C_MDS_SlaveLinkPrep : public ServerLogContext {
  CInode *targeti;
  bool adjust_realm;
public:
  C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
    ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_logged_slave_link(mdr, targeti, adjust_realm);
  }
};

class C_MDS_SlaveLinkCommit : public ServerContext {
  MDRequestRef mdr;
  CInode *targeti;
public:
  C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
    ServerContext(s), mdr(r), targeti(t) { }
  void finish(int r) override {
    server->_commit_slave_link(mdr, r, targeti);
  }
};

void Server::handle_slave_link_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_link_prep " << *mdr
           << " on " << mdr->slave_request->get_object_info()
           << dendl;

  ceph_assert(g_conf()->mds_kill_link_at != 4);

  CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
  ceph_assert(targeti);
  dout(10) << "targeti " << *targeti << dendl;
  CDentry *dn = targeti->get_parent_dn();
  CDentry::linkage_t *dnl = dn->get_linkage();
  ceph_assert(dnl->is_primary());

  mdr->set_op_stamp(mdr->slave_request->op_stamp);

  mdr->auth_pin(targeti);

  //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
  ceph_assert(g_conf()->mds_kill_link_at != 5);

  // journal it
  mdr->ls = mdlog->get_current_segment();
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
                                      ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
  mdlog->start_entry(le);

  auto &pi = dnl->get_inode()->project_inode();

  // update journaled target inode
  bool inc;
  bool adjust_realm = false;
  bool realm_projected = false;
  if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
    inc = true;
    pi.inode.nlink++;

    CDentry *target_pdn = targeti->get_projected_parent_dn();
    SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
    if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
      sr_t *newsnap = targeti->project_snaprealm();
      targeti->mark_snaprealm_global(newsnap);
      targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
      adjust_realm = true;
      realm_projected = true;
    }
  } else {
    inc = false;
    pi.inode.nlink--;
    if (targeti->is_projected_snaprealm_global()) {
      ceph_assert(mdr->slave_request->desti_snapbl.length());
      auto p = mdr->slave_request->desti_snapbl.cbegin();

      sr_t *newsnap = targeti->project_snaprealm();
      decode(*newsnap, p);

      if (pi.inode.nlink == 0)
        ceph_assert(!newsnap->is_parent_global());

      realm_projected = true;
    } else {
      ceph_assert(mdr->slave_request->desti_snapbl.length() == 0);
    }
  }

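  // capture enough pre-state to undo the nlink change if the master aborts;
  // this rollback blob is embedded in the ESlaveUpdate PREPARE event below.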
  link_rollback rollback;
  rollback.reqid = mdr->reqid;
  rollback.ino = targeti->ino();
  rollback.old_ctime = targeti->inode.ctime;   // we hold versionlock xlock; no concurrent projections
  const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
  rollback.old_dir_mtime = pf->fragstat.mtime;
  rollback.old_dir_rctime = pf->rstat.rctime;
  rollback.was_inc = inc;
  if (realm_projected) {
    if (targeti->snaprealm) {
      encode(true, rollback.snapbl);
      targeti->encode_snap_blob(rollback.snapbl);
    } else {
      encode(false, rollback.snapbl);
    }
  }
  encode(rollback, le->rollback);
  mdr->more()->rollback_bl = le->rollback;

  pi.inode.ctime = mdr->get_op_stamp();
  pi.inode.version = targeti->pre_dirty();

  dout(10) << " projected inode " << pi.inode.ino << " v " << pi.inode.version << dendl;

  // commit case
  mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
  mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
  mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);

  // set up commit waiter
  mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);

  mdr->more()->slave_update_journaled = true;
  submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, adjust_realm),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
{
  dout(10) << "_logged_slave_link " << *mdr
           << " " << *targeti << dendl;

  ceph_assert(g_conf()->mds_kill_link_at != 6);

  // update the target
  targeti->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  // hit pop
  mds->balancer->hit_inode(targeti, META_POP_IWR);

  // done.
  mdr->reset_slave_request();

  if (adjust_realm) {
    int op = CEPH_SNAP_OP_SPLIT;
    mds->mdcache->send_snap_update(targeti, 0, op);
    mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
  }

  // ack
  if (!mdr->aborted) {
    auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_LINKPREPACK);
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}


struct C_MDS_CommittedSlave : public ServerLogContext {
  C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
  void finish(int r) override {
    server->_committed_slave(mdr);
  }
};

void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
{
  dout(10) << "_commit_slave_link " << *mdr
           << " r=" << r
           << " " << *targeti << dendl;

  ceph_assert(g_conf()->mds_kill_link_at != 7);

  if (r == 0) {
    // drop our pins, etc.
    mdr->cleanup();

    // write a commit to the journal
    ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
                                        ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
    mdlog->start_entry(le);
    submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
    mdlog->flush();
  } else {
    do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
  }
}

void Server::_committed_slave(MDRequestRef& mdr)
{
  dout(10) << "_committed_slave " << *mdr << dendl;

  ceph_assert(g_conf()->mds_kill_link_at != 8);

  bool assert_exist = mdr->more()->slave_update_journaled;
  mdcache->finish_uncommitted_slave(mdr->reqid, assert_exist);
  auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_COMMITTED);
  mds->send_message_mds(req, mdr->slave_to_mds);
  mdcache->request_finish(mdr);
}

struct C_MDS_LoggedLinkRollback : public ServerLogContext {
  MutationRef mut;
  map<client_t,ref_t<MClientSnap>> splits;
  C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
                           map<client_t,ref_t<MClientSnap>>&& _splits) :
    ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
  }
  void finish(int r) override {
    server->_link_rollback_finish(mut, mdr, splits);
  }
};

void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
{
  link_rollback rollback;
  auto p = rbl.cbegin();
  decode(rollback, p);

  dout(10) << "do_link_rollback on " << rollback.reqid
           << (rollback.was_inc ? " inc":" dec")
           << " ino " << rollback.ino
           << dendl;

  ceph_assert(g_conf()->mds_kill_link_at != 9);

  mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
  ceph_assert(mdr || mds->is_resolve());

  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
  mut->ls = mds->mdlog->get_current_segment();

  CInode *in = mdcache->get_inode(rollback.ino);
  ceph_assert(in);
  dout(10) << " target is " << *in << dendl;
  ceph_assert(!in->is_projected()); // live slave request holds versionlock xlock.

  auto &pi = in->project_inode();
  pi.inode.version = in->pre_dirty();
  mut->add_projected_inode(in);

  // parent dir rctime
  CDir *parent = in->get_projected_parent_dn()->get_dir();
  fnode_t *pf = parent->project_fnode();
  mut->add_projected_fnode(parent);
  pf->version = parent->pre_dirty();
  if (pf->fragstat.mtime == pi.inode.ctime) {
    pf->fragstat.mtime = rollback.old_dir_mtime;
    if (pf->rstat.rctime == pi.inode.ctime)
      pf->rstat.rctime = rollback.old_dir_rctime;
    mut->add_updated_lock(&parent->get_inode()->filelock);
    mut->add_updated_lock(&parent->get_inode()->nestlock);
  }

  // inode
  pi.inode.ctime = rollback.old_ctime;
  if (rollback.was_inc)
    pi.inode.nlink--;
  else
    pi.inode.nlink++;

  map<client_t,ref_t<MClientSnap>> splits;
  if (rollback.snapbl.length() && in->snaprealm) {
    bool hadrealm;
    auto p = rollback.snapbl.cbegin();
    decode(hadrealm, p);
    if (hadrealm) {
      if (!mds->is_resolve()) {
        sr_t *new_srnode = new sr_t();
        decode(*new_srnode, p);
        in->project_snaprealm(new_srnode);
      } else {
        decode(in->snaprealm->srnode, p);
      }
    } else {
      SnapRealm *realm = parent->get_inode()->find_snaprealm();
      if (!mds->is_resolve())
        mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
      in->project_snaprealm(NULL);
    }
  }

  // journal it
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
                                      ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
  mdlog->start_entry(le);
  le->commit.add_dir_context(parent);
  le->commit.add_dir(parent, true);
  le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);

  submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
                                   map<client_t,ref_t<MClientSnap>>& splits)
{
  dout(10) << "_link_rollback_finish" << dendl;

  ceph_assert(g_conf()->mds_kill_link_at != 10);

  mut->apply();

  if (!mds->is_resolve())
    mdcache->send_snaps(splits);

  if (mdr)
    mdcache->request_finish(mdr);

  mdcache->finish_rollback(mut->reqid, mdr);

  mut->cleanup();
}


void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &m)
{
  dout(10) << "handle_slave_link_prep_ack " << *mdr
           << " " << *m << dendl;
  mds_rank_t from = mds_rank_t(m->get_source().num());

  ceph_assert(g_conf()->mds_kill_link_at != 11);

  // note slave
  mdr->more()->slaves.insert(from);

  // witnessed!
  ceph_assert(mdr->more()->witnessed.count(from) == 0);
  mdr->more()->witnessed.insert(from);
  ceph_assert(!m->is_not_journaled());
  mdr->more()->has_journaled_slaves = true;

  // remove from waiting list
  ceph_assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  ceph_assert(mdr->more()->waiting_on_slave.empty());

  dispatch_client_request(mdr); // go again!
}




// UNLINK

void Server::handle_client_unlink(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  client_t client = mdr->get_client();

  // rmdir or unlink?
  bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);

  if (rmdir)
    mdr->disable_lock_cache();
  CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
  if (!dn)
    return;

  CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
  ceph_assert(!dnl->is_null());
  CInode *in = dnl->get_inode();

  if (rmdir) {
    dout(7) << "handle_client_rmdir on " << *dn << dendl;
  } else {
    dout(7) << "handle_client_unlink on " << *dn << dendl;
  }
  dout(7) << "dn links to " << *in << dendl;

  // rmdir vs is_dir
  if (in->is_dir()) {
    if (rmdir) {
      // do empty directory checks
      if (_dir_is_nonempty_unlocked(mdr, in)) {
        respond_to_request(mdr, -ENOTEMPTY);
        return;
      }
    } else {
      dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
      respond_to_request(mdr, -EISDIR);
      return;
    }
  } else {
    if (rmdir) {
      // unlink
      dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
      respond_to_request(mdr, -ENOTDIR);
      return;
    }
  }

  CInode *diri = dn->get_dir()->get_inode();
  if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
    if (!check_access(mdr, diri, MAY_WRITE))
      return;
  }

FG
7039 // -- create stray dentry? --
7040 CDentry *straydn = NULL;
7041 if (dnl->is_primary()) {
7042 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
7043 if (!straydn)
7044 return;
7045 dout(10) << " straydn is " << *straydn << dendl;
7046 } else if (mdr->straydn) {
7047 mdr->unpin(mdr->straydn);
7048 mdr->straydn = NULL;
7049 }
7050
7051 // lock
9f95a23c
TL
7052 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7053 MutationImpl::LockOpVec lov;
11fdf7f2 7054
9f95a23c
TL
7055 lov.add_xlock(&in->linklock);
7056 lov.add_xlock(&in->snaplock);
7057 if (in->is_dir())
7058 lov.add_rdlock(&in->filelock); // to verify it's empty
7059
7060 if (straydn) {
7061 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7062 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7063 lov.add_xlock(&straydn->lock);
7064 }
11fdf7f2 7065
9f95a23c
TL
7066 if (!mds->locker->acquire_locks(mdr, lov))
7067 return;
7c673cae 7068
9f95a23c
TL
7069 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7070 }
7c673cae
FG
7071
7072 if (in->is_dir() &&
7073 _dir_is_nonempty(mdr, in)) {
7074 respond_to_request(mdr, -ENOTEMPTY);
7075 return;
7076 }
7077
11fdf7f2
TL
7078 if (straydn)
7079 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7080
7081 if (!mdr->more()->desti_srnode) {
7082 if (in->is_projected_snaprealm_global()) {
7083 sr_t *new_srnode = in->prepare_new_srnode(0);
adb31ebb 7084 in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
11fdf7f2
TL
7085 // dropping the last linkage or dropping the last remote linkage,
7086 // detch the inode from global snaprealm
7087 auto nlink = in->get_projected_inode()->nlink;
7088 if (nlink == 1 ||
7089 (nlink == 2 && !dnl->is_primary() &&
7090 !in->get_projected_parent_dir()->inode->is_stray()))
7091 in->clear_snaprealm_global(new_srnode);
7092 mdr->more()->desti_srnode = new_srnode;
7093 } else if (dnl->is_primary()) {
7094 // prepare snaprealm blob for slave request
7095 SnapRealm *realm = in->find_snaprealm();
7096 snapid_t follows = realm->get_newest_seq();
7097 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
7098 sr_t *new_srnode = in->prepare_new_srnode(follows);
7099 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7100 mdr->more()->desti_srnode = new_srnode;
7101 }
7102 }
7103 }
7104
7c673cae
FG
7105 // yay!
7106 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
7107 // subtree root auths need to be witnesses
7108 set<mds_rank_t> witnesses;
7109 in->list_replicas(witnesses);
7110 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7111
7112 for (set<mds_rank_t>::iterator p = witnesses.begin();
7113 p != witnesses.end();
7114 ++p) {
7115 if (mdr->more()->witnessed.count(*p)) {
7116 dout(10) << " already witnessed by mds." << *p << dendl;
7117 } else if (mdr->more()->waiting_on_slave.count(*p)) {
7118 dout(10) << " already waiting on witness mds." << *p << dendl;
7119 } else {
9f95a23c 7120 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
7c673cae
FG
7121 return;
7122 }
7123 }
7124 if (!mdr->more()->waiting_on_slave.empty())
7125 return; // we're waiting for a witness.
7126 }
7127
9f95a23c
TL
7128 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
7129 mds->locker->create_lock_cache(mdr, diri);
7130
7c673cae
FG
7131 // ok!
7132 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
7133 _link_remote(mdr, false, dn, dnl->get_inode());
7134 else
7135 _unlink_local(mdr, dn, straydn);
7136}
7137
7138class C_MDS_unlink_local_finish : public ServerLogContext {
7139 CDentry *dn;
7140 CDentry *straydn;
7141 version_t dnpv; // deleted dentry
7142public:
7143 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
7144 ServerLogContext(s, r), dn(d), straydn(sd),
7145 dnpv(d->get_projected_version()) {}
7146 void finish(int r) override {
11fdf7f2 7147 ceph_assert(r == 0);
7c673cae
FG
7148 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
7149 }
7150};
7151
7152void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7153{
7154 dout(10) << "_unlink_local " << *dn << dendl;
7155
7156 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7157 CInode *in = dnl->get_inode();
7158
7c673cae
FG
7159
7160 // ok, let's do it.
7161 mdr->ls = mdlog->get_current_segment();
7162
7163 // prepare log entry
7164 EUpdate *le = new EUpdate(mdlog, "unlink_local");
7165 mdlog->start_entry(le);
7166 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7167 if (!mdr->more()->witnessed.empty()) {
7168 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
7169 le->reqid = mdr->reqid;
7170 le->had_slaves = true;
7171 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7172 }
7173
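  // primary link: the inode itself lives at this dentry, so relink it under
  // the stray dir; remote link: just drop the remote pointer and the nlink.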
  if (straydn) {
    ceph_assert(dnl->is_primary());
    straydn->push_projected_linkage(in);
  }

  // the unlinked dentry
  dn->pre_dirty();

  auto &pi = in->project_inode();
  {
    std::string t;
    dn->make_path_string(t, true);
    pi.inode.stray_prior_path = std::move(t);
  }
  pi.inode.version = in->pre_dirty();
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.change_attr++;
  pi.inode.nlink--;
  if (pi.inode.nlink == 0)
    in->state_set(CInode::STATE_ORPHAN);

  if (mdr->more()->desti_srnode) {
    auto& desti_srnode = mdr->more()->desti_srnode;
    in->project_snaprealm(desti_srnode);
    desti_srnode = NULL;
  }

  if (straydn) {
    // will manually pop projected inode

    // primary link. add stray dentry.
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);

    pi.inode.update_backtrace();
    le->metablob.add_primary_dentry(straydn, in, true, true);
  } else {
    mdr->add_projected_inode(in);
    // remote link. update remote inode.
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
    mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
    mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
  }

  mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
  le->metablob.add_null_dentry(dn, true);

  if (in->is_dir()) {
    dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
    le->metablob.renamed_dirino = in->ino();
  }

  dn->push_projected_linkage();

  if (straydn) {
    ceph_assert(in->first <= straydn->first);
    in->first = straydn->first;
  }

  if (in->is_dir()) {
    ceph_assert(straydn);
    mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
  }

  journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
}

void Server::_unlink_local_finish(MDRequestRef& mdr,
                                  CDentry *dn, CDentry *straydn,
                                  version_t dnpv)
{
  dout(10) << "_unlink_local_finish " << *dn << dendl;

  if (!mdr->more()->witnessed.empty())
    mdcache->logged_master_update(mdr->reqid);

  CInode *strayin = NULL;
  bool hadrealm = false;
  if (straydn) {
    // if there is newly created snaprealm, need to split old snaprealm's
    // inodes_with_caps. So pop snaprealm before linkage changes.
    strayin = dn->get_linkage()->get_inode();
    hadrealm = strayin->snaprealm ? true : false;
    strayin->early_pop_projected_snaprealm();
  }

  // unlink main dentry
  dn->get_dir()->unlink_inode(dn);
  dn->pop_projected_linkage();

  // relink as stray? (i.e. was primary link?)
  if (straydn) {
    dout(20) << " straydn is " << *straydn << dendl;
    straydn->pop_projected_linkage();

    strayin->pop_and_dirty_projected_inode(mdr->ls);

    mdcache->touch_dentry_bottom(straydn);
  }

  dn->mark_dirty(dnpv, mdr->ls);
  mdr->apply();

  mdcache->send_dentry_unlink(dn, straydn, mdr);

  if (straydn) {
    // update subtree map?
    if (strayin->is_dir())
      mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);

    if (strayin->snaprealm && !hadrealm)
      mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
  }

  // bump pop
  mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);

  // reply
  respond_to_request(mdr, 0);

  // removing a new dn?
  dn->get_dir()->try_remove_unlinked_dn(dn);

  // clean up ?
  // respond_to_request() drops locks. So stray reintegration can race with us.
  if (straydn && !straydn->get_projected_linkage()->is_null()) {
    // Tip off the MDCache that this dentry is a stray that
    // might be eligible for purge.
    mdcache->notify_stray(straydn);
  }
}

bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
{
  if (mds->is_cluster_degraded() &&
      !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
    dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
    if (mdr->more()->waiting_on_slave.empty())
      mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
    return false;
  }

  dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
  auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP);
  req->srcdnpath = filepath(trace.front()->get_dir()->ino());
  for (auto dn : trace)
    req->srcdnpath.push_dentry(dn->get_name());
  mdcache->encode_replica_stray(straydn, who, req->straybl);
  if (mdr->more()->desti_srnode)
    encode(*mdr->more()->desti_srnode, req->desti_snapbl);

  req->op_stamp = mdr->get_op_stamp();
  mds->send_message_mds(req, who);

  ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
  mdr->more()->waiting_on_slave.insert(who);
  return true;
}

struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
  CDentry *dn, *straydn;
  C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
    : ServerLogContext(s, r), dn(d), straydn(st) {}
  void finish(int r) override {
    server->_logged_slave_rmdir(mdr, dn, straydn);
  }
};

struct C_MDS_SlaveRmdirCommit : public ServerContext {
  MDRequestRef mdr;
  CDentry *straydn;
  C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
    : ServerContext(s), mdr(r), straydn(sd) { }
  void finish(int r) override {
    server->_commit_slave_rmdir(mdr, r, straydn);
  }
};

void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
{
  dout(10) << "handle_slave_rmdir_prep " << *mdr
           << " " << mdr->slave_request->srcdnpath
           << " to " << mdr->slave_request->destdnpath
           << dendl;

  vector<CDentry*> trace;
  filepath srcpath(mdr->slave_request->srcdnpath);
  dout(10) << " src " << srcpath << dendl;
  CInode *in;
  CF_MDS_MDRContextFactory cf(mdcache, mdr, false);
  int r = mdcache->path_traverse(mdr, cf, srcpath,
                                 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
                                 &trace, &in);
  if (r > 0) return;
  if (r == -ESTALE) {
    mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
                            mdr->slave_to_mds, true);
    return;
  }
  ceph_assert(r == 0);
  CDentry *dn = trace.back();
  dout(10) << " dn " << *dn << dendl;
  mdr->pin(dn);

  ceph_assert(mdr->straydn);
  CDentry *straydn = mdr->straydn;
  dout(10) << " straydn " << *straydn << dendl;

  mdr->set_op_stamp(mdr->slave_request->op_stamp);

  rmdir_rollback rollback;
  rollback.reqid = mdr->reqid;
  rollback.src_dir = dn->get_dir()->dirfrag();
  rollback.src_dname = dn->get_name();
  rollback.dest_dir = straydn->get_dir()->dirfrag();
  rollback.dest_dname = straydn->get_name();
  if (mdr->slave_request->desti_snapbl.length()) {
    if (in->snaprealm) {
      encode(true, rollback.snapbl);
      in->encode_snap_blob(rollback.snapbl);
    } else {
      encode(false, rollback.snapbl);
    }
  }
  encode(rollback, mdr->more()->rollback_bl);
  // FIXME: rollback snaprealm
  dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;

  // set up commit waiter
  mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);

  straydn->push_projected_linkage(in);
  dn->push_projected_linkage();

  ceph_assert(straydn->first >= in->first);
  in->first = straydn->first;

  if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
    dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
    _logged_slave_rmdir(mdr, dn, straydn);
    return;
  }

  mdr->ls = mdlog->get_current_segment();
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
                                      ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
  mdlog->start_entry(le);
  le->rollback = mdr->more()->rollback_bl;

  le->commit.add_dir_context(straydn->get_dir());
  le->commit.add_primary_dentry(straydn, in, true);
  // slave: no need to journal original dentry

  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
  le->commit.renamed_dirino = in->ino();

  mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
  mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);

  mdr->more()->slave_update_journaled = true;
  submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
  CInode *in = dn->get_linkage()->get_inode();

  bool new_realm;
  if (mdr->slave_request->desti_snapbl.length()) {
    new_realm = !in->snaprealm;
    in->decode_snap_blob(mdr->slave_request->desti_snapbl);
    ceph_assert(in->snaprealm);
    ceph_assert(in->snaprealm->have_past_parents_open());
  } else {
    new_realm = false;
  }

  // update our cache now, so we are consistent with what is in the journal
  // when we journal a subtree map
  dn->get_dir()->unlink_inode(dn);
  straydn->pop_projected_linkage();
  dn->pop_projected_linkage();

  mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->slave_update_journaled);

  if (new_realm)
    mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);

  // done.
  mdr->reset_slave_request();
  mdr->straydn = 0;

  if (!mdr->aborted) {
    auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREPACK);
    if (!mdr->more()->slave_update_journaled)
      reply->mark_not_journaled();
    mds->send_message_mds(reply, mdr->slave_to_mds);
  } else {
    dout(10) << " abort flag set, finishing" << dendl;
    mdcache->request_finish(mdr);
  }
}

void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
{
  dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
           << " " << *ack << dendl;

  mds_rank_t from = mds_rank_t(ack->get_source().num());

  mdr->more()->slaves.insert(from);
  mdr->more()->witnessed.insert(from);
  if (!ack->is_not_journaled())
    mdr->more()->has_journaled_slaves = true;

  // remove from waiting list
  ceph_assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  if (mdr->more()->waiting_on_slave.empty())
    dispatch_client_request(mdr); // go again!
  else
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}

void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
{
  dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;

  if (r == 0) {
    if (mdr->more()->slave_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
        mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mdr->cleanup();

    if (mdr->more()->slave_update_journaled) {
      // write a commit to the journal
      ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
                                          mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
                                          ESlaveUpdate::RMDIR);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      _committed_slave(mdr);
    }
  } else {
    // abort
    do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
  }
}

struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
  metareqid_t reqid;
  CDentry *dn;
  CDentry *straydn;
  C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
    : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
  void finish(int r) override {
    server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
  }
};

void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
{
  // unlike the other rollback methods, the rmdir rollback is only
  // needed to record the subtree changes in the journal for inode
  // replicas who are auth for empty dirfrags. no actual changes to
  // the file system are taking place here, so there is no Mutation.

  rmdir_rollback rollback;
  auto p = rbl.cbegin();
  decode(rollback, p);

  dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
  mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
  ceph_assert(mdr || mds->is_resolve());

  CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
  if (!dir)
    dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
  ceph_assert(dir);
  CDentry *dn = dir->lookup(rollback.src_dname);
  ceph_assert(dn);
  dout(10) << " dn " << *dn << dendl;
  CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
  ceph_assert(straydir);
  CDentry *straydn = straydir->lookup(rollback.dest_dname);
  ceph_assert(straydn);
  dout(10) << " straydn " << *straydn << dendl;
  CInode *in = straydn->get_linkage()->get_inode();

  dn->push_projected_linkage(in);
  straydn->push_projected_linkage();

  if (rollback.snapbl.length() && in->snaprealm) {
    bool hadrealm;
    auto p = rollback.snapbl.cbegin();
    decode(hadrealm, p);
    if (hadrealm) {
      decode(in->snaprealm->srnode, p);
    } else {
      in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
    }
  }

  if (mdr && !mdr->more()->slave_update_journaled) {
    ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));

    _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
    return;
  }

  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
                                      ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
  mdlog->start_entry(le);

  le->commit.add_dir_context(dn->get_dir());
  le->commit.add_primary_dentry(dn, in, true);
  // slave: no need to journal straydn

  dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
  le->commit.renamed_dirino = in->ino();

  mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());

  submit_mdlog_entry(le,
                     new C_MDS_LoggedRmdirRollback(this, mdr, rollback.reqid,
                                                   dn, straydn),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
{
  dout(10) << "_rmdir_rollback_finish " << reqid << dendl;

  straydn->get_dir()->unlink_inode(straydn);
  dn->pop_projected_linkage();
  straydn->pop_projected_linkage();

  CInode *in = dn->get_linkage()->get_inode();
  mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
                                       !mdr || mdr->more()->slave_update_journaled);

  if (mds->is_resolve()) {
    CDir *root = mdcache->get_subtree_root(straydn->get_dir());
    mdcache->try_trim_non_auth_subtree(root);
  }

  if (mdr)
    mdcache->request_finish(mdr);

  mdcache->finish_rollback(reqid, mdr);
}


/** _dir_is_nonempty[_unlocked]
 *
 * check if a directory is non-empty (i.e. we can rmdir it).
 *
 * the unlocked variant is a fastpath check. we can't really be
 * sure until we rdlock the filelock.
 */
bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
{
  dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
  ceph_assert(in->is_auth());

  if (in->filelock.is_cached())
    return false; // there can be pending async create/unlink. don't know.
  if (in->snaprealm && in->snaprealm->srnode.snaps.size())
    return true; // in a snapshot!

  auto&& ls = in->get_dirfrags();
  for (const auto& dir : ls) {
    // is the frag obviously non-empty?
    if (dir->is_auth()) {
      if (dir->get_projected_fnode()->fragstat.size()) {
        dout(10) << "dir_is_nonempty_unlocked dirstat has "
                 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
        return true;
      }
    }
  }

  return false;
}

bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
{
  dout(10) << "dir_is_nonempty " << *in << dendl;
  ceph_assert(in->is_auth());
  ceph_assert(in->filelock.can_read(mdr->get_client()));

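  // rough idea: per-frag fragstats can lag the inode's dirstat, so sum what
  // each frag has accounted for and compare with the projected dirstat below;
  // any mismatch means updates are in flight and the dir may be non-empty.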
  frag_info_t dirstat;
  version_t dirstat_version = in->get_projected_inode()->dirstat.version;

  auto&& ls = in->get_dirfrags();
  for (const auto& dir : ls) {
    const fnode_t *pf = dir->get_projected_fnode();
    if (pf->fragstat.size()) {
      dout(10) << "dir_is_nonempty dirstat has "
               << pf->fragstat.size() << " items " << *dir << dendl;
      return true;
    }

    if (pf->accounted_fragstat.version == dirstat_version)
      dirstat.add(pf->accounted_fragstat);
    else
      dirstat.add(pf->fragstat);
  }

  return dirstat.size() != in->get_projected_inode()->dirstat.size();
}


// ======================================================


class C_MDS_rename_finish : public ServerLogContext {
  CDentry *srcdn;
  CDentry *destdn;
  CDentry *straydn;
public:
  C_MDS_rename_finish(Server *s, MDRequestRef& r,
                      CDentry *sdn, CDentry *ddn, CDentry *stdn) :
    ServerLogContext(s, r),
    srcdn(sdn), destdn(ddn), straydn(stdn) { }
  void finish(int r) override {
    ceph_assert(r == 0);
    server->_rename_finish(mdr, srcdn, destdn, straydn);
  }
};


/** handle_client_rename
 *
 * rename master is the destdn auth.  this is because cached inodes
 * must remain connected.  thus, any replica of srci, must also
 * replicate destdn, and possibly straydn, so that srci (and
 * destdn->inode) remain connected during the rename.
 *
 * to do this, we freeze srci, then master (destdn auth) verifies that
 * all other nodes have also replicated destdn and straydn.  note that
 * destdn replicas need not also replicate srci.  this only works when
 * destdn is master.
 *
 * This function takes responsibility for the passed mdr.
 */
void Server::handle_client_rename(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  dout(7) << "handle_client_rename " << *req << dendl;

  filepath destpath = req->get_filepath();
  filepath srcpath = req->get_filepath2();
  if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
    respond_to_request(mdr, -EBUSY);
    return;
  }

  auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
  if (!destdn)
    return;

  dout(10) << " destdn " << *destdn << dendl;
  CDir *destdir = destdn->get_dir();
  ceph_assert(destdir->is_auth());
  CDentry::linkage_t *destdnl = destdn->get_projected_linkage();

  dout(10) << " srcdn " << *srcdn << dendl;
  CDir *srcdir = srcdn->get_dir();
  CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
  CInode *srci = srcdnl->get_inode();
  dout(10) << " srci " << *srci << dendl;

  // -- some sanity checks --
  if (destdn == srcdn) {
    dout(7) << "rename src=dest, noop" << dendl;
    respond_to_request(mdr, 0);
    return;
  }

  // dest a child of src?
  // e.g. mv /usr /usr/foo
  if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
    dout(7) << "cannot rename item to be a child of itself" << dendl;
    respond_to_request(mdr, -EINVAL);
    return;
  }

  // is this a stray migration, reintegration or merge? (sanity checks!)
  if (mdr->reqid.name.is_mds() &&
      !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
        MDS_INO_IS_STRAY(destpath.get_ino())) &&
      !(destdnl->is_remote() &&
        destdnl->get_remote_ino() == srci->ino())) {
    respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
    return;
  }

  CInode *oldin = 0;
  if (!destdnl->is_null()) {
    //dout(10) << "dest dn exists " << *destdn << dendl;
    oldin = mdcache->get_dentry_inode(destdn, mdr, true);
    if (!oldin) return;
    dout(10) << " oldin " << *oldin << dendl;

    // non-empty dir? do trivial fast unlocked check, do another check later with read locks
    if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
      respond_to_request(mdr, -ENOTEMPTY);
      return;
    }

    // mv /some/thing /to/some/existing_other_thing
    if (oldin->is_dir() && !srci->is_dir()) {
      respond_to_request(mdr, -EISDIR);
      return;
    }
    if (!oldin->is_dir() && srci->is_dir()) {
      respond_to_request(mdr, -ENOTDIR);
      return;
    }
    if (srci == oldin && !srcdir->inode->is_stray()) {
      respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
      return;
    }
  }

  vector<CDentry*>& srctrace = mdr->dn[1];
  vector<CDentry*>& desttrace = mdr->dn[0];

  // src+dest traces _must_ share a common ancestor for locking to prevent orphans
7817 if (destpath.get_ino() != srcpath.get_ino() &&
7818 !(req->get_source().is_mds() &&
9f95a23c 7819 MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
7c673cae
FG
7820 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
7821 CInode *destbase = desttrace[0]->get_dir()->get_inode();
7822 // ok, extend srctrace toward root until it is an ancestor of desttrace.
7823 while (srcbase != destbase &&
7824 !srcbase->is_projected_ancestor_of(destbase)) {
7825 CDentry *pdn = srcbase->get_projected_parent_dn();
7826 srctrace.insert(srctrace.begin(), pdn);
7827 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
7828 srcbase = pdn->get_dir()->get_inode();
7829 }
7830
7831 // then, extend destpath until it shares the same parent inode as srcpath.
7832 while (destbase != srcbase) {
7833 CDentry *pdn = destbase->get_projected_parent_dn();
7834 desttrace.insert(desttrace.begin(), pdn);
7c673cae
FG
7835 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
7836 destbase = pdn->get_dir()->get_inode();
7837 }
7838 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
7839 }
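  /*
   * Worked example (sketch): "mv /a/b/x /a/c/x", with srcpath rooted at
   * /a/b and destpath at /a/c. Initially srctrace = [x], srcbase = /a/b,
   * desttrace = [x], destbase = /a/c. The first loop prepends dentry "b"
   * (srcbase becomes /a, an ancestor of /a/c); the second prepends "c"
   * (destbase becomes /a). The traces end up as [b, x] and [c, x], so
   * locks are taken along both chains down from the shared ancestor /a.
   */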
7840
7c673cae 7841
11fdf7f2 7842 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
7c673cae
FG
7843 if (linkmerge)
7844 dout(10) << " this is a link merge" << dendl;
7845
7846 // -- create stray dentry? --
7847 CDentry *straydn = NULL;
7848 if (destdnl->is_primary() && !linkmerge) {
7849 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
7850 if (!straydn)
7851 return;
7852 dout(10) << " straydn is " << *straydn << dendl;
7853 } else if (mdr->straydn) {
7854 mdr->unpin(mdr->straydn);
7855 mdr->straydn = NULL;
7856 }
7857
7c673cae
FG
7858
7859 // -- locks --
9f95a23c
TL
7860 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7861 MutationImpl::LockOpVec lov;
7c673cae 7862
9f95a23c
TL
7863 // we need to update srci's ctime. xlock its least contended lock to do that...
7864 lov.add_xlock(&srci->linklock);
7865 lov.add_xlock(&srci->snaplock);
7c673cae 7866
9f95a23c
TL
7867 if (oldin) {
7868 // xlock oldin (for nlink--)
7869 lov.add_xlock(&oldin->linklock);
7870 lov.add_xlock(&oldin->snaplock);
7871 if (oldin->is_dir()) {
7872 ceph_assert(srci->is_dir());
11fdf7f2 7873 lov.add_rdlock(&oldin->filelock); // to verify it's empty
7c673cae 7874
9f95a23c
TL
7875 // adjust locking order?
7876 int cmp = mdr->compare_paths();
7877 if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
7878 std::reverse(lov.begin(), lov.end());
7879 } else {
7880 ceph_assert(!srci->is_dir());
7881 // adjust locking order;
7882 if (srci->ino() > oldin->ino())
7883 std::reverse(lov.begin(), lov.end());
7884 }
7885 }
7886
7887 // straydn?
7888 if (straydn) {
7889 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7890 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7891 lov.add_xlock(&straydn->lock);
7892 }
7893
7894 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
7895 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
7896 return;
7897
7898 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7899 }
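  /*
   * Deadlock-avoidance sketch: two racing renames that swap a pair of
   * files (client A: "mv x y", client B: "mv y x") would xlock
   * {x.linklock, y.linklock} in opposite orders if each simply locked
   * src before dest. Reversing the LockOpVec based on compare_paths()
   * and ino comparison gives both requests one global order, so the
   * loser blocks inside acquire_locks() instead of deadlocking.
   */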
7c673cae 7900
11fdf7f2
TL
7901 if (linkmerge)
7902 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
7903
7c673cae 7904 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
11fdf7f2 7905 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
7c673cae
FG
7906 return;
7907
7908 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
7909 return;
7910
7911 if (!check_fragment_space(mdr, destdn->get_dir()))
7912 return;
7913
7914 if (!check_access(mdr, srci, MAY_WRITE))
7915 return;
7916 }
7917
7918 // with read lock, really verify oldin is empty
7919 if (oldin &&
7920 oldin->is_dir() &&
7921 _dir_is_nonempty(mdr, oldin)) {
7922 respond_to_request(mdr, -ENOTEMPTY);
7923 return;
7924 }
7925
11fdf7f2 7926 /* project_snaprealm_past_parent() will do this job
7c673cae
FG
7927 *
7928 // moving between snaprealms?
7929 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
7930 SnapRealm *srcrealm = srci->find_snaprealm();
7931 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
7932 if (srcrealm != destrealm &&
7933 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
7934 destrealm->get_newest_seq() + 1 > srcdn->first)) {
7935 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
7936 mdcache->snaprealm_create(mdr, srci);
7937 return;
7938 }
7939 }
7940 */
7941
adb31ebb
TL
7942 SnapRealm *dest_realm = nullptr;
7943 SnapRealm *src_realm = nullptr;
7944 if (!linkmerge) {
7945 dest_realm = destdir->inode->find_snaprealm();
7946 if (srcdir->inode == destdir->inode)
7947 src_realm = dest_realm;
7948 else
7949 src_realm = srcdir->inode->find_snaprealm();
7950 if (src_realm != dest_realm &&
7951 src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
7952 respond_to_request(mdr, -EXDEV);
7953 return;
7954 }
7955 }
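  // e.g. (sketch): with /vol1 and /vol2 rooted in snaprealms that carry
  // different subvolume inos, "mv /vol1/f /vol2/f" fails here with
  // EXDEV, and a client such as mv(1) then falls back to copy + unlink,
  // just as for a cross-filesystem rename.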
7956
11fdf7f2 7957 ceph_assert(g_conf()->mds_kill_rename_at != 1);
7c673cae
FG
7958
7959 // -- open all srcdn inode frags, if any --
7960 // we need these open so that auth can properly delegate from inode to dirfrags
7961 // after the inode is _ours_.
7962 if (srcdnl->is_primary() &&
7963 !srcdn->is_auth() &&
7964 srci->is_dir()) {
7965 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
7966 mdr->set_stickydirs(srci);
7967
11fdf7f2
TL
7968 frag_vec_t leaves;
7969 srci->dirfragtree.get_leaves(leaves);
7970 for (const auto& leaf : leaves) {
7971 CDir *dir = srci->get_dirfrag(leaf);
7c673cae 7972 if (!dir) {
11fdf7f2
TL
7973 dout(10) << " opening " << leaf << " under " << *srci << dendl;
7974 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
7c673cae
FG
7975 return;
7976 }
7977 }
7978 }
7979
11fdf7f2
TL
7980 // -- prepare snaprealm ---
7981
7982 if (linkmerge) {
7983 if (!mdr->more()->srci_srnode &&
7984 srci->get_projected_inode()->nlink == 1 &&
7985 srci->is_projected_snaprealm_global()) {
7986 sr_t *new_srnode = srci->prepare_new_srnode(0);
adb31ebb 7987 srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
11fdf7f2
TL
7988
7989 srci->clear_snaprealm_global(new_srnode);
7990 mdr->more()->srci_srnode = new_srnode;
7991 }
7992 } else {
7993 if (oldin && !mdr->more()->desti_srnode) {
7994 if (oldin->is_projected_snaprealm_global()) {
7995 sr_t *new_srnode = oldin->prepare_new_srnode(0);
adb31ebb 7996 oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
11fdf7f2
TL
7997 // dropping the last linkage or dropping the last remote linkage,
 7998 // detach the inode from the global snaprealm
7999 auto nlink = oldin->get_projected_inode()->nlink;
8000 if (nlink == 1 ||
8001 (nlink == 2 && !destdnl->is_primary() &&
8002 !oldin->get_projected_parent_dir()->inode->is_stray()))
8003 oldin->clear_snaprealm_global(new_srnode);
8004 mdr->more()->desti_srnode = new_srnode;
8005 } else if (destdnl->is_primary()) {
11fdf7f2
TL
8006 snapid_t follows = dest_realm->get_newest_seq();
8007 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
8008 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
8009 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
8010 mdr->more()->desti_srnode = new_srnode;
8011 }
8012 }
8013 }
8014 if (!mdr->more()->srci_srnode) {
11fdf7f2
TL
8015 if (srci->is_projected_snaprealm_global()) {
8016 sr_t *new_srnode = srci->prepare_new_srnode(0);
adb31ebb 8017 srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
11fdf7f2
TL
8018 mdr->more()->srci_srnode = new_srnode;
8019 } else if (srcdnl->is_primary()) {
11fdf7f2
TL
8020 snapid_t follows = src_realm->get_newest_seq();
8021 if (src_realm != dest_realm &&
8022 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
8023 sr_t *new_srnode = srci->prepare_new_srnode(follows);
8024 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
8025 mdr->more()->srci_srnode = new_srnode;
8026 }
8027 }
8028 }
8029 }
8030
7c673cae
FG
8031 // -- prepare witnesses --
8032
9f95a23c
TL
8033 /*
8034 * NOTE: we use _all_ replicas as witnesses.
8035 * this probably isn't totally necessary (esp for file renames),
8036 * but if/when we change that, we have to make sure rejoin is
8037 * sufficiently robust to handle strong rejoins from survivors
8038 * with totally wrong dentry->inode linkage.
8039 * (currently, it can ignore rename effects, because the resolve
8040 * stage will sort them out.)
8041 */
8042 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
8043 if (srcdn->is_auth())
8044 srcdn->list_replicas(witnesses);
8045 else
8046 witnesses.insert(srcdn->authority().first);
8047 if (srcdnl->is_remote() && !srci->is_auth())
8048 witnesses.insert(srci->authority().first);
8049 destdn->list_replicas(witnesses);
8050 if (destdnl->is_remote() && !oldin->is_auth())
8051 witnesses.insert(oldin->authority().first);
8052 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
8053
8054 if (!witnesses.empty()) {
8055 // Replicas can't see projected dentry linkages and will get confused.
8056 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8057 // can't project these inodes' linkages.
8058 bool need_flush = false;
8059 for (auto& dn : srctrace) {
8060 if (dn->is_projected()) {
8061 need_flush = true;
8062 break;
8063 }
8064 }
8065 if (!need_flush) {
8066 CDentry *dn = destdn;
8067 do {
8068 if (dn->is_projected()) {
8069 need_flush = true;
8070 break;
8071 }
8072 CInode *diri = dn->get_dir()->get_inode();
8073 dn = diri->get_projected_parent_dn();
8074 } while (dn);
8075 }
8076 if (need_flush) {
8077 mdlog->wait_for_safe(
8078 new MDSInternalContextWrapper(mds,
8079 new C_MDS_RetryRequest(mdcache, mdr)));
8080 mdlog->flush();
8081 return;
8082 }
8083 }
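  /*
   * Flush rationale (sketch): if any dentry on the src trace, or on
   * destdn's projected ancestry, is still projected (journaled but not
   * yet applied), a witness that replicates it could observe a stale
   * linkage. mdlog->wait_for_safe() plus the retry ensures every
   * linkage a witness can see is the committed one before
   * OP_RENAMEPREP goes out.
   */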
8084
7c673cae
FG
8085 // do srcdn auth last
8086 mds_rank_t last = MDS_RANK_NONE;
8087 if (!srcdn->is_auth()) {
8088 last = srcdn->authority().first;
8089 mdr->more()->srcdn_auth_mds = last;
8090 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8091 // are involved in the rename operation.
8092 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
8093 dout(10) << " preparing ambiguous auth for srci" << dendl;
11fdf7f2
TL
8094 ceph_assert(mdr->more()->is_remote_frozen_authpin);
8095 ceph_assert(mdr->more()->rename_inode == srci);
7c673cae
FG
8096 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8097 return;
8098 }
8099 }
8100
8101 for (set<mds_rank_t>::iterator p = witnesses.begin();
8102 p != witnesses.end();
8103 ++p) {
8104 if (*p == last) continue; // do it last!
8105 if (mdr->more()->witnessed.count(*p)) {
8106 dout(10) << " already witnessed by mds." << *p << dendl;
8107 } else if (mdr->more()->waiting_on_slave.count(*p)) {
8108 dout(10) << " already waiting on witness mds." << *p << dendl;
8109 } else {
8110 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
8111 return;
8112 }
8113 }
8114 if (!mdr->more()->waiting_on_slave.empty())
8115 return; // we're waiting for a witness.
8116
8117 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
8118 dout(10) << " preparing last witness (srcdn auth)" << dendl;
11fdf7f2 8119 ceph_assert(mdr->more()->waiting_on_slave.count(last) == 0);
7c673cae
FG
8120 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8121 return;
8122 }
8123
8124 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
8125 if (!mdr->more()->slaves.empty() && !srci->is_dir())
11fdf7f2 8126 ceph_assert(g_conf()->mds_kill_rename_at != 3);
7c673cae 8127 if (!mdr->more()->slaves.empty() && srci->is_dir())
11fdf7f2 8128 ceph_assert(g_conf()->mds_kill_rename_at != 4);
7c673cae
FG
8129
8130 // -- declare now --
8131 mdr->set_mds_stamp(ceph_clock_now());
8132
8133 // -- prepare journal entry --
8134 mdr->ls = mdlog->get_current_segment();
8135 EUpdate *le = new EUpdate(mdlog, "rename");
8136 mdlog->start_entry(le);
8137 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
8138 if (!mdr->more()->witnessed.empty()) {
8139 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
8140
8141 le->reqid = mdr->reqid;
8142 le->had_slaves = true;
8143
8144 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
 8145 // no need to send frozen auth pin to the recovering auth MDS of srci
8146 mdr->more()->is_remote_frozen_authpin = false;
8147 }
8148
8149 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
8150 if (le->client_map.length())
8151 le->cmapv = mds->sessionmap.get_projected();
8152
8153 // -- commit locally --
8154 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
8155
8156 journal_and_reply(mdr, srci, destdn, le, fin);
81eedcae 8157 mds->balancer->maybe_fragment(destdn->get_dir(), false);
7c673cae
FG
8158}
8159
8160
8161void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8162{
8163 dout(10) << "_rename_finish " << *mdr << dendl;
8164
8165 if (!mdr->more()->witnessed.empty())
8166 mdcache->logged_master_update(mdr->reqid);
8167
8168 // apply
8169 _rename_apply(mdr, srcdn, destdn, straydn);
8170
8171 mdcache->send_dentry_link(destdn, mdr);
8172
8173 CDentry::linkage_t *destdnl = destdn->get_linkage();
8174 CInode *in = destdnl->get_inode();
8175 bool need_eval = mdr->more()->cap_imports.count(in);
8176
8177 // test hack: test slave commit
8178 if (!mdr->more()->slaves.empty() && !in->is_dir())
11fdf7f2 8179 ceph_assert(g_conf()->mds_kill_rename_at != 5);
7c673cae 8180 if (!mdr->more()->slaves.empty() && in->is_dir())
11fdf7f2 8181 ceph_assert(g_conf()->mds_kill_rename_at != 6);
7c673cae
FG
8182
8183 // bump popularity
11fdf7f2 8184 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
7c673cae 8185 if (destdnl->is_remote() && in->is_auth())
11fdf7f2 8186 mds->balancer->hit_inode(in, META_POP_IWR);
7c673cae
FG
8187
 8188 // did we import srci? if so, explicitly ack that import before we unlock and reply.
8189
11fdf7f2 8190 ceph_assert(g_conf()->mds_kill_rename_at != 7);
7c673cae
FG
8191
8192 // reply
8193 respond_to_request(mdr, 0);
8194
8195 if (need_eval)
8196 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
8197
8198 // clean up?
8199 // respond_to_request() drops locks. So stray reintegration can race with us.
8200 if (straydn && !straydn->get_projected_linkage()->is_null()) {
8201 mdcache->notify_stray(straydn);
8202 }
8203}
8204
8205
8206
8207// helpers
8208
 8209bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
8210 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
8211{
8212 if (mds->is_cluster_degraded() &&
8213 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
8214 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
8215 if (mdr->more()->waiting_on_slave.empty())
8216 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
8217 return false;
8218 }
8219
8220 dout(10) << "_rename_prepare_witness mds." << who << dendl;
9f95a23c 8221 auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP);
7c673cae
FG
8222
8223 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
8224 for (auto dn : srctrace)
94b18763 8225 req->srcdnpath.push_dentry(dn->get_name());
7c673cae
FG
8226 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
8227 for (auto dn : dsttrace)
94b18763 8228 req->destdnpath.push_dentry(dn->get_name());
7c673cae 8229 if (straydn)
9f95a23c 8230 mdcache->encode_replica_stray(straydn, who, req->straybl);
11fdf7f2
TL
8231
8232 if (mdr->more()->srci_srnode)
8233 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
8234 if (mdr->more()->desti_srnode)
8235 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
31f18b77
FG
8236
8237 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
7c673cae
FG
8238
8239 // srcdn auth will verify our current witness list is sufficient
 8240 req->witnesses = witnesses;
8241
8242 req->op_stamp = mdr->get_op_stamp();
8243 mds->send_message_mds(req, who);
8244
11fdf7f2 8245 ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
7c673cae
FG
8246 mdr->more()->waiting_on_slave.insert(who);
8247 return true;
8248}
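/*
 * Round-trip sketch for one witness W: the master sends OP_RENAMEPREP
 * carrying both dentry paths, an optional stray replica, and any
 * projected snaprealm blobs; W journals an ESlaveUpdate prepare and
 * replies with OP_RENAMEPREPACK; on receipt the master moves W from
 * waiting_on_slave to witnessed and, once waiting_on_slave drains,
 * proceeds to journal its own EUpdate.
 */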
8249
8250version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
8251{
8252 version_t oldpv = mdr->more()->inode_import_v;
8253
8254 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8255
8256 /* import node */
11fdf7f2 8257 auto blp = mdr->more()->inode_import.cbegin();
7c673cae
FG
8258
8259 // imported caps
28e407b8 8260 map<client_t,entity_inst_t> client_map;
11fdf7f2 8261 map<client_t, client_metadata_t> client_metadata_map;
28e407b8 8262 decode(client_map, blp);
11fdf7f2
TL
8263 decode(client_metadata_map, blp);
8264 prepare_force_open_sessions(client_map, client_metadata_map,
8265 mdr->more()->imported_session_map);
28e407b8 8266 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
11fdf7f2 8267 encode(client_metadata_map, *client_map_bl);
7c673cae
FG
8268
8269 list<ScatterLock*> updated_scatterlocks;
8270 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
8271 mdr->more()->cap_imports, updated_scatterlocks);
8272
8273 // hack: force back to !auth and clean, temporarily
8274 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
8275 srcdnl->get_inode()->mark_clean();
8276
8277 return oldpv;
8278}
8279
8280bool Server::_need_force_journal(CInode *diri, bool empty)
8281{
9f95a23c 8282 auto&& dirs = diri->get_dirfrags();
7c673cae
FG
8283
8284 bool force_journal = false;
8285 if (empty) {
11fdf7f2
TL
8286 for (const auto& dir : dirs) {
8287 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
8288 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
7c673cae
FG
8289 force_journal = true;
8290 break;
8291 } else
11fdf7f2 8292 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
7c673cae
FG
8293 }
8294 } else {
8295 // see if any children of our frags are auth subtrees.
11fdf7f2
TL
8296 std::vector<CDir*> subtrees;
8297 mdcache->get_subtrees(subtrees);
8298 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
8299 for (const auto& dir : dirs) {
8300 for (const auto& subtree : subtrees) {
8301 if (dir->contains(subtree)) {
8302 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
8303 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
8304 << *subtree << dendl;
7c673cae
FG
8305 force_journal = true;
8306 break;
8307 } else
11fdf7f2 8308 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
7c673cae 8309 } else
11fdf7f2 8310 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
7c673cae
FG
8311 }
8312 if (force_journal)
8313 break;
8314 }
8315 }
8316 return force_journal;
8317}
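/*
 * Example (sketch): we are not auth for a directory being renamed, but
 * one of its dirfrags (or, in the non-empty case, a subtree nested
 * beneath one) is an auth subtree root on this rank. Journal replay
 * here must be able to re-open that subtree bound under the renamed
 * directory, so the rename is force-journaled locally even though the
 * dentries themselves are journaled on another rank.
 */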
8318
8319void Server::_rename_prepare(MDRequestRef& mdr,
8320 EMetaBlob *metablob, bufferlist *client_map_bl,
8321 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8322{
8323 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8324 if (straydn)
8325 dout(10) << " straydn " << *straydn << dendl;
8326
8327 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8328 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8329 CInode *srci = srcdnl->get_inode();
8330 CInode *oldin = destdnl->get_inode();
8331
8332 // primary+remote link merge?
11fdf7f2
TL
8333 bool linkmerge = (srci == oldin);
8334 if (linkmerge)
8335 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
7c673cae
FG
8336 bool silent = srcdn->get_dir()->inode->is_stray();
8337
8338 bool force_journal_dest = false;
8339 if (srci->is_dir() && !destdn->is_auth()) {
8340 if (srci->is_auth()) {
8341 // if we are auth for srci and exporting it, force journal because journal replay needs
8342 // the source inode to create auth subtrees.
8343 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
8344 force_journal_dest = true;
8345 } else
8346 force_journal_dest = _need_force_journal(srci, false);
8347 }
8348
8349 bool force_journal_stray = false;
8350 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
8351 force_journal_stray = _need_force_journal(oldin, true);
8352
8353 if (linkmerge)
8354 dout(10) << " merging remote and primary links to the same inode" << dendl;
8355 if (silent)
8356 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
8357 if (force_journal_dest)
8358 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
8359 if (force_journal_stray)
8360 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
8361
8362 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
8363 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
8364 metablob->renamed_dirino = srci->ino();
8365 } else if (oldin && oldin->is_dir() && force_journal_stray) {
8366 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
8367 metablob->renamed_dirino = oldin->ino();
8368 }
8369
8370 // prepare
94b18763
FG
8371 CInode::mempool_inode *spi = 0; // renamed inode
8372 CInode::mempool_inode *tpi = 0; // target/overwritten inode
7c673cae
FG
8373
8374 // target inode
8375 if (!linkmerge) {
8376 if (destdnl->is_primary()) {
11fdf7f2 8377 ceph_assert(straydn); // moving to straydn.
7c673cae
FG
8378 // link--, and move.
8379 if (destdn->is_auth()) {
94b18763
FG
8380 auto &pi= oldin->project_inode(); //project_snaprealm
8381 pi.inode.version = straydn->pre_dirty(pi.inode.version);
8382 pi.inode.update_backtrace();
8383 tpi = &pi.inode;
7c673cae
FG
8384 }
8385 straydn->push_projected_linkage(oldin);
8386 } else if (destdnl->is_remote()) {
8387 // nlink-- targeti
8388 if (oldin->is_auth()) {
94b18763
FG
8389 auto &pi = oldin->project_inode();
8390 pi.inode.version = oldin->pre_dirty();
8391 tpi = &pi.inode;
7c673cae
FG
8392 }
8393 }
8394 }
8395
8396 // dest
8397 if (srcdnl->is_remote()) {
8398 if (!linkmerge) {
8399 // destdn
8400 if (destdn->is_auth())
8401 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
8402 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8403 // srci
8404 if (srci->is_auth()) {
94b18763
FG
8405 auto &pi = srci->project_inode();
8406 pi.inode.version = srci->pre_dirty();
8407 spi = &pi.inode;
7c673cae
FG
8408 }
8409 } else {
8410 dout(10) << " will merge remote onto primary link" << dendl;
8411 if (destdn->is_auth()) {
94b18763
FG
8412 auto &pi = oldin->project_inode();
8413 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
8414 spi = &pi.inode;
7c673cae
FG
8415 }
8416 }
8417 } else { // primary
8418 if (destdn->is_auth()) {
8419 version_t oldpv;
8420 if (srcdn->is_auth())
8421 oldpv = srci->get_projected_version();
8422 else {
8423 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
8424
8425 // note which dirfrags have child subtrees in the journal
8426 // event, so that we can open those (as bounds) during replay.
8427 if (srci->is_dir()) {
9f95a23c
TL
8428 auto&& ls = srci->get_dirfrags();
8429 for (const auto& dir : ls) {
7c673cae
FG
8430 if (!dir->is_auth())
8431 metablob->renamed_dir_frags.push_back(dir->get_frag());
8432 }
8433 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
8434 }
8435 }
94b18763 8436 auto &pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
7c673cae 8437 // & srcdnl->snaprealm
94b18763
FG
8438 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
8439 pi.inode.update_backtrace();
8440 spi = &pi.inode;
7c673cae
FG
8441 }
8442 destdn->push_projected_linkage(srci);
8443 }
8444
8445 // src
8446 if (srcdn->is_auth())
8447 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
8448 srcdn->push_projected_linkage(); // push null linkage
8449
8450 if (!silent) {
94b18763 8451 if (spi) {
91327a77
AA
8452 spi->ctime = mdr->get_op_stamp();
8453 if (mdr->get_op_stamp() > spi->rstat.rctime)
8454 spi->rstat.rctime = mdr->get_op_stamp();
94b18763 8455 spi->change_attr++;
7c673cae 8456 if (linkmerge)
94b18763 8457 spi->nlink--;
7c673cae
FG
8458 }
8459 if (tpi) {
91327a77
AA
8460 tpi->ctime = mdr->get_op_stamp();
8461 if (mdr->get_op_stamp() > tpi->rstat.rctime)
8462 tpi->rstat.rctime = mdr->get_op_stamp();
7c673cae 8463 tpi->change_attr++;
94b18763
FG
8464 {
8465 std::string t;
8466 destdn->make_path_string(t, true);
11fdf7f2 8467 tpi->stray_prior_path = std::move(t);
94b18763 8468 }
7c673cae
FG
8469 tpi->nlink--;
8470 if (tpi->nlink == 0)
8471 oldin->state_set(CInode::STATE_ORPHAN);
8472 }
8473 }
8474
8475 // prepare nesting, mtime updates
8476 int predirty_dir = silent ? 0:PREDIRTY_DIR;
8477
8478 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8479 // then link the source inode to destdn
8480 if (destdnl->is_primary()) {
11fdf7f2 8481 ceph_assert(straydn);
7c673cae
FG
8482 if (straydn->is_auth()) {
8483 metablob->add_dir_context(straydn->get_dir());
8484 metablob->add_dir(straydn->get_dir(), true);
8485 }
8486 }
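  // Replay-order sketch: the metablob replays dir lumps in the order
  // they were added, so adding the stray dir before destdn's dir means
  // replay re-links the overwritten inode under the stray first and
  // never sees one inode with two primary links.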
8487
8488 // sub off target
8489 if (destdn->is_auth() && !destdnl->is_null()) {
8490 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
8491 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
224ce89b 8492 if (destdnl->is_primary()) {
11fdf7f2 8493 ceph_assert(straydn);
7c673cae
FG
8494 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
8495 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
224ce89b 8496 }
7c673cae
FG
8497 }
8498
8499 // move srcdn
8500 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
8501 int flags = predirty_dir | predirty_primary;
8502 if (srcdn->is_auth())
8503 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
8504 if (destdn->is_auth())
8505 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
8506
7c673cae
FG
8507 // add it all to the metablob
8508 // target inode
8509 if (!linkmerge) {
8510 if (destdnl->is_primary()) {
11fdf7f2 8511 ceph_assert(straydn);
7c673cae
FG
8512 if (destdn->is_auth()) {
8513 // project snaprealm, too
11fdf7f2
TL
8514 if (auto& desti_srnode = mdr->more()->desti_srnode) {
8515 oldin->project_snaprealm(desti_srnode);
8516 if (tpi->nlink == 0)
8517 ceph_assert(!desti_srnode->is_parent_global());
8518 desti_srnode = NULL;
8519 }
8520 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae
FG
8521 metablob->add_primary_dentry(straydn, oldin, true, true);
8522 } else if (force_journal_stray) {
8523 dout(10) << " forced journaling straydn " << *straydn << dendl;
8524 metablob->add_dir_context(straydn->get_dir());
8525 metablob->add_primary_dentry(straydn, oldin, true);
8526 }
8527 } else if (destdnl->is_remote()) {
8528 if (oldin->is_auth()) {
11fdf7f2
TL
8529 sr_t *new_srnode = NULL;
8530 if (mdr->slave_request) {
8531 if (mdr->slave_request->desti_snapbl.length() > 0) {
8532 new_srnode = new sr_t();
8533 auto p = mdr->slave_request->desti_snapbl.cbegin();
8534 decode(*new_srnode, p);
8535 }
8536 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8537 new_srnode = desti_srnode;
8538 desti_srnode = NULL;
8539 }
8540 if (new_srnode) {
8541 oldin->project_snaprealm(new_srnode);
8542 if (tpi->nlink == 0)
8543 ceph_assert(!new_srnode->is_parent_global());
8544 }
7c673cae
FG
8545 // auth for targeti
8546 metablob->add_dir_context(oldin->get_projected_parent_dir());
8547 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
8548 CEPH_NOSNAP, 0, destdnl);
8549 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
8550 }
8551 }
8552 }
8553
8554 // dest
8555 if (srcdnl->is_remote()) {
11fdf7f2
TL
8556 ceph_assert(!linkmerge);
8557 if (destdn->is_auth() && !destdnl->is_null())
8558 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8559 else
8560 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae 8561
11fdf7f2
TL
8562 if (destdn->is_auth())
8563 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8564
8565 if (srci->is_auth() ) { // it's remote
8566 if (mdr->slave_request) {
8567 if (mdr->slave_request->srci_snapbl.length() > 0) {
8568 sr_t *new_srnode = new sr_t();
8569 auto p = mdr->slave_request->srci_snapbl.cbegin();
8570 decode(*new_srnode, p);
8571 srci->project_snaprealm(new_srnode);
8572 }
8573 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8574 srci->project_snaprealm(srci_srnode);
8575 srci_srnode = NULL;
7c673cae 8576 }
7c673cae 8577
11fdf7f2
TL
8578 CDentry *srci_pdn = srci->get_projected_parent_dn();
8579 metablob->add_dir_context(srci_pdn->get_dir());
8580 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn, CEPH_NOSNAP, 0, srcdnl);
8581 metablob->add_primary_dentry(srci_pdn, srci, true);
7c673cae
FG
8582 }
8583 } else if (srcdnl->is_primary()) {
8584 // project snap parent update?
11fdf7f2
TL
8585 if (destdn->is_auth()) {
8586 if (auto& srci_srnode = mdr->more()->srci_srnode) {
8587 srci->project_snaprealm(srci_srnode);
8588 srci_srnode = NULL;
8589 }
8590 }
7c673cae
FG
8591
8592 if (destdn->is_auth() && !destdnl->is_null())
8593 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
11fdf7f2
TL
8594
8595 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7c673cae
FG
8596
8597 if (destdn->is_auth())
8598 metablob->add_primary_dentry(destdn, srci, true, true);
8599 else if (force_journal_dest) {
8600 dout(10) << " forced journaling destdn " << *destdn << dendl;
8601 metablob->add_dir_context(destdn->get_dir());
8602 metablob->add_primary_dentry(destdn, srci, true);
8603 if (srcdn->is_auth() && srci->is_dir()) {
8604 // journal new subtrees root dirfrags
9f95a23c
TL
8605 auto&& ls = srci->get_dirfrags();
8606 for (const auto& dir : ls) {
7c673cae
FG
8607 if (dir->is_auth())
8608 metablob->add_dir(dir, true);
8609 }
8610 }
8611 }
8612 }
8613
8614 // src
8615 if (srcdn->is_auth()) {
8616 dout(10) << " journaling srcdn " << *srcdn << dendl;
8617 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
 8618 // also journal the inode in case we need to do slave rename rollback. It is OK to add
8619 // both primary and NULL dentries. Because during journal replay, null dentry is
8620 // processed after primary dentry.
8621 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
8622 metablob->add_primary_dentry(srcdn, srci, true);
8623 metablob->add_null_dentry(srcdn, true);
8624 } else
8625 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
8626
8627 // make renamed inode first track the dn
11fdf7f2
TL
8628 if (srcdnl->is_primary() && destdn->is_auth()) {
8629 ceph_assert(srci->first <= destdn->first);
8630 srci->first = destdn->first;
8631 }
8632 // make stray inode first track the straydn
8633 if (straydn && straydn->is_auth()) {
8634 ceph_assert(oldin->first <= straydn->first);
8635 oldin->first = straydn->first;
8636 }
7c673cae 8637
224ce89b 8638 if (oldin && oldin->is_dir()) {
11fdf7f2 8639 ceph_assert(straydn);
7c673cae 8640 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
224ce89b 8641 }
7c673cae
FG
8642 if (srci->is_dir())
8643 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
8644
8645}
8646
8647
8648void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8649{
8650 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8651 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
8652
8653 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8654 CDentry::linkage_t *destdnl = destdn->get_linkage();
8655
8656 CInode *oldin = destdnl->get_inode();
7c673cae
FG
8657
8658 // primary+remote link merge?
11fdf7f2
TL
8659 bool linkmerge = (srcdnl->get_inode() == oldin);
8660 if (linkmerge)
8661 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
8662
8663 bool new_in_snaprealm = false;
8664 bool new_oldin_snaprealm = false;
7c673cae
FG
8665
8666 // target inode
8667 if (!linkmerge) {
8668 if (destdnl->is_primary()) {
11fdf7f2 8669 ceph_assert(straydn);
7c673cae 8670 dout(10) << "straydn is " << *straydn << dendl;
11fdf7f2
TL
8671
8672 // if there is newly created snaprealm, need to split old snaprealm's
8673 // inodes_with_caps. So pop snaprealm before linkage changes.
8674 if (destdn->is_auth()) {
8675 bool hadrealm = (oldin->snaprealm ? true : false);
8676 oldin->early_pop_projected_snaprealm();
8677 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
8678 } else {
8679 ceph_assert(mdr->slave_request);
8680 if (mdr->slave_request->desti_snapbl.length()) {
8681 new_oldin_snaprealm = !oldin->snaprealm;
8682 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8683 ceph_assert(oldin->snaprealm);
8684 ceph_assert(oldin->snaprealm->have_past_parents_open());
8685 }
8686 }
8687
31f18b77 8688 destdn->get_dir()->unlink_inode(destdn, false);
7c673cae
FG
8689
8690 straydn->pop_projected_linkage();
8691 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
11fdf7f2 8692 ceph_assert(!straydn->is_projected()); // no other projected
7c673cae
FG
8693
8694 // nlink-- targeti
11fdf7f2 8695 if (destdn->is_auth())
7c673cae 8696 oldin->pop_and_dirty_projected_inode(mdr->ls);
11fdf7f2
TL
8697
8698 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
7c673cae 8699 } else if (destdnl->is_remote()) {
31f18b77 8700 destdn->get_dir()->unlink_inode(destdn, false);
11fdf7f2
TL
8701 if (oldin->is_auth()) {
8702 oldin->pop_and_dirty_projected_inode(mdr->ls);
8703 } else if (mdr->slave_request) {
8704 if (mdr->slave_request->desti_snapbl.length() > 0) {
8705 ceph_assert(oldin->snaprealm);
8706 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8707 }
8708 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8709 delete desti_srnode;
8710 desti_srnode = NULL;
8711 }
7c673cae
FG
8712 }
8713 }
8714
8715 // unlink src before we relink it at dest
8716 CInode *in = srcdnl->get_inode();
11fdf7f2 8717 ceph_assert(in);
7c673cae
FG
8718
8719 bool srcdn_was_remote = srcdnl->is_remote();
11fdf7f2
TL
8720 if (!srcdn_was_remote) {
8721 // if there is newly created snaprealm, need to split old snaprealm's
8722 // inodes_with_caps. So pop snaprealm before linkage changes.
8723 if (destdn->is_auth()) {
8724 bool hadrealm = (in->snaprealm ? true : false);
8725 in->early_pop_projected_snaprealm();
8726 new_in_snaprealm = (in->snaprealm && !hadrealm);
8727 } else {
8728 ceph_assert(mdr->slave_request);
8729 if (mdr->slave_request->srci_snapbl.length()) {
8730 new_in_snaprealm = !in->snaprealm;
8731 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8732 ceph_assert(in->snaprealm);
8733 ceph_assert(in->snaprealm->have_past_parents_open());
8734 }
8735 }
8736 }
8737
7c673cae
FG
8738 srcdn->get_dir()->unlink_inode(srcdn);
8739
8740 // dest
8741 if (srcdn_was_remote) {
8742 if (!linkmerge) {
8743 // destdn
8744 destdnl = destdn->pop_projected_linkage();
8745 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
11fdf7f2 8746 ceph_assert(!destdn->is_projected()); // no other projected
7c673cae
FG
8747
8748 destdn->link_remote(destdnl, in);
8749 if (destdn->is_auth())
8750 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
8751 // in
11fdf7f2 8752 if (in->is_auth()) {
7c673cae 8753 in->pop_and_dirty_projected_inode(mdr->ls);
11fdf7f2
TL
8754 } else if (mdr->slave_request) {
8755 if (mdr->slave_request->srci_snapbl.length() > 0) {
8756 ceph_assert(in->snaprealm);
8757 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8758 }
8759 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8760 delete srci_srnode;
8761 srci_srnode = NULL;
8762 }
7c673cae
FG
8763 } else {
8764 dout(10) << "merging remote onto primary link" << dendl;
8765 oldin->pop_and_dirty_projected_inode(mdr->ls);
8766 }
8767 } else { // primary
8768 if (linkmerge) {
8769 dout(10) << "merging primary onto remote link" << dendl;
31f18b77 8770 destdn->get_dir()->unlink_inode(destdn, false);
7c673cae
FG
8771 }
8772 destdnl = destdn->pop_projected_linkage();
8773 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
11fdf7f2 8774 ceph_assert(!destdn->is_projected()); // no other projected
7c673cae
FG
8775
8776 // srcdn inode import?
8777 if (!srcdn->is_auth() && destdn->is_auth()) {
11fdf7f2 8778 ceph_assert(mdr->more()->inode_import.length() > 0);
7c673cae
FG
8779
8780 map<client_t,Capability::Import> imported_caps;
8781
8782 // finish cap imports
28e407b8 8783 finish_force_open_sessions(mdr->more()->imported_session_map);
7c673cae
FG
8784 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
8785 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
28e407b8
AA
8786 mdr->more()->srcdn_auth_mds, true,
8787 mdr->more()->imported_session_map,
8788 mdr->more()->cap_imports[destdnl->get_inode()],
8789 imported_caps);
7c673cae
FG
8790 }
8791
8792 mdr->more()->inode_import.clear();
11fdf7f2 8793 encode(imported_caps, mdr->more()->inode_import);
7c673cae
FG
8794
8795 /* hack: add an auth pin for each xlock we hold. These were
8796 * remote xlocks previously but now they're local and
8797 * we're going to try and unpin when we xlock_finish. */
11fdf7f2
TL
8798
8799 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
8800 i != mdr->locks.end();
8801 ++i) {
8802 SimpleLock *lock = i->lock;
8803 if (lock->get_parent() != destdnl->get_inode())
8804 break;
8805 if (i->is_xlock() && !lock->is_locallock())
8806 mds->locker->xlock_import(lock);
8807 }
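      // (sketch: before the import these xlocks were held on the old
      // auth rank; now that we are auth, xlock_import() converts each
      // into a locally-held xlock backed by a fresh auth pin, which the
      // eventual xlock_finish will drop as usual.)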
7c673cae
FG
8808
8809 // hack: fix auth bit
8810 in->state_set(CInode::STATE_AUTH);
7c673cae
FG
8811
8812 mdr->clear_ambiguous_auth();
8813 }
8814
11fdf7f2 8815 if (destdn->is_auth())
7c673cae 8816 in->pop_and_dirty_projected_inode(mdr->ls);
7c673cae
FG
8817 }
8818
8819 // src
8820 if (srcdn->is_auth())
8821 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
8822 srcdn->pop_projected_linkage();
8823 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
11fdf7f2 8824 ceph_assert(!srcdn->is_projected()); // no other projected
7c673cae
FG
8825
8826 // apply remaining projected inodes (nested)
8827 mdr->apply();
8828
8829 // update subtree map?
11fdf7f2 8830 if (destdnl->is_primary() && in->is_dir())
224ce89b 8831 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
7c673cae
FG
8832
8833 if (straydn && oldin->is_dir())
8834 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
8835
11fdf7f2
TL
8836 if (new_oldin_snaprealm)
8837 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
8838 if (new_in_snaprealm)
8839 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
8840
7c673cae
FG
8841 // removing a new dn?
8842 if (srcdn->is_auth())
8843 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
8844}
8845
8846
8847
8848// ------------
8849// SLAVE
8850
8851class C_MDS_SlaveRenamePrep : public ServerLogContext {
8852 CDentry *srcdn, *destdn, *straydn;
8853public:
8854 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8855 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
8856 void finish(int r) override {
8857 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
8858 }
8859};
8860
8861class C_MDS_SlaveRenameCommit : public ServerContext {
8862 MDRequestRef mdr;
8863 CDentry *srcdn, *destdn, *straydn;
8864public:
8865 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8866 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
8867 void finish(int r) override {
8868 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
8869 }
8870};
8871
8872class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
8873 MDRequestRef mdr;
8874public:
8875 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
8876 ServerContext(s), mdr(r) {}
8877 void finish(int r) override {
8878 server->_slave_rename_sessions_flushed(mdr);
8879 }
8880};
8881
7c673cae
FG
8882void Server::handle_slave_rename_prep(MDRequestRef& mdr)
8883{
8884 dout(10) << "handle_slave_rename_prep " << *mdr
8885 << " " << mdr->slave_request->srcdnpath
8886 << " to " << mdr->slave_request->destdnpath
8887 << dendl;
31f18b77
FG
8888
8889 if (mdr->slave_request->is_interrupted()) {
8890 dout(10) << " slave request interrupted, sending noop reply" << dendl;
9f95a23c 8891 auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
31f18b77
FG
8892 reply->mark_interrupted();
8893 mds->send_message_mds(reply, mdr->slave_to_mds);
91327a77 8894 mdr->reset_slave_request();
31f18b77
FG
8895 return;
8896 }
8897
7c673cae
FG
8898 // discover destdn
8899 filepath destpath(mdr->slave_request->destdnpath);
8900 dout(10) << " dest " << destpath << dendl;
8901 vector<CDentry*> trace;
9f95a23c
TL
8902 CF_MDS_MDRContextFactory cf(mdcache, mdr, false);
8903 int r = mdcache->path_traverse(mdr, cf, destpath,
8904 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
8905 &trace);
7c673cae
FG
8906 if (r > 0) return;
8907 if (r == -ESTALE) {
8908 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
9f95a23c 8909 mdr->slave_to_mds, true);
7c673cae
FG
8910 return;
8911 }
11fdf7f2 8912 ceph_assert(r == 0); // we shouldn't get an error here!
7c673cae 8913
91327a77 8914 CDentry *destdn = trace.back();
7c673cae
FG
8915 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8916 dout(10) << " destdn " << *destdn << dendl;
8917 mdr->pin(destdn);
8918
8919 // discover srcdn
8920 filepath srcpath(mdr->slave_request->srcdnpath);
8921 dout(10) << " src " << srcpath << dendl;
8922 CInode *srci = nullptr;
9f95a23c
TL
8923 r = mdcache->path_traverse(mdr, cf, srcpath,
8924 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
8925 &trace, &srci);
7c673cae 8926 if (r > 0) return;
11fdf7f2 8927 ceph_assert(r == 0);
7c673cae 8928
91327a77 8929 CDentry *srcdn = trace.back();
7c673cae
FG
8930 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8931 dout(10) << " srcdn " << *srcdn << dendl;
8932 mdr->pin(srcdn);
8933 mdr->pin(srci);
8934
8935 // stray?
11fdf7f2
TL
8936 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8937 if (linkmerge)
8938 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
7c673cae
FG
8939 CDentry *straydn = mdr->straydn;
8940 if (destdnl->is_primary() && !linkmerge)
11fdf7f2 8941 ceph_assert(straydn);
7c673cae
FG
8942
8943 mdr->set_op_stamp(mdr->slave_request->op_stamp);
8944 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
8945
8946 // set up commit waiter (early, to clean up any freezing etc we do)
8947 if (!mdr->more()->slave_commit)
8948 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
8949
8950 // am i srcdn auth?
8951 if (srcdn->is_auth()) {
8952 set<mds_rank_t> srcdnrep;
8953 srcdn->list_replicas(srcdnrep);
8954
8955 bool reply_witness = false;
8956 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
8957 // freeze?
8958 // we need this to
8959 // - avoid conflicting lock state changes
8960 // - avoid concurrent updates to the inode
8961 // (this could also be accomplished with the versionlock)
11fdf7f2 8962 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
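      // (sketch: freeze_inode(allowance) only completes once the
      // inode's auth-pin count drops to the stated allowance; any extra
      // pin, e.g. another in-flight request, leaves us waiting on
      // WAIT_FROZEN below.)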
7c673cae
FG
8963 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
8964 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
8965
8966 // unfreeze auth pin after freezing the inode to avoid queueing waiters
8967 if (srcdnl->get_inode()->is_frozen_auth_pin())
8968 mdr->unfreeze_auth_pin();
8969
8970 if (!frozen_inode) {
8971 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
8972 return;
8973 }
8974
8975 /*
8976 * set ambiguous auth for srci
8977 * NOTE: we don't worry about ambiguous cache expire as we do
8978 * with subtree migrations because all slaves will pin
 8979 * srcdn->get_inode() for the duration of this rename.
8980 */
8981 mdr->set_ambiguous_auth(srcdnl->get_inode());
8982
8983 // just mark the source inode as ambiguous auth if more than two MDS are involved.
8984 // the master will send another OP_RENAMEPREP slave request later.
8985 if (mdr->slave_request->witnesses.size() > 1) {
8986 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
8987 reply_witness = true;
8988 }
8989
8990 // make sure bystanders have received all lock related messages
8991 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
8992 if (*p == mdr->slave_to_mds ||
8993 (mds->is_cluster_degraded() &&
8994 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
8995 continue;
9f95a23c 8996 auto notify = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMENOTIFY);
7c673cae
FG
8997 mds->send_message_mds(notify, *p);
8998 mdr->more()->waiting_on_slave.insert(*p);
8999 }
9000
9001 // make sure clients have received all cap related messages
9002 set<client_t> export_client_set;
9003 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
9004
9005 MDSGatherBuilder gather(g_ceph_context);
9006 flush_client_sessions(export_client_set, gather);
9007 if (gather.has_subs()) {
9008 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
9009 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
9010 gather.activate();
9011 }
9012 }
9013
9014 // is witness list sufficient?
9015 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9016 if (*p == mdr->slave_to_mds ||
9017 mdr->slave_request->witnesses.count(*p)) continue;
9018 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
9019 reply_witness = true;
9020 break;
9021 }
9022
9023 if (reply_witness) {
11fdf7f2 9024 ceph_assert(!srcdnrep.empty());
9f95a23c 9025 auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7c673cae
FG
9026 reply->witnesses.swap(srcdnrep);
9027 mds->send_message_mds(reply, mdr->slave_to_mds);
91327a77 9028 mdr->reset_slave_request();
7c673cae
FG
9029 return;
9030 }
9031 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
9032 if (!mdr->more()->waiting_on_slave.empty()) {
9033 dout(10) << " still waiting for rename notify acks from "
9034 << mdr->more()->waiting_on_slave << dendl;
9035 return;
9036 }
9037 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
9038 // set ambiguous auth for srci on witnesses
9039 mdr->set_ambiguous_auth(srcdnl->get_inode());
9040 }
9041
9042 // encode everything we'd need to roll this back... basically, just the original state.
9043 rename_rollback rollback;
9044
9045 rollback.reqid = mdr->reqid;
9046
9047 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
9048 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9049 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2 9050 rollback.orig_src.dname = srcdn->get_name();
7c673cae
FG
9051 if (srcdnl->is_primary())
9052 rollback.orig_src.ino = srcdnl->get_inode()->ino();
9053 else {
11fdf7f2 9054 ceph_assert(srcdnl->is_remote());
7c673cae
FG
9055 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
9056 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
9057 }
9058
9059 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
9060 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9061 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2 9062 rollback.orig_dest.dname = destdn->get_name();
7c673cae
FG
9063 if (destdnl->is_primary())
9064 rollback.orig_dest.ino = destdnl->get_inode()->ino();
9065 else if (destdnl->is_remote()) {
9066 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
9067 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
9068 }
9069
9070 if (straydn) {
9071 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
9072 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
9073 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
11fdf7f2
TL
9074 rollback.stray.dname = straydn->get_name();
9075 }
9076 if (mdr->slave_request->desti_snapbl.length()) {
9077 CInode *oldin = destdnl->get_inode();
9078 if (oldin->snaprealm) {
9079 encode(true, rollback.desti_snapbl);
9080 oldin->encode_snap_blob(rollback.desti_snapbl);
9081 } else {
9082 encode(false, rollback.desti_snapbl);
9083 }
9084 }
9085 if (mdr->slave_request->srci_snapbl.length()) {
9086 if (srci->snaprealm) {
9087 encode(true, rollback.srci_snapbl);
9088 srci->encode_snap_blob(rollback.srci_snapbl);
9089 } else {
9090 encode(false, rollback.srci_snapbl);
9091 }
7c673cae 9092 }
11fdf7f2
TL
9093 encode(rollback, mdr->more()->rollback_bl);
9094 // FIXME: rollback snaprealm
7c673cae
FG
9095 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
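  /*
   * Rollback-record sketch: orig_src/orig_dest/stray capture just
   * enough pre-rename state (dirfrag, old fragstat mtime/rctime, dentry
   * name, and the primary or remote ino) for do_rename_rollback() to
   * re-link the original dentries and repair directory timestamps if
   * the master dies before committing; the snap blobs record whether
   * each inode already had a snaprealm so a projected realm can be
   * undone as well.
   */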
9096
9097 // journal.
9098 mdr->ls = mdlog->get_current_segment();
9099 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
9100 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
9101 mdlog->start_entry(le);
9102 le->rollback = mdr->more()->rollback_bl;
9103
9104 bufferlist blah; // inode import data... obviously not used if we're the slave
9105 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
9106
9107 if (le->commit.empty()) {
9108 dout(10) << " empty metablob, skipping journal" << dendl;
9109 mdlog->cancel_entry(le);
9110 mdr->ls = NULL;
9111 _logged_slave_rename(mdr, srcdn, destdn, straydn);
9112 } else {
e306af50 9113 mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
7c673cae
FG
9114 mdr->more()->slave_update_journaled = true;
9115 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
9116 mdr, __func__);
9117 mdlog->flush();
9118 }
9119}
9120
9121void Server::_logged_slave_rename(MDRequestRef& mdr,
9122 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9123{
9124 dout(10) << "_logged_slave_rename " << *mdr << dendl;
9125
9126 // prepare ack
9f95a23c 9127 ref_t<MMDSSlaveRequest> reply;
7c673cae 9128 if (!mdr->aborted) {
9f95a23c 9129 reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
7c673cae
FG
9130 if (!mdr->more()->slave_update_journaled)
9131 reply->mark_not_journaled();
9132 }
9133
9134 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
7c673cae
FG
9135 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
9136
9137 // export srci?
9138 if (srcdn->is_auth() && srcdnl->is_primary()) {
9139 // set export bounds for CInode::encode_export()
11fdf7f2 9140 if (reply) {
9f95a23c 9141 std::vector<CDir*> bounds;
11fdf7f2
TL
9142 if (srcdnl->get_inode()->is_dir()) {
9143 srcdnl->get_inode()->get_dirfrags(bounds);
9f95a23c
TL
9144 for (const auto& bound : bounds) {
9145 bound->state_set(CDir::STATE_EXPORTBOUND);
9146 }
11fdf7f2 9147 }
7c673cae 9148
11fdf7f2
TL
9149 map<client_t,entity_inst_t> exported_client_map;
9150 map<client_t, client_metadata_t> exported_client_metadata_map;
9151 bufferlist inodebl;
9152 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
9153 exported_client_map,
9154 exported_client_metadata_map);
7c673cae 9155
9f95a23c
TL
9156 for (const auto& bound : bounds) {
9157 bound->state_clear(CDir::STATE_EXPORTBOUND);
9158 }
7c673cae 9159
11fdf7f2
TL
9160 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
9161 encode(exported_client_metadata_map, reply->inode_export);
7c673cae
FG
9162 reply->inode_export.claim_append(inodebl);
9163 reply->inode_export_v = srcdnl->get_inode()->inode.version;
9164 }
9165
9166 // remove mdr auth pin
9167 mdr->auth_unpin(srcdnl->get_inode());
9168 mdr->more()->is_inode_exporter = true;
9169
9170 if (srcdnl->get_inode()->is_dirty())
9171 srcdnl->get_inode()->mark_clean();
9172
9173 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
9174 }
9175
9176 // apply
9177 _rename_apply(mdr, srcdn, destdn, straydn);
11fdf7f2
TL
9178
9179 CDentry::linkage_t *destdnl = destdn->get_linkage();
7c673cae
FG
9180
9181 // bump popularity
11fdf7f2 9182 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
7c673cae 9183 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
11fdf7f2 9184 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
7c673cae
FG
9185
9186 // done.
91327a77 9187 mdr->reset_slave_request();
7c673cae
FG
9188 mdr->straydn = 0;
9189
9190 if (reply) {
9191 mds->send_message_mds(reply, mdr->slave_to_mds);
9192 } else {
11fdf7f2 9193 ceph_assert(mdr->aborted);
7c673cae
FG
9194 dout(10) << " abort flag set, finishing" << dendl;
9195 mdcache->request_finish(mdr);
9196 }
9197}
9198
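/*
 * Outcome sketch: r == 0 means the master committed, so we drop any
 * exported xlocks, unfreeze and clear ambiguous auth, and journal an
 * ESlaveUpdate::OP_COMMIT (or finish straight away if no prepare was
 * journaled); r < 0 means the master aborted, so we undo via
 * do_rename_rollback() using rollback_bl, or simply unfreeze and finish
 * the request if no rollback blob was ever produced.
 */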
void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
                                  CDentry *srcdn, CDentry *destdn, CDentry *straydn)
{
  dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;

  CInode *in = destdn->get_linkage()->get_inode();

  inodeno_t migrated_stray;
  if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
    migrated_stray = in->ino();

  MDSContext::vec finished;
  if (r == 0) {
    // unfreeze+singleauth inode
    //  hmm, do i really need to delay this?
    if (mdr->more()->is_inode_exporter) {
      // drop our pins
      // we exported, clear out any xlocks that we moved to another MDS

      for (auto i = mdr->locks.lower_bound(&in->versionlock);
           i != mdr->locks.end(); ) {
        SimpleLock *lock = i->lock;
        if (lock->get_parent() != in)
          break;
        // we only care about xlocks on the exported inode
        if (i->is_xlock() && !lock->is_locallock())
          mds->locker->xlock_export(i++, mdr.get());
        else
          ++i;
      }

      map<client_t,Capability::Import> peer_imported;
      auto bp = mdr->more()->inode_import.cbegin();
      decode(peer_imported, bp);

      dout(10) << " finishing inode export on " << *in << dendl;
      mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished);
      mds->queue_waiters(finished);   // this includes SINGLEAUTH waiters.

      // unfreeze
      ceph_assert(in->is_frozen_inode());
      in->unfreeze_inode(finished);
    }

    // singleauth
    if (mdr->more()->is_ambiguous_auth) {
      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }

    if (straydn && mdr->more()->slave_update_journaled) {
      CInode *strayin = straydn->get_projected_linkage()->get_inode();
      if (strayin && !strayin->snaprealm)
        mdcache->clear_dirty_bits_for_stray(strayin);
    }

    mds->queue_waiters(finished);
    mdr->cleanup();

    if (mdr->more()->slave_update_journaled) {
      // write a commit to the journal
      ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
                                          mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
                                          ESlaveUpdate::RENAME);
      mdlog->start_entry(le);
      submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
      mdlog->flush();
    } else {
      _committed_slave(mdr);
    }
  } else {

    // abort
    //  rollback_bl may be empty if we froze the inode but had to provide an expanded
    //  witness list from the master, and they failed before we tried prep again.
    if (mdr->more()->rollback_bl.length()) {
      if (mdr->more()->is_inode_exporter) {
        dout(10) << " reversing inode export of " << *in << dendl;
        in->abort_export();
      }
      if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
        mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
        // rollback but preserve the slave request
        do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
        mdr->more()->rollback_bl.clear();
      } else
        do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
    } else {
      dout(10) << " rollback_bl empty, not rolling back rename (master failed after getting extra witnesses?)" << dendl;
      // singleauth
      if (mdr->more()->is_ambiguous_auth) {
        if (srcdn->is_auth())
          mdr->more()->rename_inode->unfreeze_inode(finished);

        mdr->more()->rename_inode->clear_ambiguous_auth(finished);
        mdr->more()->is_ambiguous_auth = false;
      }
      mds->queue_waiters(finished);
      mdcache->request_finish(mdr);
    }
  }

  if (migrated_stray && mds->is_stopping())
    mdcache->shutdown_export_stray_finish(migrated_stray);
}

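// Flow note (best-effort summary): _commit_slave_rename() resolves a slave
// rename. With r == 0 the master committed: any exported xlocks and client
// caps are handed to the new auth via finish_export_inode(), the inode is
// unfrozen, and either an ESlaveUpdate::OP_COMMIT is journaled or, when the
// prep was never journaled, _committed_slave() is called directly. With
// r != 0 the master aborted or failed: a stashed rollback blob is replayed by
// do_rename_rollback(); an empty rollback_bl is legal when this slave only
// supplied an expanded witness list and never journaled a prep.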
void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
                          bool isdir, int linkunlink, nest_info_t &rstat)
{
  fnode_t *pf;
  pf = dir->project_fnode();
  mut->add_projected_fnode(dir);
  pf->version = dir->pre_dirty();

  if (isdir) {
    pf->fragstat.nsubdirs += linkunlink;
  } else {
    pf->fragstat.nfiles += linkunlink;
  }
  if (r.ino) {
    pf->rstat.rbytes += linkunlink * rstat.rbytes;
    pf->rstat.rfiles += linkunlink * rstat.rfiles;
    pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
    pf->rstat.rsnaps += linkunlink * rstat.rsnaps;
  }
  if (pf->fragstat.mtime == ctime) {
    pf->fragstat.mtime = r.dirfrag_old_mtime;
    if (pf->rstat.rctime == ctime)
      pf->rstat.rctime = r.dirfrag_old_rctime;
  }
  mut->add_updated_lock(&dir->get_inode()->filelock);
  mut->add_updated_lock(&dir->get_inode()->nestlock);
}

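// Accounting note: _rollback_repair_dir() adds (linkunlink = +1) or removes
// (linkunlink = -1) one link's worth of fragstat/rstat on a dirfrag while a
// rename is undone, and restores mtime/rctime only if this operation was the
// last to touch them. Worked example: undoing the unlink of a plain file with
// accounted rstat {rbytes=4096, rfiles=1} calls this with isdir=false and
// linkunlink=+1, so fragstat.nfiles += 1, rstat.rbytes += 4096 and
// rstat.rfiles += 1 on the parent fragment.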
struct C_MDS_LoggedRenameRollback : public ServerLogContext {
  MutationRef mut;
  CDentry *srcdn;
  version_t srcdnpv;
  CDentry *destdn;
  CDentry *straydn;
  map<client_t,ref_t<MClientSnap>> splits[2];
  bool finish_mdr;
  C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
                             CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
                             map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
    ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
    straydn(st), finish_mdr(f) {
    splits[0].swap(_splits[0]);
    splits[1].swap(_splits[1]);
  }
  void finish(int r) override {
    server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
                                    destdn, straydn, splits, finish_mdr);
  }
};

void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
                                bool finish_mdr)
{
  rename_rollback rollback;
  auto p = rbl.cbegin();
  decode(rollback, p);

  dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
  // need to finish this update before sending resolve to claim the subtree
  mdcache->add_rollback(rollback.reqid, master);

  MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
  mut->ls = mds->mdlog->get_current_segment();

  CDentry *srcdn = NULL;
  CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
  if (!srcdir)
    srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
  if (srcdir) {
    dout(10) << " srcdir " << *srcdir << dendl;
    srcdn = srcdir->lookup(rollback.orig_src.dname);
    if (srcdn) {
      dout(10) << " srcdn " << *srcdn << dendl;
      ceph_assert(srcdn->get_linkage()->is_null());
    } else
      dout(10) << " srcdn not found" << dendl;
  } else
    dout(10) << " srcdir not found" << dendl;

  CDentry *destdn = NULL;
  CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
  if (!destdir)
    destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
  if (destdir) {
    dout(10) << " destdir " << *destdir << dendl;
    destdn = destdir->lookup(rollback.orig_dest.dname);
    if (destdn)
      dout(10) << " destdn " << *destdn << dendl;
    else
      dout(10) << " destdn not found" << dendl;
  } else
    dout(10) << " destdir not found" << dendl;

  CInode *in = NULL;
  if (rollback.orig_src.ino) {
    in = mdcache->get_inode(rollback.orig_src.ino);
    if (in && in->is_dir())
      ceph_assert(srcdn && destdn);
  } else
    in = mdcache->get_inode(rollback.orig_src.remote_ino);

  CDir *straydir = NULL;
  CDentry *straydn = NULL;
  if (rollback.stray.dirfrag.ino) {
    straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
    if (straydir) {
      dout(10) << "straydir " << *straydir << dendl;
      straydn = straydir->lookup(rollback.stray.dname);
      if (straydn) {
        dout(10) << " straydn " << *straydn << dendl;
        ceph_assert(straydn->get_linkage()->is_primary());
      } else
        dout(10) << " straydn not found" << dendl;
    } else
      dout(10) << "straydir not found" << dendl;
  }

  CInode *target = NULL;
  if (rollback.orig_dest.ino) {
    target = mdcache->get_inode(rollback.orig_dest.ino);
    if (target)
      ceph_assert(destdn && straydn);
  } else if (rollback.orig_dest.remote_ino)
    target = mdcache->get_inode(rollback.orig_dest.remote_ino);

  // can't use is_auth() in the resolve stage
  mds_rank_t whoami = mds->get_nodeid();
  // slave
  ceph_assert(!destdn || destdn->authority().first != whoami);
  ceph_assert(!straydn || straydn->authority().first != whoami);

  bool force_journal_src = false;
  bool force_journal_dest = false;
  if (in && in->is_dir() && srcdn->authority().first != whoami)
    force_journal_src = _need_force_journal(in, false);
  if (in && target && target->is_dir())
    force_journal_dest = _need_force_journal(in, true);

  version_t srcdnpv = 0;
  // repair src
  if (srcdn) {
    if (srcdn->authority().first == whoami)
      srcdnpv = srcdn->pre_dirty();
    if (rollback.orig_src.ino) {
      ceph_assert(in);
      srcdn->push_projected_linkage(in);
    } else
      srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
                                    rollback.orig_src.remote_d_type);
  }

  map<client_t,ref_t<MClientSnap>> splits[2];

  CInode::mempool_inode *pip = nullptr;
  if (in) {
    bool projected;
    if (in->get_projected_parent_dn()->authority().first == whoami) {
      auto &pi = in->project_inode();
      pip = &pi.inode;
      mut->add_projected_inode(in);
      pip->version = in->pre_dirty();
      projected = true;
    } else {
      pip = in->get_projected_inode();
      projected = false;
    }
    if (pip->ctime == rollback.ctime)
      pip->ctime = rollback.orig_src.old_ctime;

    if (rollback.srci_snapbl.length() && in->snaprealm) {
      bool hadrealm;
      auto p = rollback.srci_snapbl.cbegin();
      decode(hadrealm, p);
      if (hadrealm) {
        if (projected && !mds->is_resolve()) {
          sr_t *new_srnode = new sr_t();
          decode(*new_srnode, p);
          in->project_snaprealm(new_srnode);
        } else
          decode(in->snaprealm->srnode, p);
      } else {
        SnapRealm *realm;
        if (rollback.orig_src.ino) {
          ceph_assert(srcdir);
          realm = srcdir->get_inode()->find_snaprealm();
        } else {
          realm = in->snaprealm->parent;
        }
        if (!mds->is_resolve())
          mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
        if (projected)
          in->project_snaprealm(NULL);
        else
          in->snaprealm->merge_to(realm);
      }
    }
  }

  if (srcdn && srcdn->authority().first == whoami) {
    nest_info_t blah;
    _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
                         in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
  }

  // repair dest
  if (destdn) {
    if (rollback.orig_dest.ino && target) {
      destdn->push_projected_linkage(target);
    } else if (rollback.orig_dest.remote_ino) {
      destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
                                     rollback.orig_dest.remote_d_type);
    } else {
      // the dentry will be trimmed soon, it's ok to have wrong linkage
      if (rollback.orig_dest.ino)
        ceph_assert(mds->is_resolve());
      destdn->push_projected_linkage();
    }
  }

  if (straydn)
    straydn->push_projected_linkage();

  if (target) {
    bool projected;
    CInode::mempool_inode *ti = nullptr;
    if (target->get_projected_parent_dn()->authority().first == whoami) {
      auto &pi = target->project_inode();
      ti = &pi.inode;
      mut->add_projected_inode(target);
      ti->version = target->pre_dirty();
      projected = true;
    } else {
      ti = target->get_projected_inode();
      projected = false;
    }
    if (ti->ctime == rollback.ctime)
      ti->ctime = rollback.orig_dest.old_ctime;
    if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
      if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
        ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
      else
        ceph_assert(rollback.orig_dest.remote_ino &&
                    rollback.orig_dest.remote_ino == rollback.orig_src.ino);
    } else
      ti->nlink++;

    if (rollback.desti_snapbl.length() && target->snaprealm) {
      bool hadrealm;
      auto p = rollback.desti_snapbl.cbegin();
      decode(hadrealm, p);
      if (hadrealm) {
        if (projected && !mds->is_resolve()) {
          sr_t *new_srnode = new sr_t();
          decode(*new_srnode, p);
          target->project_snaprealm(new_srnode);
        } else
          decode(target->snaprealm->srnode, p);
      } else {
        SnapRealm *realm;
        if (rollback.orig_dest.ino) {
          ceph_assert(destdir);
          realm = destdir->get_inode()->find_snaprealm();
        } else {
          realm = target->snaprealm->parent;
        }
        if (!mds->is_resolve())
          mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
        if (projected)
          target->project_snaprealm(NULL);
        else
          target->snaprealm->merge_to(realm);
      }
    }
  }

  if (srcdn)
    dout(0) << " srcdn back to " << *srcdn << dendl;
  if (in)
    dout(0) << " srci back to " << *in << dendl;
  if (destdn)
    dout(0) << " destdn back to " << *destdn << dendl;
  if (target)
    dout(0) << " desti back to " << *target << dendl;

  // journal it
  ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
                                      ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
  mdlog->start_entry(le);

  if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
    le->commit.add_dir_context(srcdir);
    if (rollback.orig_src.ino)
      le->commit.add_primary_dentry(srcdn, 0, true);
    else
      le->commit.add_remote_dentry(srcdn, true);
  }

  if (!rollback.orig_src.ino && // remote linkage
      in && in->authority().first == whoami) {
    le->commit.add_dir_context(in->get_projected_parent_dir());
    le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
  }

  if (force_journal_dest) {
    ceph_assert(rollback.orig_dest.ino);
    le->commit.add_dir_context(destdir);
    le->commit.add_primary_dentry(destdn, 0, true);
  }

  // slave: no need to journal straydn

  if (target && target != in && target->authority().first == whoami) {
    ceph_assert(rollback.orig_dest.remote_ino);
    le->commit.add_dir_context(target->get_projected_parent_dir());
    le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
  }

  if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
    dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = in->ino();
    if (srcdn->authority().first == whoami) {
      auto&& ls = in->get_dirfrags();
      for (const auto& dir : ls) {
        if (!dir->is_auth())
          le->commit.renamed_dir_frags.push_back(dir->get_frag());
      }
      dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
    }
  } else if (force_journal_dest) {
    dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
    le->commit.renamed_dirino = target->ino();
  }

  if (target && target->is_dir()) {
    ceph_assert(destdn);
    mdcache->project_subtree_rename(target, straydir, destdir);
  }

  if (in && in->is_dir()) {
    ceph_assert(srcdn);
    mdcache->project_subtree_rename(in, destdir, srcdir);
  }

  if (mdr && !mdr->more()->slave_update_journaled) {
    ceph_assert(le->commit.empty());
    mdlog->cancel_entry(le);
    mut->ls = NULL;
    _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
  } else {
    ceph_assert(!le->commit.empty());
    if (mdr)
      mdr->more()->slave_update_journaled = false;
    MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
                                                            srcdn, srcdnpv, destdn, straydn,
                                                            splits, finish_mdr);
    submit_mdlog_entry(le, fin, mdr, __func__);
    mdlog->flush();
  }
}

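// Flow note (best-effort summary): do_rename_rollback() decodes the
// rename_rollback blob stashed at prep time and walks everything the rename
// touched (srcdn, destdn, straydn, the renamed inode, any overwritten
// target), pushing projected linkages and inode values that restore the
// pre-rename state. The lookups are deliberately tolerant (the "not found"
// branches above) because during resolve the cache may hold only part of the
// affected namespace. The restored state is journaled as
// ESlaveUpdate::OP_ROLLBACK unless the prep was never journaled, in which
// case the entry is cancelled and _rename_rollback_finish() runs at once.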
void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
                                     version_t srcdnpv, CDentry *destdn, CDentry *straydn,
                                     map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
{
  dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;

  if (straydn) {
    straydn->get_dir()->unlink_inode(straydn);
    straydn->pop_projected_linkage();
  }
  if (destdn) {
    destdn->get_dir()->unlink_inode(destdn);
    destdn->pop_projected_linkage();
  }
  if (srcdn) {
    srcdn->pop_projected_linkage();
    if (srcdn->authority().first == mds->get_nodeid()) {
      srcdn->mark_dirty(srcdnpv, mut->ls);
      if (srcdn->get_linkage()->is_primary())
        srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
    }
  }

  mut->apply();

  if (srcdn && srcdn->get_linkage()->is_primary()) {
    CInode *in = srcdn->get_linkage()->get_inode();
    if (in && in->is_dir()) {
      ceph_assert(destdn);
      mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
    }
  }

  if (destdn) {
    CInode *oldin = destdn->get_linkage()->get_inode();
    // update subtree map?
    if (oldin && oldin->is_dir()) {
      ceph_assert(straydn);
      mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
    }
  }

  if (mds->is_resolve()) {
    CDir *root = NULL;
    if (straydn)
      root = mdcache->get_subtree_root(straydn->get_dir());
    else if (destdn)
      root = mdcache->get_subtree_root(destdn->get_dir());
    if (root)
      mdcache->try_trim_non_auth_subtree(root);
  } else {
    mdcache->send_snaps(splits[1]);
    mdcache->send_snaps(splits[0]);
  }

  if (mdr) {
    MDSContext::vec finished;
    if (mdr->more()->is_ambiguous_auth) {
      if (srcdn->is_auth())
        mdr->more()->rename_inode->unfreeze_inode(finished);

      mdr->more()->rename_inode->clear_ambiguous_auth(finished);
      mdr->more()->is_ambiguous_auth = false;
    }
    mds->queue_waiters(finished);
    if (finish_mdr || mdr->aborted)
      mdcache->request_finish(mdr);
    else
      mdr->more()->slave_rolling_back = false;
  }

  mdcache->finish_rollback(mut->reqid, mdr);

  mut->cleanup();
}

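// Flow note: _rename_rollback_finish() makes the projected rollback state
// live: it unlinks the projected dest/stray linkages, pops the restored src
// linkage, and repairs subtree auth boundaries (trimming non-auth subtrees
// when called during resolve). The finish_rollback() call pairs with the
// add_rollback() in do_rename_rollback(); per the comment there, the update
// must finish before resolve can claim the subtree.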
void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
{
  dout(10) << "handle_slave_rename_prep_ack " << *mdr
           << " witnessed by " << ack->get_source()
           << " " << *ack << dendl;
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  // note slave
  mdr->more()->slaves.insert(from);
  if (mdr->more()->srcdn_auth_mds == from &&
      mdr->more()->is_remote_frozen_authpin &&
      !mdr->more()->is_ambiguous_auth) {
    mdr->set_ambiguous_auth(mdr->more()->rename_inode);
  }

  // witnessed?  or add extra witnesses?
  ceph_assert(mdr->more()->witnessed.count(from) == 0);
  if (ack->is_interrupted()) {
    dout(10) << " slave request interrupted, noop" << dendl;
  } else if (ack->witnesses.empty()) {
    mdr->more()->witnessed.insert(from);
    if (!ack->is_not_journaled())
      mdr->more()->has_journaled_slaves = true;
  } else {
    dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
    mdr->more()->extra_witnesses = ack->witnesses;
    mdr->more()->extra_witnesses.erase(mds->get_nodeid());  // not me!
  }

  // srci import?
  if (ack->inode_export.length()) {
    dout(10) << " got srci import" << dendl;
    mdr->more()->inode_import.share(ack->inode_export);
    mdr->more()->inode_import_v = ack->inode_export_v;
  }

  // remove from waiting list
  ceph_assert(mdr->more()->waiting_on_slave.count(from));
  mdr->more()->waiting_on_slave.erase(from);

  if (mdr->more()->waiting_on_slave.empty())
    dispatch_client_request(mdr);  // go again!
  else
    dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
}

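// Protocol note (best-effort summary): an OP_RENAMEPREPACK can mean three
// things, all handled above: an interrupted prep (noop), a successful witness
// (possibly carrying the exported source inode in inode_export), or a demand
// for extra witnesses (ack->witnesses lists srcdn replica ranks the master
// must also enlist). Once waiting_on_slave drains, dispatch_client_request()
// re-runs the master-side rename path.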
void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
{
  dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
           << ack->get_source() << dendl;
  ceph_assert(mdr->is_slave());
  mds_rank_t from = mds_rank_t(ack->get_source().num());

  if (mdr->more()->waiting_on_slave.count(from)) {
    mdr->more()->waiting_on_slave.erase(from);

    if (mdr->more()->waiting_on_slave.empty()) {
      if (mdr->slave_request)
        dispatch_slave_request(mdr);
    } else
      dout(10) << " still waiting for rename notify acks from "
               << mdr->more()->waiting_on_slave << dendl;
  }
}

void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
{
  dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;

  if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
    mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);

    if (mdr->more()->waiting_on_slave.empty()) {
      if (mdr->slave_request)
        dispatch_slave_request(mdr);
    } else
      dout(10) << " still waiting for rename notify acks from "
               << mdr->more()->waiting_on_slave << dendl;
  }
}

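// Note: both helpers above drain waiting_on_slave before re-dispatching.
// handle_slave_rename_notify_ack() clears the entry for a real peer rank,
// while _slave_rename_sessions_flushed() clears the MDS_RANK_NONE placeholder
// that stands in for a pending client-session flush; whichever empties the
// set resumes dispatch_slave_request().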
// snaps
/* This function takes responsibility for the passed mdr */
void Server::handle_client_lssnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  // traverse to path
  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }
  dout(10) << "lssnap on " << *diri << dendl;

  // lock snap
  if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
    return;

  if (!check_access(mdr, diri, MAY_READ))
    return;

  SnapRealm *realm = diri->find_snaprealm();
  map<snapid_t,const SnapInfo*> infomap;
  realm->get_snap_info(infomap, diri->get_oldest_snap());

  unsigned max_entries = req->head.args.readdir.max_entries;
  if (!max_entries)
    max_entries = infomap.size();
  int max_bytes = req->head.args.readdir.max_bytes;
  if (!max_bytes)
    // make sure at least one item can be encoded
    max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;

  __u64 last_snapid = 0;
  string offset_str = req->get_path2();
  if (!offset_str.empty())
    last_snapid = realm->resolve_snapname(offset_str, diri->ino());

  // empty DirStat
  bufferlist dirbl;
  static DirStat empty;
  CDir::encode_dirstat(dirbl, mdr->session->info, empty);

  max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;

  __u32 num = 0;
  bufferlist dnbl;
  auto p = infomap.upper_bound(last_snapid);
  for (; p != infomap.end() && num < max_entries; ++p) {
    dout(10) << p->first << " -> " << *p->second << dendl;

    // actual
    string snap_name;
    if (p->second->ino == diri->ino())
      snap_name = p->second->name;
    else
      snap_name = p->second->get_long_name();

    unsigned start_len = dnbl.length();
    if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
      break;

    encode(snap_name, dnbl);
    // infinite lease
    LeaseStat e(CEPH_LEASE_VALID, -1, 0);
    mds->locker->encode_lease(dnbl, mdr->session->info, e);
    dout(20) << "encode_infinite_lease" << dendl;

    int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
    if (r < 0) {
      bufferlist keep;
      keep.substr_of(dnbl, 0, start_len);
      dnbl.swap(keep);
      break;
    }
    ++num;
  }

  encode(num, dirbl);
  __u16 flags = 0;
  if (p == infomap.end()) {
    flags = CEPH_READDIR_FRAG_END;
    if (last_snapid == 0)
      flags |= CEPH_READDIR_FRAG_COMPLETE;
  }
  encode(flags, dirbl);
  dirbl.claim_append(dnbl);

  mdr->reply_extra_bl = dirbl;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}

9919// MKSNAP
9920
9921struct C_MDS_mksnap_finish : public ServerLogContext {
9922 CInode *diri;
9923 SnapInfo info;
9924 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
9925 ServerLogContext(s, r), diri(di), info(i) {}
9926 void finish(int r) override {
9927 server->_mksnap_finish(mdr, diri, info);
9928 }
9929};
9930
9931/* This function takes responsibility for the passed mdr*/
9932void Server::handle_client_mksnap(MDRequestRef& mdr)
9933{
9f95a23c 9934 const cref_t<MClientRequest> &req = mdr->client_request;
11fdf7f2
TL
9935 // make sure we have as new a map as the client
9936 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
9937 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
9938 return;
9939 }
7c673cae
FG
9940 if (!mds->mdsmap->allows_snaps()) {
9941 // you can't make snapshots until you set an option right now
9942 respond_to_request(mdr, -EPERM);
9943 return;
9944 }
9945
9f95a23c
TL
9946 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
9947 if (!diri)
7c673cae 9948 return;
7c673cae
FG
9949
9950 // dir only
9951 if (!diri->is_dir()) {
9952 respond_to_request(mdr, -ENOTDIR);
9953 return;
9954 }
9955 if (diri->is_system() && !diri->is_root()) {
9956 // no snaps in system dirs (root is ok)
9957 respond_to_request(mdr, -EPERM);
9958 return;
9959 }
9960
11fdf7f2 9961 std::string_view snapname = req->get_filepath().last_dentry();
7c673cae 9962
11fdf7f2 9963 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
7c673cae
FG
9964 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
9965 respond_to_request(mdr, -EPERM);
9966 return;
9967 }
9968
9969 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
9970
9971 // lock snap
9f95a23c
TL
9972 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
9973 MutationImpl::LockOpVec lov;
9974 lov.add_xlock(&diri->snaplock);
9975 if (!mds->locker->acquire_locks(mdr, lov))
9976 return;
7c673cae 9977
9f95a23c
TL
9978 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
9979 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
9980 return;
9981 }
9982 mdr->locking_state |= MutationImpl::ALL_LOCKED;
9983 }
7c673cae 9984
9f95a23c 9985 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
7c673cae
FG
9986 return;
9987
adb31ebb
TL
9988 if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
9989 (subvol_ino && subvol_ino != diri->ino())) {
9990 respond_to_request(mdr, -EPERM);
9991 return;
9992 }
9993
9f95a23c
TL
9994 // check if we can create any more snapshots
9995 // we don't allow any more if we are already at or beyond the limit
9996 if (diri->snaprealm &&
9997 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
9998 respond_to_request(mdr, -EMLINK);
7c673cae 9999 return;
9f95a23c 10000 }
7c673cae
FG
10001
10002 // make sure name is unique
10003 if (diri->snaprealm &&
10004 diri->snaprealm->exists(snapname)) {
10005 respond_to_request(mdr, -EEXIST);
10006 return;
10007 }
10008 if (snapname.length() == 0 ||
10009 snapname[0] == '_') {
10010 respond_to_request(mdr, -EINVAL);
10011 return;
10012 }
10013
10014 // allocate a snapid
10015 if (!mdr->more()->stid) {
10016 // prepare an stid
10017 mds->snapclient->prepare_create(diri->ino(), snapname,
10018 mdr->get_mds_stamp(),
10019 &mdr->more()->stid, &mdr->more()->snapidbl,
10020 new C_MDS_RetryRequest(mdcache, mdr));
10021 return;
10022 }
10023
10024 version_t stid = mdr->more()->stid;
10025 snapid_t snapid;
11fdf7f2
TL
10026 auto p = mdr->more()->snapidbl.cbegin();
10027 decode(snapid, p);
7c673cae
FG
10028 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
10029
11fdf7f2
TL
10030 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10031
7c673cae
FG
10032 // journal
10033 SnapInfo info;
10034 info.ino = diri->ino();
10035 info.snapid = snapid;
11fdf7f2 10036 info.name = snapname;
7c673cae
FG
10037 info.stamp = mdr->get_op_stamp();
10038
94b18763 10039 auto &pi = diri->project_inode(false, true);
91327a77
AA
10040 pi.inode.ctime = info.stamp;
10041 if (info.stamp > pi.inode.rstat.rctime)
10042 pi.inode.rstat.rctime = info.stamp;
11fdf7f2 10043 pi.inode.rstat.rsnaps++;
94b18763 10044 pi.inode.version = diri->pre_dirty();
7c673cae
FG
10045
10046 // project the snaprealm
94b18763
FG
10047 auto &newsnap = *pi.snapnode;
10048 newsnap.created = snapid;
10049 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
10050 if (!em.second)
10051 em.first->second = info;
10052 newsnap.seq = snapid;
10053 newsnap.last_created = snapid;
7c673cae
FG
10054
10055 // journal the inode changes
10056 mdr->ls = mdlog->get_current_segment();
10057 EUpdate *le = new EUpdate(mdlog, "mksnap");
10058 mdlog->start_entry(le);
10059
10060 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10061 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10062 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10063 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10064
10065 // journal the snaprealm changes
10066 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
10067 mdr, __func__);
10068 mdlog->flush();
10069}
10070
void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
{
  dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;

  int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  // create snap
  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, op);

  mdcache->do_realm_invalidate_and_update_notify(diri, op);

  // yay
  mdr->in[0] = diri;
  mdr->snapid = info.snapid;
  mdr->tracei = diri;
  respond_to_request(mdr, 0);
}

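// Flow note (best-effort summary): snapshot creation is a two-phase exchange
// with the snap table. handle_client_mksnap() first calls prepare_create()
// and returns, retrying once the table hands back a transaction id (stid) and
// the new snapid; it then journals one EUpdate carrying both the inode
// changes and the TABLE_SNAP transaction. _mksnap_finish() commits the stid
// and broadcasts the change via send_snap_update() and
// do_realm_invalidate_and_update_notify(). CEPH_SNAP_OP_SPLIT is chosen when
// the directory had no snaprealm before this change, presumably because a
// newly created realm splits off from the parent realm.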

// RMSNAP

struct C_MDS_rmsnap_finish : public ServerLogContext {
  CInode *diri;
  snapid_t snapid;
  C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_rmsnap_finish(mdr, diri, snapid);
  }
};

/* This function takes responsibility for the passed mdr */
void Server::handle_client_rmsnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) {
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  std::string_view snapname = req->get_filepath().last_dentry();

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid "
             << mdr->client_request->get_caller_uid() << dendl;
    respond_to_request(mdr, -EPERM);
    return;
  }

  dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;

  // does snap exist?
  if (snapname.length() == 0 || snapname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't prune a parent snap, currently.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
  dout(10) << " snapname " << snapname << " is " << snapid << dendl;

  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
        return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_destroy(diri->ino(), snapid,
                                     &mdr->more()->stid, &mdr->more()->snapidbl,
                                     new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }
  version_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);
  dout(10) << " stid is " << stid << ", seq is " << seq << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto &pi = diri->project_inode(false, true);
  pi.inode.version = diri->pre_dirty();
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.rstat.rsnaps--;

  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "rmsnap");
  mdlog->start_entry(le);

  // project the snaprealm
  auto &newnode = *pi.snapnode;
  newnode.snaps.erase(snapid);
  newnode.seq = seq;
  newnode.last_destroyed = seq;

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
  snapid_t stid = mdr->more()->stid;
  auto p = mdr->more()->snapidbl.cbegin();
  snapid_t seq;
  decode(seq, p);

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);

  // yay
  mdr->in[0] = diri;
  respond_to_request(mdr, 0);

  // purge snapshot data
  if (diri->snaprealm->have_past_parents_open())
    diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
}

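// Flow note (best-effort summary): rmsnap follows the same
// prepare/journal/commit shape as mksnap, but uses prepare_destroy() and
// decodes a table seq (not a new snapid) from snapidbl; the projected
// snapnode erases the snapshot and records the seq in last_destroyed. The
// trailing purge_stale_snap_data() call is gated on all past parent realms
// being open.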
struct C_MDS_renamesnap_finish : public ServerLogContext {
  CInode *diri;
  snapid_t snapid;
  C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
    ServerLogContext(s, r), diri(di), snapid(sn) {}
  void finish(int r) override {
    server->_renamesnap_finish(mdr, diri, snapid);
  }
};

/* This function takes responsibility for the passed mdr */
void Server::handle_client_renamesnap(MDRequestRef& mdr)
{
  const cref_t<MClientRequest> &req = mdr->client_request;
  if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
    respond_to_request(mdr, -EINVAL);
    return;
  }

  CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
  if (!diri)
    return;

  if (!diri->is_dir()) { // dir only
    respond_to_request(mdr, -ENOTDIR);
    return;
  }

  if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
      mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
    respond_to_request(mdr, -EPERM);
    return;
  }

  std::string_view dstname = req->get_filepath().last_dentry();
  std::string_view srcname = req->get_filepath2().last_dentry();
  dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;

  if (srcname.length() == 0 || srcname[0] == '_') {
    respond_to_request(mdr, -EINVAL);   // can't rename a parent snap.
    return;
  }
  if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
    respond_to_request(mdr, -ENOENT);
    return;
  }
  if (dstname.length() == 0 || dstname[0] == '_') {
    respond_to_request(mdr, -EINVAL);
    return;
  }
  if (diri->snaprealm->exists(dstname)) {
    respond_to_request(mdr, -EEXIST);
    return;
  }

  snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
  dout(10) << " snapname " << srcname << " is " << snapid << dendl;

  // lock snap
  if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
    MutationImpl::LockOpVec lov;
    lov.add_xlock(&diri->snaplock);
    if (!mds->locker->acquire_locks(mdr, lov))
      return;
    if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
      if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
        return;
    }
    mdr->locking_state |= MutationImpl::ALL_LOCKED;
  }

  if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
    return;

  // prepare
  if (!mdr->more()->stid) {
    mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
                                    &mdr->more()->stid,
                                    new C_MDS_RetryRequest(mdcache, mdr));
    return;
  }

  version_t stid = mdr->more()->stid;
  dout(10) << " stid is " << stid << dendl;

  ceph_assert(mds->snapclient->get_cached_version() >= stid);

  // journal
  auto &pi = diri->project_inode(false, true);
  pi.inode.ctime = mdr->get_op_stamp();
  if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
    pi.inode.rstat.rctime = mdr->get_op_stamp();
  pi.inode.version = diri->pre_dirty();

  // project the snaprealm
  auto &newsnap = *pi.snapnode;
  auto it = newsnap.snaps.find(snapid);
  ceph_assert(it != newsnap.snaps.end());
  it->second.name = dstname;

  // journal the inode changes
  mdr->ls = mdlog->get_current_segment();
  EUpdate *le = new EUpdate(mdlog, "renamesnap");
  mdlog->start_entry(le);

  le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
  le->metablob.add_table_transaction(TABLE_SNAP, stid);
  mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
  mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);

  // journal the snaprealm changes
  submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
                     mdr, __func__);
  mdlog->flush();
}

void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
{
  dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;

  diri->pop_and_dirty_projected_inode(mdr->ls);
  mdr->apply();

  mds->snapclient->commit(mdr->more()->stid, mdr->ls);

  dout(10) << "snaprealm now " << *diri->snaprealm << dendl;

  // notify other mds
  mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);

  mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);

  // yay
  mdr->in[0] = diri;
  mdr->tracei = diri;
  mdr->snapid = snapid;
  respond_to_request(mdr, 0);
}

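// Flow note (best-effort summary): renamesnap reuses the snap table's update
// path (prepare_update() with the new name and no new snapid); the projected
// snapnode rewrites SnapInfo::name in place and peers/clients learn of it via
// CEPH_SNAP_OP_UPDATE. filepath carries the destination name and filepath2
// the source name, both anchored at the same directory inode, which is why
// the handler rejects mismatched inos up front.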
/**
 * Return true if server is in state RECONNECT and this
 * client has not yet reconnected.
 */
bool Server::waiting_for_reconnect(client_t c) const
{
  return client_reconnect_gather.count(c) > 0;
}

void Server::dump_reconnect_status(Formatter *f) const
{
  f->open_object_section("reconnect_status");
  f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
  f->close_section();
}