ceph.git: ceph/src/mds/Server.cc (Ceph Pacific 16.2.2)
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
17
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
21
22 #include "MDSRank.h"
23 #include "Server.h"
24 #include "Locker.h"
25 #include "MDCache.h"
26 #include "MDLog.h"
27 #include "Migrator.h"
28 #include "MDBalancer.h"
29 #include "InoTable.h"
30 #include "SnapClient.h"
31 #include "Mutation.h"
32 #include "MetricsHandler.h"
33 #include "cephfs_features.h"
34
35 #include "msg/Messenger.h"
36
37 #include "osdc/Objecter.h"
38
39 #include "events/EUpdate.h"
40 #include "events/EPeerUpdate.h"
41 #include "events/ESession.h"
42 #include "events/EOpen.h"
43 #include "events/ECommitted.h"
44 #include "events/EPurged.h"
45
46 #include "include/stringify.h"
47 #include "include/filepath.h"
48 #include "common/errno.h"
49 #include "common/Timer.h"
50 #include "common/perf_counters.h"
51 #include "include/compat.h"
52 #include "osd/OSDMap.h"
53
54 #include <errno.h>
55
56 #include <list>
57 #include <regex>
58 #include <string_view>
59 #include <functional>
60
61 #include "common/config.h"
62
63 #define dout_context g_ceph_context
64 #define dout_subsys ceph_subsys_mds
65 #undef dout_prefix
66 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
67
68 class ServerContext : public MDSContext {
69 protected:
70 Server *server;
71 MDSRank *get_mds() override
72 {
73 return server->mds;
74 }
75
76 public:
77 explicit ServerContext(Server *s) : server(s) {
78 ceph_assert(server != NULL);
79 }
80 };
81
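/*
 * Batches concurrent lookup/getattr requests that hit the same dentry or
 * inode. Only the head request (mdr) actually traverses the path and takes
 * locks; when it is forwarded to another rank or answered, every request
 * queued in batch_reqs is forwarded or replied to with the same trace and
 * result.
 */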
82 class Batch_Getattr_Lookup : public BatchOp {
83 protected:
84 Server* server;
85 ceph::ref_t<MDRequestImpl> mdr;
86 std::vector<ceph::ref_t<MDRequestImpl>> batch_reqs;
87 int res = 0;
88 public:
89 Batch_Getattr_Lookup(Server* s, const ceph::ref_t<MDRequestImpl>& r)
90 : server(s), mdr(r) {
91 if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP)
92 mdr->batch_op_map = &mdr->dn[0].back()->batch_ops;
93 else
94 mdr->batch_op_map = &mdr->in[0]->batch_ops;
95 }
96 void add_request(const ceph::ref_t<MDRequestImpl>& r) override {
97 batch_reqs.push_back(r);
98 }
99 ceph::ref_t<MDRequestImpl> find_new_head() override {
100 while (!batch_reqs.empty()) {
101 auto r = std::move(batch_reqs.back());
102 batch_reqs.pop_back();
103 if (r->killed)
104 continue;
105
106 r->batch_op_map = mdr->batch_op_map;
107 mdr->batch_op_map = nullptr;
108 mdr = r;
109 return mdr;
110 }
111 return nullptr;
112 }
113 void _forward(mds_rank_t t) override {
114 MDCache* mdcache = server->mdcache;
115 mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
116 mdr->set_mds_stamp(ceph_clock_now());
117 for (auto& m : batch_reqs) {
118 if (!m->killed)
119 mdcache->request_forward(m, t);
120 }
121 batch_reqs.clear();
122 }
123 void _respond(int r) override {
124 mdr->set_mds_stamp(ceph_clock_now());
125 for (auto& m : batch_reqs) {
126 if (!m->killed) {
127 m->tracei = mdr->tracei;
128 m->tracedn = mdr->tracedn;
129 server->respond_to_request(m, r);
130 }
131 }
132 batch_reqs.clear();
133 server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
134 }
135 void print(std::ostream& o) {
136 o << "[batch front=" << *mdr << "]";
137 }
138 };
139
140 class ServerLogContext : public MDSLogContextBase {
141 protected:
142 Server *server;
143 MDSRank *get_mds() override
144 {
145 return server->mds;
146 }
147
148 MDRequestRef mdr;
149 void pre_finish(int r) override {
150 if (mdr)
151 mdr->mark_event("journal_committed: ");
152 }
153 public:
154 explicit ServerLogContext(Server *s) : server(s) {
155 ceph_assert(server != NULL);
156 }
157 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
158 ceph_assert(server != NULL);
159 }
160 };
161
162 void Server::create_logger()
163 {
164 PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
165
166 plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
167 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
168 plb.add_u64_counter(l_mdss_handle_peer_request, "handle_peer_request",
169 "Peer requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
170 plb.add_u64_counter(l_mdss_handle_client_session,
171 "handle_client_session", "Client session messages", "hcs",
172 PerfCountersBuilder::PRIO_INTERESTING);
173 plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
174 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
175 plb.add_u64_counter(l_mdss_cap_acquisition_throttle,
176 "cap_acquisition_throttle", "Cap acquisition throttle counter", "cat",
177 PerfCountersBuilder::PRIO_INTERESTING);
178
179 // fop latencies are useful
180 plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
181 plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
182 "Request type lookup hash of inode latency");
183 plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
184 "Request type lookup inode latency");
185 plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
186 "Request type lookup parent latency");
187 plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
188 "Request type lookup name latency");
189 plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
190 "Request type lookup latency");
191 plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
192 "Request type lookup snapshot latency");
193 plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
194 "Request type get attribute latency");
195 plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
196 "Request type set attribute latency");
197 plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
198 "Request type set file layout latency");
199 plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
200 "Request type set directory layout latency");
201 plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
202 "Request type set extended attribute latency");
203 plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
204 "Request type remove extended attribute latency");
205 plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
206 "Request type read directory latency");
207 plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
208 "Request type set file lock latency");
209 plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
210 "Request type get file lock latency");
211 plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
212 "Request type create latency");
213 plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
214 "Request type open latency");
215 plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
216 "Request type make node latency");
217 plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
218 "Request type link latency");
219 plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
220 "Request type unlink latency");
221 plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
222 "Request type remove directory latency");
223 plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
224 "Request type rename latency");
225 plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
226 "Request type make directory latency");
227 plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
228 "Request type symbolic link latency");
229 plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
230 "Request type list snapshot latency");
231 plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
232 "Request type make snapshot latency");
233 plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
234 "Request type remove snapshot latency");
235 plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
236 "Request type rename snapshot latency");
237
238 plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
239 plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
240 "Client requests dispatched");
241 plb.add_u64_counter(l_mdss_dispatch_peer_request, "dispatch_server_request",
242 "Server requests dispatched");
243
244 logger = plb.create_perf_counters();
245 g_ceph_context->get_perfcounters_collection()->add(logger);
246 }
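// The counters registered above can be inspected at runtime through the admin
// socket, for example (illustrative invocation, substitute the MDS name):
//
//   ceph daemon mds.<name> perf dump mds_server
//
// which dumps the "mds_server" perf counter section built here.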
247
248 Server::Server(MDSRank *m, MetricsHandler *metrics_handler) :
249 mds(m),
250 mdcache(mds->mdcache), mdlog(mds->mdlog),
251 recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate")),
252 metrics_handler(metrics_handler)
253 {
254 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
255 replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
256 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
257 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
258 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
259 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
260 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
261 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
262 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
263 supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
264 }
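// All of the mds_* values cached above come straight from the configuration.
// A subset of them is refreshed by handle_conf_change() further down when the
// corresponding option changes at runtime, e.g. (illustrative):
//
//   ceph config set mds mds_cap_revoke_eviction_timeout 300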
265
266 void Server::dispatch(const cref_t<Message> &m)
267 {
268 switch (m->get_type()) {
269 case CEPH_MSG_CLIENT_RECONNECT:
270 handle_client_reconnect(ref_cast<MClientReconnect>(m));
271 return;
272 }
273
274 /*
275  * In the reconnect phase, a client may have sent unsafe requests to the MDS before its reconnect message. Setting sessionclosed_isok handles a scenario like this:
276
277  1. In the reconnect phase, a client sent unsafe requests to the MDS.
278  2. The reconnect timeout was reached. All sessions that did not send a reconnect message in time, some of which may have sent unsafe requests, are marked closed.
279  (Another situation is #31668, which denies all client reconnect messages to speed up reboot.)
280  3. So these unsafe requests from sessions that did not reconnect in time, or whose reconnect was denied, can still be handled in the clientreplay phase.
281
282 */
283 bool sessionclosed_isok = replay_unsafe_with_closed_session;
284 // active?
285 // handle_peer_request()/handle_client_session() will wait if necessary
286 if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
287 const auto &req = ref_cast<MClientRequest>(m);
288 if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
289 Session *session = mds->get_session(req);
290 if (!session || (!session->is_open() && !sessionclosed_isok)) {
291 dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
292 return;
293 }
294 bool queue_replay = false;
295 if (req->is_replay() || req->is_async()) {
296 dout(3) << "queuing replayed op" << dendl;
297 queue_replay = true;
298 if (req->head.ino &&
299 !session->have_completed_request(req->get_reqid().tid, nullptr)) {
300 inodeno_t ino(req->head.ino);
301 mdcache->add_replay_ino_alloc(ino);
302 if (replay_unsafe_with_closed_session &&
303 session->free_prealloc_inos.contains(ino)) {
304 // don't purge inodes that will be created by later replay
305 session->free_prealloc_inos.erase(ino);
306 session->delegated_inos.insert(ino);
307 }
308 }
309 } else if (req->get_retry_attempt()) {
310 // process completed requests in the clientreplay stage. A completed request
311 // might have created a new file/directory. This guarantees the MDS sends a reply
312 // to the client before another request modifies the new file/directory.
313 if (session->have_completed_request(req->get_reqid().tid, NULL)) {
314 dout(3) << "queuing completed op" << dendl;
315 queue_replay = true;
316 }
317 // this request was created before the cap reconnect message, drop any embedded
318 // cap releases.
319 req->releases.clear();
320 }
321 if (queue_replay) {
322 req->mark_queued_for_replay();
323 mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
324 return;
325 }
326 }
327
328 bool wait_for_active = true;
329 if (mds->is_stopping()) {
330 wait_for_active = false;
331 } else if (mds->is_clientreplay()) {
332 if (req->is_queued_for_replay()) {
333 wait_for_active = false;
334 }
335 }
336 if (wait_for_active) {
337 dout(3) << "not active yet, waiting" << dendl;
338 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
339 return;
340 }
341 }
342
343 switch (m->get_type()) {
344 case CEPH_MSG_CLIENT_SESSION:
345 handle_client_session(ref_cast<MClientSession>(m));
346 return;
347 case CEPH_MSG_CLIENT_REQUEST:
348 handle_client_request(ref_cast<MClientRequest>(m));
349 return;
350 case CEPH_MSG_CLIENT_RECLAIM:
351 handle_client_reclaim(ref_cast<MClientReclaim>(m));
352 return;
353 case MSG_MDS_PEER_REQUEST:
354 handle_peer_request(ref_cast<MMDSPeerRequest>(m));
355 return;
356 default:
357 derr << "server unknown message " << m->get_type() << dendl;
358 ceph_abort_msg("server unknown message");
359 }
360 }
361
362
363
364 // ----------------------------------------------------------
365 // SESSION management
366
367 class C_MDS_session_finish : public ServerLogContext {
368 Session *session;
369 uint64_t state_seq;
370 bool open;
371 version_t cmapv;
372 interval_set<inodeno_t> inos_to_free;
373 version_t inotablev;
374 interval_set<inodeno_t> inos_to_purge;
375 LogSegment *ls = nullptr;
376 Context *fin;
377 public:
378 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = nullptr) :
379 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
380 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv,
381 const interval_set<inodeno_t>& to_free, version_t iv,
382 const interval_set<inodeno_t>& to_purge, LogSegment *_ls, Context *fin_ = nullptr) :
383 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv),
384 inos_to_free(to_free), inotablev(iv), inos_to_purge(to_purge), ls(_ls), fin(fin_) {}
385 void finish(int r) override {
386 ceph_assert(r == 0);
387 server->_session_logged(session, state_seq, open, cmapv, inos_to_free, inotablev, inos_to_purge, ls);
388 if (fin) {
389 fin->complete(r);
390 }
391 }
392 };
393
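/*
 * Find a session by the client-supplied "uuid" metadata key. If two sessions
 * carry the same uuid (one client reclaiming the other's session), prefer the
 * session that is doing the reclaiming, i.e. the one whose reclaiming_from
 * points at the other.
 */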
394 Session* Server::find_session_by_uuid(std::string_view uuid)
395 {
396 Session* session = nullptr;
397 for (auto& it : mds->sessionmap.get_sessions()) {
398 auto& metadata = it.second->info.client_metadata;
399
400 auto p = metadata.find("uuid");
401 if (p == metadata.end() || p->second != uuid)
402 continue;
403
404 if (!session) {
405 session = it.second;
406 } else if (!session->reclaiming_from) {
407 assert(it.second->reclaiming_from == session);
408 session = it.second;
409 } else {
410 assert(session->reclaiming_from == it.second);
411 }
412 }
413 return session;
414 }
415
416 void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
417 {
418 if (!session->is_open() && !session->is_stale()) {
419 dout(10) << "session not open, dropping this req" << dendl;
420 return;
421 }
422
423 auto reply = make_message<MClientReclaimReply>(0);
424 if (m->get_uuid().empty()) {
425 dout(10) << __func__ << " invalid message (no uuid)" << dendl;
426 reply->set_result(-CEPHFS_EINVAL);
427 mds->send_message_client(reply, session);
428 return;
429 }
430
431 unsigned flags = m->get_flags();
432 if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
433 dout(10) << __func__ << " unsupported flags" << dendl;
434 reply->set_result(-CEPHFS_EOPNOTSUPP);
435 mds->send_message_client(reply, session);
436 return;
437 }
438
439 Session* target = find_session_by_uuid(m->get_uuid());
440 if (target) {
441 if (session->info.auth_name != target->info.auth_name) {
442 dout(10) << __func__ << " session auth_name " << session->info.auth_name
443 << " != target auth_name " << target->info.auth_name << dendl;
444 reply->set_result(-CEPHFS_EPERM);
445 mds->send_message_client(reply, session);
446 }
447
448 assert(!target->reclaiming_from);
449 assert(!session->reclaiming_from);
450 session->reclaiming_from = target;
451 reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
452 }
453
454 if (flags & CEPH_RECLAIM_RESET) {
455 finish_reclaim_session(session, reply);
456 return;
457 }
458
459 ceph_abort();
460 }
461
462 void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
463 {
464 Session *target = session->reclaiming_from;
465 if (target) {
466 session->reclaiming_from = nullptr;
467
468 Context *send_reply;
469 if (reply) {
470 int64_t session_id = session->get_client().v;
471 send_reply = new LambdaContext([this, session_id, reply](int r) {
472 assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
473 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
474 if (!session) {
475 return;
476 }
477 auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
478 reply->set_epoch(epoch);
479 mds->send_message_client(reply, session);
480 });
481 } else {
482 send_reply = nullptr;
483 }
484
485 bool blocklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
486 return map.is_blocklisted(target->info.inst.addr);
487 });
488
489 if (blocklisted || !g_conf()->mds_session_blocklist_on_evict) {
490 kill_session(target, send_reply);
491 } else {
492 CachedStackStringStream css;
493 mds->evict_client(target->get_client().v, false, true, *css, send_reply);
494 }
495 } else if (reply) {
496 mds->send_message_client(reply, session);
497 }
498 }
499
500 void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
501 {
502 Session *session = mds->get_session(m);
503 dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
504 assert(m->get_source().is_client()); // should _not_ come from an mds!
505
506 if (!session) {
507 dout(0) << " ignoring sessionless msg " << *m << dendl;
508 return;
509 }
510
511 std::string_view fs_name = mds->get_fs_name();
512 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
513 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
514 return;
515 }
516
517 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
518 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
519 return;
520 }
521
522 if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
523 finish_reclaim_session(session);
524 } else {
525 reclaim_session(session, m);
526 }
527 }
528
529 void Server::handle_client_session(const cref_t<MClientSession> &m)
530 {
531 version_t pv;
532 Session *session = mds->get_session(m);
533
534 dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
535 ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
536
537 if (!session) {
538 dout(0) << " ignoring sessionless msg " << *m << dendl;
539 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
540 reply->metadata["error_string"] = "sessionless";
541 mds->send_message(reply, m->get_connection());
542 return;
543 }
544
545 std::string_view fs_name = mds->get_fs_name();
546 if (!fs_name.empty() && !session->fs_name_capable(fs_name, MAY_READ)) {
547 dout(0) << " dropping message not allowed for this fs_name: " << *m << dendl;
548 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
549 reply->metadata["error_string"] = "client doesn't have caps for FS \"" +
550 std::string(fs_name) + "\"";
551 mds->send_message(std::move(reply), m->get_connection());
552 return;
553 }
554
555 if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
556 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
557 } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
558 // close requests need to be handled when mds is active
559 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
560 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
561 return;
562 }
563 } else {
564 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
565 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
566 return;
567 }
568 }
569
570 if (logger)
571 logger->inc(l_mdss_handle_client_session);
572
573 uint64_t sseq = 0;
574 switch (m->get_op()) {
575 case CEPH_SESSION_REQUEST_OPEN:
576 if (session->is_opening() ||
577 session->is_open() ||
578 session->is_stale() ||
579 session->is_killing() ||
580 terminating_sessions) {
581 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
582 return;
583 }
584 ceph_assert(session->is_closed() || session->is_closing());
585
586 if (mds->is_stopping()) {
587 dout(10) << "mds is stopping, dropping open req" << dendl;
588 return;
589 }
590
591 {
592 auto& addr = session->info.inst.addr;
593 session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
594 auto& client_metadata = session->info.client_metadata;
595
596 auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
597 auto now = ceph_clock_now();
598 auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
599 auto elapsed = now - m->get_recv_stamp();
600 CachedStackStringStream css;
601 *css << "New client session:"
602 << " addr=\"" << session->info.inst.addr << "\""
603 << ",elapsed=" << elapsed
604 << ",throttled=" << throttle_elapsed
605 << ",status=\"" << status << "\"";
606 if (!err.empty()) {
607 *css << ",error=\"" << err << "\"";
608 }
609 const auto& metadata = session->info.client_metadata;
610 if (auto it = metadata.find("root"); it != metadata.end()) {
611 *css << ",root=\"" << it->second << "\"";
612 }
613 dout(2) << css->strv() << dendl;
614 };
615
616 auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str) {
617 auto m = make_message<MClientSession>(CEPH_SESSION_REJECT);
618 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
619 m->metadata["error_string"] = err_str;
620 mds->send_message_client(m, session);
621 log_session_status("REJECTED", err_str);
622 };
623
624 bool blocklisted = mds->objecter->with_osdmap(
625 [&addr](const OSDMap &osd_map) -> bool {
626 return osd_map.is_blocklisted(addr);
627 });
628
629 if (blocklisted) {
630 dout(10) << "rejecting blocklisted client " << addr << dendl;
631 // This goes on the wire and the "blacklisted" substring is
632 // depended upon by the kernel client for detecting whether it
633 // has been blocklisted. If mounted with recover_session=clean
634 // (since 5.4), it tries to automatically recover itself from
635 // blocklisting.
636 send_reject_message("blocklisted (blacklisted)");
637 session->clear();
638 break;
639 }
640
641 if (client_metadata.features.empty())
642 infer_supported_features(session, client_metadata);
643
644 dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
645 dout(20) << " features: '" << client_metadata.features << "'" << dendl;
646 dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
647 for (const auto& p : client_metadata) {
648 dout(20) << " " << p.first << ": " << p.second << dendl;
649 }
650
651 feature_bitset_t missing_features = required_client_features;
652 missing_features -= client_metadata.features;
653 if (!missing_features.empty()) {
654 CachedStackStringStream css;
655 *css << "missing required features '" << missing_features << "'";
656 send_reject_message(css->strv());
657 mds->clog->warn() << "client session (" << session->info.inst
658 << ") lacks required features " << missing_features
659 << "; client supports " << client_metadata.features;
660 session->clear();
661 break;
662 }
663
664 // Special case for the 'root' metadata path; validate that the claimed
665 // root is actually within the caps of the session
666 if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
667 auto claimed_root = it->second;
668 CachedStackStringStream css;
669 bool denied = false;
670 // claimed_root has a leading "/" which we strip before passing
671 // into caps check
672 if (claimed_root.empty() || claimed_root[0] != '/') {
673 denied = true;
674 *css << "invalue root '" << claimed_root << "'";
675 } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
676 denied = true;
677 *css << "non-allowable root '" << claimed_root << "'";
678 }
679
680 if (denied) {
681 // Tell the client we're rejecting their open
682 send_reject_message(css->strv());
683 mds->clog->warn() << "client session with " << css->strv()
684 << " denied (" << session->info.inst << ")";
685 session->clear();
686 break;
687 }
688 }
689
690 if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
691 if (find_session_by_uuid(it->second)) {
692 send_reject_message("duplicated session uuid");
693 mds->clog->warn() << "client session with duplicated session uuid '"
694 << it->second << "' denied (" << session->info.inst << ")";
695 session->clear();
696 break;
697 }
698 }
699
700 if (session->is_closed()) {
701 mds->sessionmap.add_session(session);
702 }
703
704 pv = mds->sessionmap.mark_projected(session);
705 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
706 mds->sessionmap.touch_session(session);
707 auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
708 ceph_assert(r == 0);
709 log_session_status("ACCEPTED", "");
710 });
711 mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
712 new C_MDS_session_finish(this, session, sseq, true, pv, fin));
713 mdlog->flush();
714 }
715 break;
716
717 case CEPH_SESSION_REQUEST_RENEWCAPS:
718 if (session->is_open() || session->is_stale()) {
719 mds->sessionmap.touch_session(session);
720 if (session->is_stale()) {
721 mds->sessionmap.set_state(session, Session::STATE_OPEN);
722 mds->locker->resume_stale_caps(session);
723 mds->sessionmap.touch_session(session);
724 }
725 auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
726 mds->send_message_client(reply, session);
727 } else {
728 dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
729 }
730 break;
731
732 case CEPH_SESSION_REQUEST_CLOSE:
733 {
734 if (session->is_closed() ||
735 session->is_closing() ||
736 session->is_killing()) {
737 dout(10) << "already closed|closing|killing, dropping this req" << dendl;
738 return;
739 }
740 if (session->is_importing()) {
741 dout(10) << "ignoring close req on importing session" << dendl;
742 return;
743 }
744 ceph_assert(session->is_open() ||
745 session->is_stale() ||
746 session->is_opening());
747 if (m->get_seq() < session->get_push_seq()) {
748 dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
749 << ", dropping" << dendl;
750 return;
751 }
752 // We are getting a seq that is higher than expected.
753 // Handle the same as any other seqn error.
754 //
755 if (m->get_seq() != session->get_push_seq()) {
756 dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
757 << ", BUGGY!" << dendl;
758 mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
759 << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
760 return;
761 }
762 journal_close_session(session, Session::STATE_CLOSING, NULL);
763 }
764 break;
765
766 case CEPH_SESSION_FLUSHMSG_ACK:
767 finish_flush_session(session, m->get_seq());
768 break;
769
770 case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
771 if (mds->is_active())
772 mdlog->flush();
773 break;
774
775 default:
776 ceph_abort();
777 }
778 }
779
780 void Server::flush_session(Session *session, MDSGatherBuilder& gather) {
781 if (!session->is_open() ||
782 !session->get_connection() ||
783 !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
784 return;
785 }
786
787 version_t seq = session->wait_for_flush(gather.new_sub());
788 mds->send_message_client(
789 make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
790 }
791
792 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
793 {
794 for (const auto& client : client_set) {
795 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(client.v));
796 ceph_assert(session);
797 flush_session(session, gather);
798 }
799 }
800
801 void Server::finish_flush_session(Session *session, version_t seq)
802 {
803 MDSContext::vec finished;
804 session->finish_flush(seq, finished);
805 mds->queue_waiters(finished);
806 }
807
808 void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
809 const interval_set<inodeno_t>& inos_to_free, version_t piv,
810 const interval_set<inodeno_t>& inos_to_purge, LogSegment *ls)
811 {
812 dout(10) << "_session_logged " << session->info.inst
813 << " state_seq " << state_seq
814 << " " << (open ? "open":"close") << " " << pv
815 << " inos_to_free " << inos_to_free << " inotablev " << piv
816 << " inos_to_purge " << inos_to_purge << dendl;
817
818 if (!open) {
819 if (inos_to_purge.size()){
820 ceph_assert(ls);
821 session->info.prealloc_inos.subtract(inos_to_purge);
822 ls->purging_inodes.insert(inos_to_purge);
823 if (mds->is_clientreplay() || mds->is_active() || mds->is_stopping())
824 mdcache->purge_inodes(inos_to_purge, ls);
825 }
826
827 if (inos_to_free.size()) {
828 ceph_assert(piv);
829 ceph_assert(session->is_closing() || session->is_killing() ||
830 session->is_opening()); // re-open closing session
831 session->info.prealloc_inos.subtract(inos_to_free);
832 mds->inotable->apply_release_ids(inos_to_free);
833 ceph_assert(mds->inotable->get_version() == piv);
834 }
835 session->free_prealloc_inos = session->info.prealloc_inos;
836 session->delegated_inos.clear();
837 }
838
839 mds->sessionmap.mark_dirty(session);
840
841 // apply
842 if (session->get_state_seq() != state_seq) {
843 dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
844 << ", noop" << dendl;
845 // close must have been canceled (by an import?), or any number of other things..
846 } else if (open) {
847 ceph_assert(session->is_opening());
848 mds->sessionmap.set_state(session, Session::STATE_OPEN);
849 mds->sessionmap.touch_session(session);
850 metrics_handler->add_session(session);
851 ceph_assert(session->get_connection());
852 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
853 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
854 reply->supported_features = supported_features;
855 mds->send_message_client(reply, session);
856 if (mdcache->is_readonly()) {
857 auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
858 mds->send_message_client(m, session);
859 }
860 } else if (session->is_closing() ||
861 session->is_killing()) {
862 // kill any lingering capabilities, leases, requests
863 bool killing = session->is_killing();
864 while (!session->caps.empty()) {
865 Capability *cap = session->caps.front();
866 CInode *in = cap->get_inode();
867 dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
868 mds->locker->remove_client_cap(in, cap, killing);
869 }
870 while (!session->leases.empty()) {
871 ClientLease *r = session->leases.front();
872 CDentry *dn = static_cast<CDentry*>(r->parent);
873 dout(20) << " killing client lease of " << *dn << dendl;
874 dn->remove_client_lease(r, mds->locker);
875 }
876 if (client_reconnect_gather.erase(session->info.get_client())) {
877 dout(20) << " removing client from reconnect set" << dendl;
878 if (client_reconnect_gather.empty()) {
879 dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
880 reconnect_gather_finish();
881 }
882 }
883 if (client_reclaim_gather.erase(session->info.get_client())) {
884 dout(20) << " removing client from reclaim set" << dendl;
885 if (client_reclaim_gather.empty()) {
886 dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
887 mds->maybe_clientreplay_done();
888 }
889 }
890
891 if (session->is_closing()) {
892 // mark con disposable. if there is a fault, we will get a
893 // reset and clean it up. if the client hasn't received the
894 // CLOSE message yet, they will reconnect and get an
895 // ms_handle_remote_reset() and realize they had in fact closed.
896 // do this *before* sending the message to avoid a possible
897 // race.
898 if (session->get_connection()) {
899 // Conditional because terminate_sessions will indiscriminately
900 // put sessions in CLOSING whether they ever had a conn or not.
901 session->get_connection()->mark_disposable();
902 }
903
904 // reset session
905 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
906 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
907 session->clear();
908 metrics_handler->remove_session(session);
909 mds->sessionmap.remove_session(session);
910 } else if (session->is_killing()) {
911 // destroy session, close connection
912 if (session->get_connection()) {
913 session->get_connection()->mark_down();
914 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
915 session->set_connection(nullptr);
916 }
917 metrics_handler->remove_session(session);
918 mds->sessionmap.remove_session(session);
919 } else {
920 ceph_abort();
921 }
922 } else {
923 ceph_abort();
924 }
925 }
926
927 /**
928 * Inject sessions from some source other than actual connections.
929 *
930 * For example:
931 * - sessions inferred from journal replay
932 * - sessions learned from other MDSs during rejoin
933 * - sessions learned from other MDSs during dir/caps migration
934 * - sessions learned from other MDSs during a cross-MDS rename
935 */
936 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
937 map<client_t,client_metadata_t>& cmm,
938 map<client_t, pair<Session*,uint64_t> >& smap)
939 {
940 version_t pv = mds->sessionmap.get_projected();
941
942 dout(10) << "prepare_force_open_sessions " << pv
943 << " on " << cm.size() << " clients"
944 << dendl;
945
946 mds->objecter->with_osdmap(
947 [this, &cm, &cmm](const OSDMap &osd_map) {
948 for (auto p = cm.begin(); p != cm.end(); ) {
949 if (osd_map.is_blocklisted(p->second.addr)) {
950 dout(10) << " ignoring blocklisted client." << p->first
951 << " (" << p->second.addr << ")" << dendl;
952 cmm.erase(p->first);
953 cm.erase(p++);
954 } else {
955 ++p;
956 }
957 }
958 });
959
960 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
961 Session *session = mds->sessionmap.get_or_add_session(p->second);
962 pv = mds->sessionmap.mark_projected(session);
963 uint64_t sseq;
964 if (session->is_closed() ||
965 session->is_closing() ||
966 session->is_killing()) {
967 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
968 auto q = cmm.find(p->first);
969 if (q != cmm.end())
970 session->info.client_metadata.merge(q->second);
971 } else {
972 ceph_assert(session->is_open() ||
973 session->is_opening() ||
974 session->is_stale());
975 sseq = 0;
976 }
977 smap[p->first] = make_pair(session, sseq);
978 session->inc_importing();
979 }
980 return pv;
981 }
982
983 void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
984 bool dec_import)
985 {
986 /*
987 * FIXME: need to carefully consider the race conditions between a
988 * client trying to close a session and an MDS doing an import
989 * trying to force open a session...
990 */
991 dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
992 << " initial v " << mds->sessionmap.get_version() << dendl;
993
994 for (auto &it : smap) {
995 Session *session = it.second.first;
996 uint64_t sseq = it.second.second;
997 if (sseq > 0) {
998 if (session->get_state_seq() != sseq) {
999 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
1000 } else {
1001 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
1002 mds->sessionmap.set_state(session, Session::STATE_OPEN);
1003 mds->sessionmap.touch_session(session);
1004 metrics_handler->add_session(session);
1005
1006 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1007 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
1008 reply->supported_features = supported_features;
1009 mds->send_message_client(reply, session);
1010
1011 if (mdcache->is_readonly())
1012 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1013 }
1014 } else {
1015 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
1016 ceph_assert(session->is_open() || session->is_stale());
1017 }
1018
1019 if (dec_import) {
1020 session->dec_importing();
1021 }
1022
1023 mds->sessionmap.mark_dirty(session);
1024 }
1025
1026 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
1027 }
1028
1029 class C_MDS_TerminatedSessions : public ServerContext {
1030 void finish(int r) override {
1031 server->terminating_sessions = false;
1032 }
1033 public:
1034 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
1035 };
1036
1037 void Server::terminate_sessions()
1038 {
1039 dout(5) << "terminating all sessions..." << dendl;
1040
1041 terminating_sessions = true;
1042
1043 // kill them off. clients will retry etc.
1044 set<Session*> sessions;
1045 mds->sessionmap.get_client_session_set(sessions);
1046 for (set<Session*>::const_iterator p = sessions.begin();
1047 p != sessions.end();
1048 ++p) {
1049 Session *session = *p;
1050 if (session->is_closing() ||
1051 session->is_killing() ||
1052 session->is_closed())
1053 continue;
1054 journal_close_session(session, Session::STATE_CLOSING, NULL);
1055 }
1056
1057 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
1058 }
1059
1060
1061 void Server::find_idle_sessions()
1062 {
1063 auto now = clock::now();
1064 auto last_cleared_laggy = mds->last_cleared_laggy();
1065
1066 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
1067
1068 // timeout/stale
1069 // (caps go stale, lease die)
1070 double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
1071 double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
1072
1073 // don't kick clients if we've been laggy
1074 if (last_cleared_laggy < cutoff) {
1075 dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
1076 << "), not marking any client stale" << dendl;
1077 return;
1078 }
1079
1080 std::vector<Session*> to_evict;
1081
1082 bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
1083 const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
1084 if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
1085 std::vector<Session*> new_stale;
1086
1087 for (auto session : *(sessions_p1->second)) {
1088 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1089 if (last_cap_renew_span < cutoff) {
1090 dout(20) << "laggiest active session is " << session->info.inst
1091 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1092 break;
1093 }
1094
1095 if (session->last_seen > session->last_cap_renew) {
1096 last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
1097 if (last_cap_renew_span < cutoff) {
1098 dout(20) << "laggiest active session is " << session->info.inst
1099 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1100 continue;
1101 }
1102 }
1103
1104 if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
1105 dout(20) << "evicting session " << session->info.inst << " since autoclose "
1106 "has arrived" << dendl;
1107 // evict session without marking it stale
1108 to_evict.push_back(session);
1109 continue;
1110 }
1111
1112 if (defer_session_stale &&
1113 !session->is_any_flush_waiter() &&
1114 !mds->locker->is_revoking_any_caps_from(session->get_client())) {
1115 dout(20) << "deferring marking session " << session->info.inst << " stale "
1116 "since it holds no caps" << dendl;
1117 continue;
1118 }
1119
1120 auto it = session->info.client_metadata.find("timeout");
1121 if (it != session->info.client_metadata.end()) {
1122 unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
1123 if (timeout == 0) {
1124 dout(10) << "skipping session " << session->info.inst
1125 << ", infinite timeout specified" << dendl;
1126 continue;
1127 }
1128 double cutoff = queue_max_age + timeout;
1129 if (last_cap_renew_span < cutoff) {
1130 dout(10) << "skipping session " << session->info.inst
1131 << ", timeout (" << timeout << ") specified"
1132 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1133 continue;
1134 }
1135
1136 // do not go through stale, evict it directly.
1137 to_evict.push_back(session);
1138 } else {
1139 dout(10) << "new stale session " << session->info.inst
1140 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1141 new_stale.push_back(session);
1142 }
1143 }
1144
1145 for (auto session : new_stale) {
1146 mds->sessionmap.set_state(session, Session::STATE_STALE);
1147 if (mds->locker->revoke_stale_caps(session)) {
1148 mds->locker->remove_stale_leases(session);
1149 finish_flush_session(session, session->get_push_seq());
1150 auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
1151 mds->send_message_client(m, session);
1152 } else {
1153 to_evict.push_back(session);
1154 }
1155 }
1156 }
1157
1158 // autoclose
1159 cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
1160
1161 // Collect a list of sessions exceeding the autoclose threshold
1162 const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
1163 if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
1164 for (auto session : *(sessions_p2->second)) {
1165 assert(session->is_stale());
1166 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1167 if (last_cap_renew_span < cutoff) {
1168 dout(20) << "oldest stale session is " << session->info.inst
1169 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
1170 break;
1171 }
1172 to_evict.push_back(session);
1173 }
1174 }
1175
1176 for (auto session: to_evict) {
1177 if (session->is_importing()) {
1178 dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
1179 continue;
1180 }
1181
1182 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1183 mds->clog->warn() << "evicting unresponsive client " << *session
1184 << ", after " << last_cap_renew_span << " seconds";
1185 dout(10) << "autoclosing stale session " << session->info.inst
1186 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1187
1188 if (g_conf()->mds_session_blocklist_on_timeout) {
1189 CachedStackStringStream css;
1190 mds->evict_client(session->get_client().v, false, true, *css, nullptr);
1191 } else {
1192 kill_session(session, NULL);
1193 }
1194 }
1195 }
1196
1197 void Server::evict_cap_revoke_non_responders() {
1198 if (!cap_revoke_eviction_timeout) {
1199 return;
1200 }
1201
1202 auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
1203
1204 for (auto const &client: to_evict) {
1205 mds->clog->warn() << "client id " << client << " has not responded to"
1206 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1207 << " seconds, evicting";
1208 dout(1) << __func__ << ": evicting cap revoke non-responder client id "
1209 << client << dendl;
1210
1211 CachedStackStringStream css;
1212 bool evicted = mds->evict_client(client.v, false,
1213 g_conf()->mds_session_blocklist_on_evict,
1214 *css, nullptr);
1215 if (evicted && logger) {
1216 logger->inc(l_mdss_cap_revoke_eviction);
1217 }
1218 }
1219 }
1220
1221 void Server::handle_conf_change(const std::set<std::string>& changed) {
1222 if (changed.count("mds_forward_all_requests_to_auth")){
1223 forward_all_requests_to_auth = g_conf().get_val<bool>("mds_forward_all_requests_to_auth");
1224 }
1225 if (changed.count("mds_cap_revoke_eviction_timeout")) {
1226 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
1227 dout(20) << __func__ << " cap revoke eviction timeout changed to "
1228 << cap_revoke_eviction_timeout << dendl;
1229 }
1230 if (changed.count("mds_recall_max_decay_rate")) {
1231 recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
1232 }
1233 if (changed.count("mds_max_snaps_per_dir")) {
1234 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
1235 dout(20) << __func__ << " max snapshots per directory changed to "
1236 << max_snaps_per_dir << dendl;
1237 }
1238 if (changed.count("mds_client_delegate_inos_pct")) {
1239 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
1240 }
1241 if (changed.count("mds_max_caps_per_client")) {
1242 max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1243 }
1244 if (changed.count("mds_session_cap_acquisition_throttle")) {
1245 cap_acquisition_throttle = g_conf().get_val<uint64_t>("mds_session_cap_acquisition_throttle");
1246 }
1247 if (changed.count("mds_session_max_caps_throttle_ratio")) {
1248 max_caps_throttle_ratio = g_conf().get_val<double>("mds_session_max_caps_throttle_ratio");
1249 }
1250 if (changed.count("mds_cap_acquisition_throttle_retry_request_timeout")) {
1251 caps_throttle_retry_request_timeout = g_conf().get_val<double>("mds_cap_acquisition_throttle_retry_request_timeout");
1252 }
1253 if (changed.count("mds_alternate_name_max")) {
1254 alternate_name_max = g_conf().get_val<Option::size_t>("mds_alternate_name_max");
1255 }
1256 }
1257
1258 /*
1259 * XXX bump in the interface here, not using an MDSContext here
1260 * because all the callers right now happen to use a SaferCond
1261 */
1262 void Server::kill_session(Session *session, Context *on_safe)
1263 {
1264 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
1265
1266 if ((session->is_opening() ||
1267 session->is_open() ||
1268 session->is_stale()) &&
1269 !session->is_importing()) {
1270 dout(10) << "kill_session " << session << dendl;
1271 journal_close_session(session, Session::STATE_KILLING, on_safe);
1272 } else {
1273 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
1274 if (session->is_closing() ||
1275 session->is_killing()) {
1276 if (on_safe)
1277 mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
1278 } else {
1279 ceph_assert(session->is_closed() ||
1280 session->is_importing());
1281 if (on_safe)
1282 on_safe->complete(0);
1283 }
1284 }
1285 }
1286
1287 size_t Server::apply_blocklist(const std::set<entity_addr_t> &blocklist)
1288 {
1289 bool prenautilus = mds->objecter->with_osdmap(
1290 [&](const OSDMap& o) {
1291 return o.require_osd_release < ceph_release_t::nautilus;
1292 });
1293
1294 std::vector<Session*> victims;
1295 const auto& sessions = mds->sessionmap.get_sessions();
1296 for (const auto& p : sessions) {
1297 if (!p.first.is_client()) {
1298 // Do not apply OSDMap blocklist to MDS daemons, we find out
1299 // about their death via MDSMap.
1300 continue;
1301 }
1302
1303 Session *s = p.second;
1304 auto inst_addr = s->info.inst.addr;
1305 // blocklist entries are always TYPE_ANY for nautilus+
1306 inst_addr.set_type(entity_addr_t::TYPE_ANY);
1307 if (blocklist.count(inst_addr)) {
1308 victims.push_back(s);
1309 continue;
1310 }
1311 if (prenautilus) {
1312 // ...except pre-nautilus, they were TYPE_LEGACY
1313 inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
1314 if (blocklist.count(inst_addr)) {
1315 victims.push_back(s);
1316 }
1317 }
1318 }
1319
1320 for (const auto& s : victims) {
1321 kill_session(s, nullptr);
1322 }
1323
1324 dout(10) << "apply_blocklist: killed " << victims.size() << dendl;
1325
1326 return victims.size();
1327 }
1328
1329 void Server::journal_close_session(Session *session, int state, Context *on_safe)
1330 {
1331 dout(10) << __func__ << " : "
1332 << session->info.inst
1333 << " pending_prealloc_inos " << session->pending_prealloc_inos
1334 << " free_prealloc_inos " << session->free_prealloc_inos
1335 << " delegated_inos " << session->delegated_inos << dendl;
1336
1337 uint64_t sseq = mds->sessionmap.set_state(session, state);
1338 version_t pv = mds->sessionmap.mark_projected(session);
1339 version_t piv = 0;
1340
1341 // release alloc and pending-alloc inos for this session
1342 // and wipe out session state, in case the session close aborts for some reason
1343 interval_set<inodeno_t> inos_to_free;
1344 inos_to_free.insert(session->pending_prealloc_inos);
1345 inos_to_free.insert(session->free_prealloc_inos);
1346 if (inos_to_free.size()) {
1347 mds->inotable->project_release_ids(inos_to_free);
1348 piv = mds->inotable->get_projected_version();
1349 } else
1350 piv = 0;
1351
1352 auto le = new ESession(session->info.inst, false, pv, inos_to_free, piv, session->delegated_inos);
1353 auto fin = new C_MDS_session_finish(this, session, sseq, false, pv, inos_to_free, piv,
1354 session->delegated_inos, mdlog->get_current_segment(), on_safe);
1355 mdlog->start_submit_entry(le, fin);
1356 mdlog->flush();
1357
1358 // clean up requests, too
1359 while(!session->requests.empty()) {
1360 auto mdr = MDRequestRef(*session->requests.begin());
1361 mdcache->request_kill(mdr);
1362 }
1363
1364 finish_flush_session(session, session->get_push_seq());
1365 }
1366
1367 void Server::reconnect_clients(MDSContext *reconnect_done_)
1368 {
1369 reconnect_done = reconnect_done_;
1370
1371 auto now = clock::now();
1372 set<Session*> sessions;
1373 mds->sessionmap.get_client_session_set(sessions);
1374 for (auto session : sessions) {
1375 if (session->is_open()) {
1376 client_reconnect_gather.insert(session->get_client());
1377 session->set_reconnecting(true);
1378 session->last_cap_renew = now;
1379 }
1380 }
1381
1382 if (client_reconnect_gather.empty()) {
1383 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
1384 reconnect_gather_finish();
1385 return;
1386 }
1387
1388 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1389
1390 reconnect_start = now;
1391 dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
1392 mds->sessionmap.dump();
1393 }
1394
1395 void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
1396 {
1397 dout(7) << "handle_client_reconnect " << m->get_source()
1398 << (m->has_more() ? " (more)" : "") << dendl;
1399 client_t from = m->get_source().num();
1400 Session *session = mds->get_session(m);
1401 if (!session) {
1402 dout(0) << " ignoring sessionless msg " << *m << dendl;
1403 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
1404 reply->metadata["error_string"] = "sessionless";
1405 mds->send_message(reply, m->get_connection());
1406 return;
1407 }
1408
1409 if (!session->is_open()) {
1410 dout(0) << " ignoring msg from not-open session" << *m << dendl;
1411 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1412 mds->send_message(reply, m->get_connection());
1413 return;
1414 }
1415
1416 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1417
1418 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1419 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1420 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1421 return;
1422 }
1423
1424 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
1425 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1426
1427 bool deny = false;
1428 if (reconnect_all_deny || !mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
1429 // XXX maybe in the future we can do better than this?
1430 if (reconnect_all_deny) {
1431 dout(1) << "mds_deny_all_reconnect was set to speed up reboot phase, ignoring reconnect, sending close" << dendl;
1432 } else {
1433 dout(1) << "no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1434 }
1435 mds->clog->info() << "denied reconnect attempt (mds is "
1436 << ceph_mds_state_name(mds->get_state())
1437 << ") from " << m->get_source_inst()
1438 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
1439 deny = true;
1440 } else {
1441 std::string error_str;
1442 if (!session->is_open()) {
1443 error_str = "session is closed";
1444 } else if (mdcache->is_readonly()) {
1445 error_str = "mds is readonly";
1446 } else {
1447 if (session->info.client_metadata.features.empty())
1448 infer_supported_features(session, session->info.client_metadata);
1449
1450 feature_bitset_t missing_features = required_client_features;
1451 missing_features -= session->info.client_metadata.features;
1452 if (!missing_features.empty()) {
1453 CachedStackStringStream css;
1454 *css << "missing required features '" << missing_features << "'";
1455 error_str = css->strv();
1456 }
1457 }
1458
1459 if (!error_str.empty()) {
1460 deny = true;
1461 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1462 mds->clog->info() << "denied reconnect attempt from "
1463 << m->get_source_inst() << " (" << error_str << ")";
1464 }
1465 }
1466
1467 if (deny) {
1468 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1469 mds->send_message_client(r, session);
1470 if (session->is_open()) {
1471 client_reconnect_denied.insert(session->get_client());
1472 }
1473 return;
1474 }
1475
1476 if (!m->has_more()) {
1477 metrics_handler->add_session(session);
1478 // notify client of success with an OPEN
1479 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1480 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
1481 reply->supported_features = supported_features;
1482 mds->send_message_client(reply, session);
1483 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1484 }
1485
1486 session->last_cap_renew = clock::now();
1487
1488 // snaprealms
1489 for (const auto &r : m->realms) {
1490 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
1491 if (in && in->state_test(CInode::STATE_PURGING))
1492 continue;
1493 if (in) {
1494 if (in->snaprealm) {
1495 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
1496 } else {
1497 // this can happen if we are non-auth or we rollback snaprealm
1498 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
1499 }
1500 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1501 } else {
1502 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1503 << " seq " << r.realm.seq << dendl;
1504 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1505 }
1506 }
1507
1508 // caps
1509 for (const auto &p : m->caps) {
1510 // make sure our last_cap_id is MAX over all issued caps
1511 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1512 mdcache->last_cap_id = p.second.capinfo.cap_id;
1513
1514 CInode *in = mdcache->get_inode(p.first);
1515 if (in && in->state_test(CInode::STATE_PURGING))
1516 continue;
1517 if (in && in->is_auth()) {
1518 // we recovered it, and it's ours. take note.
1519 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
1520 << " on " << *in << dendl;
1521 in->reconnect_cap(from, p.second, session);
1522 mdcache->add_reconnected_cap(from, p.first, p.second);
1523 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
1524 continue;
1525 }
1526
1527 if (in && !in->is_auth()) {
1528 // not mine.
1529 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1530 // add to cap export list.
1531 mdcache->rejoin_export_caps(p.first, from, p.second,
1532 in->authority().first, true);
1533 } else {
1534 // don't know if the inode is mine
1535 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1536 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
1537 }
1538 }
1539
1540 reconnect_last_seen = clock::now();
1541
1542 if (!m->has_more()) {
1543 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1544
1545 // remove from gather set
1546 client_reconnect_gather.erase(from);
1547 session->set_reconnecting(false);
1548 if (client_reconnect_gather.empty())
1549 reconnect_gather_finish();
1550 }
1551 }
1552
1553 void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1554 {
1555 int supported = -1;
1556 auto it = client_metadata.find("ceph_version");
1557 if (it != client_metadata.end()) {
1558 // user space client
1559 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1560 supported = CEPHFS_FEATURE_LUMINOUS;
1561 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1562 supported = CEPHFS_FEATURE_KRAKEN;
1563 } else {
1564 it = client_metadata.find("kernel_version");
1565 if (it != client_metadata.end()) {
1566 // kernel client
1567 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1568 supported = CEPHFS_FEATURE_LUMINOUS;
1569 }
1570 }
1571 if (supported == -1 &&
1572 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1573 supported = CEPHFS_FEATURE_JEWEL;
1574
1575 if (supported >= 0) {
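// cephfs feature bits are assigned in release order, so setting bits 0..supported
// below implicitly marks every older feature as present too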
1576 unsigned long value = (1UL << (supported + 1)) - 1;
1577 client_metadata.features = feature_bitset_t(value);
1578 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1579 }
1580 }
1581
1582 void Server::update_required_client_features()
1583 {
1584 required_client_features = mds->mdsmap->get_required_client_features();
1585 dout(7) << "required_client_features: " << required_client_features << dendl;
1586
1587 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1588 set<Session*> sessions;
1589 mds->sessionmap.get_client_session_set(sessions);
1590 for (auto session : sessions) {
1591 feature_bitset_t missing_features = required_client_features;
1592 missing_features -= session->info.client_metadata.features;
1593 if (!missing_features.empty()) {
1594 bool blocklisted = mds->objecter->with_osdmap(
1595 [session](const OSDMap &osd_map) -> bool {
1596 return osd_map.is_blocklisted(session->info.inst.addr);
1597 });
1598 if (blocklisted)
1599 continue;
1600
1601 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1602 << missing_features << "'";
1603 CachedStackStringStream css;
1604 mds->evict_client(session->get_client().v, false,
1605 g_conf()->mds_session_blocklist_on_evict, *css);
1606 }
1607 }
1608 }
1609 }
1610
1611 void Server::reconnect_gather_finish()
1612 {
1613 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
1614 ceph_assert(reconnect_done);
1615
1616 if (!mds->snapclient->is_synced()) {
1617 // make sure snaptable cache is populated. snaprealms will be
1618 // extensively used in rejoin stage.
1619 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1620 mds->snapclient->wait_for_sync(reconnect_done);
1621 } else {
1622 reconnect_done->complete(0);
1623 }
1624 reconnect_done = NULL;
1625 }
1626
1627 void Server::reconnect_tick()
1628 {
1629 bool reject_all_reconnect = false;
1630 if (reconnect_evicting) {
1631 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
1632 return;
1633 }
1634
1635 /*
1636 * Set mds_deny_all_reconnect to reject every reconnect request, so that
1637 * less metadata is loaded during the rejoin phase. This shortens reboot time.
1638 * Moreover, loading less metadata increases the chance that a standby with less memory can fail over.
1639 *
1640 * Why not simply shorten the reconnect period?
1641 * Clients may send unsafe or retried requests, which had not been
1642 * completed before the old mds stopped, to the new mds. These requests may
1643 * need to be processed during the new mds's clientreplay phase,
1644 * see: https://github.com/ceph/ceph/pull/29059.
1645 */
1646 bool reconnect_all_deny = g_conf().get_val<bool>("mds_deny_all_reconnect");
1647 if (client_reconnect_gather.empty())
1648 return;
1649
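// if deny-all is enabled and every client we are still gathering has already been
// sent a CLOSE (denied), there is no point waiting out the reconnect timeout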
1650 if (reconnect_all_deny && (client_reconnect_gather == client_reconnect_denied))
1651 reject_all_reconnect = true;
1652
1653 auto now = clock::now();
1654 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
1655 if (elapse1 < g_conf()->mds_reconnect_timeout && !reject_all_reconnect)
1656 return;
1657
1658 vector<Session*> remaining_sessions;
1659 remaining_sessions.reserve(client_reconnect_gather.size());
1660 for (auto c : client_reconnect_gather) {
1661 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1662 ceph_assert(session);
1663 remaining_sessions.push_back(session);
1664 // client re-sends cap flush messages before the reconnect message
1665 if (session->last_seen > reconnect_last_seen)
1666 reconnect_last_seen = session->last_seen;
1667 }
1668
1669 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
1670 if (elapse2 < g_conf()->mds_reconnect_timeout / 2 && !reject_all_reconnect) {
1671 dout(7) << "reconnect_tick: last seen " << elapse2
1672 << " seconds ago, extending reconnect interval" << dendl;
1673 return;
1674 }
1675
1676 dout(7) << "reconnect timed out, " << remaining_sessions.size()
1677 << " clients have not reconnected in time" << dendl;
1678
1679 // If we're doing blocklist evictions, use this to wait for them before
1680 // proceeding to reconnect_gather_finish
1681 MDSGatherBuilder gather(g_ceph_context);
1682
1683 for (auto session : remaining_sessions) {
1684 // Keep sessions that have specified a timeout. These sessions prevent the
1685 // mds from going active; it only goes active once they have all been
1686 // killed or reclaimed.
1687 if (session->info.client_metadata.find("timeout") !=
1688 session->info.client_metadata.end()) {
1689 dout(1) << "reconnect keeps " << session->info.inst
1690 << ", need to be reclaimed" << dendl;
1691 client_reclaim_gather.insert(session->get_client());
1692 continue;
1693 }
1694
1695 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
1696
1697 mds->clog->warn() << "evicting unresponsive client " << *session
1698 << ", after waiting " << elapse1
1699 << " seconds during MDS startup";
1700
1701 // make _session_logged() purge orphan objects of lost async/unsafe requests
1702 session->delegated_inos.swap(session->free_prealloc_inos);
1703
1704 if (g_conf()->mds_session_blocklist_on_timeout) {
1705 CachedStackStringStream css;
1706 mds->evict_client(session->get_client().v, false, true, *css,
1707 gather.new_sub());
1708 } else {
1709 kill_session(session, NULL);
1710 }
1711
1712 failed_reconnects++;
1713 }
1714 client_reconnect_gather.clear();
1715 client_reconnect_denied.clear();
1716
1717 if (gather.has_subs()) {
1718 dout(1) << "reconnect will complete once clients are evicted" << dendl;
1719 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
1720 [this](int r){reconnect_gather_finish();})));
1721 gather.activate();
1722 reconnect_evicting = true;
1723 } else {
1724 reconnect_gather_finish();
1725 }
1726 }
1727
1728 void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1729 {
1730 if (!locks.length()) return;
1731 int numlocks;
1732 ceph_filelock lock;
1733 auto p = locks.cbegin();
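// the blob contains two runs of ceph_filelock records, fcntl (POSIX) locks first
// and then flock locks, each run prefixed by its count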
1734 decode(numlocks, p);
1735 for (int i = 0; i < numlocks; ++i) {
1736 decode(lock, p);
1737 lock.client = client;
1738 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1739 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1740 }
1741 decode(numlocks, p);
1742 for (int i = 0; i < numlocks; ++i) {
1743 decode(lock, p);
1744 lock.client = client;
1745 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1746 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1747 }
1748 }
1749
1750 /**
1751 * Call this when the MDCache is oversized, to send requests to the clients
1752 * to trim some caps, and consequently unpin some inodes in the MDCache so
1753 * that it can trim too.
1754 */
1755 std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1756 {
1757 const auto now = clock::now();
1758 const bool steady = !!(flags&RecallFlags::STEADY);
1759 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1760 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1761 const bool trim = !!(flags&RecallFlags::TRIM);
1762
1763 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1764 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1765 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1766 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1767 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
1768 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
1769
1770 dout(7) << __func__ << ":"
1771 << " min=" << min_caps_per_client
1772 << " max=" << max_caps_per_client
1773 << " total=" << Capability::count()
1774 << " flags=" << flags
1775 << dendl;
1776
1777 /* trim caps of sessions with the most caps first */
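// a session qualifies for recall if TRIM was requested, it holds more caps than
// mds_max_caps_per_client, or its cache liveness (recent cap acquisitions) is low
// relative to the number of caps it is pinning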
1778 std::multimap<uint64_t, Session*> caps_session;
1779 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
1780 auto num_caps = s->caps.size();
1781 auto cache_liveness = s->get_session_cache_liveness();
1782 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
1783 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1784 }
1785 };
1786 mds->sessionmap.get_client_sessions(std::move(f));
1787
1788 std::pair<bool, uint64_t> result = {false, 0};
1789 auto& [throttled, caps_recalled] = result;
1790 last_recall_state = now;
1791 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
1792 if (!session->is_open() ||
1793 !session->get_connection() ||
1794 !session->info.inst.name.is_client())
1795 continue;
1796
1797 dout(10) << __func__ << ":"
1798 << " session " << session->info.inst
1799 << " caps " << num_caps
1800 << ", leases " << session->leases.size()
1801 << dendl;
1802
1803 uint64_t newlim;
1804 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1805 newlim = min_caps_per_client;
1806 } else {
1807 newlim = num_caps-recall_max_caps;
1808 }
1809 if (num_caps > newlim) {
1810 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1811 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1812 newlim = num_caps-recall;
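// 'recall' is the number of caps we ask the client to release in this pass;
// 'newlim' is the new cap ceiling carried in the RECALL_STATE message below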
1813 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1814 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1815 const uint64_t global_recall_throttle = recall_throttle.get();
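// three decaying counters gate recalls: two per-session counters and one global
// counter shared by all sessions; tripping a per-session counter skips this
// session, tripping the global one ends the whole pass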
1816 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1817 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1818 throttled = true;
1819 continue;
1820 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1821 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1822 throttled = true;
1823 continue;
1824 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1825 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1826 throttled = true;
1827 break;
1828 }
1829
1830 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1831 if (steady) {
1832 const auto session_recall = session->get_recall_caps();
1833 const auto session_release = session->get_release_caps();
1834 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1835 /* The session has released fewer than half of the caps we have
1836 * recalled from it; additionally, to avoid penalizing sessions we
1837 * have only just begun to recall from, the session_recall counter
1838 * (decayed count of caps recently recalled) must exceed half of the
1839 * session's cap recall decay threshold.
1840 */
1841 dout(15) << " 2*session_release < session_recall"
1842 " (2*" << session_release << " < " << session_recall << ") &&"
1843 " 2*session_recall > recall_max_decay_threshold"
1844 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
1845 " Skipping because we are unlikely to get more released." << dendl;
1846 continue;
1847 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1848 /* The number of caps we would recall is less than the number we *could*
1849 * recall (so there isn't much left to recall?) and it is also less than
1850 * half of the current recall_caps counter (decayed count of caps
1851 * recently recalled).
1852 */
1853 dout(15) << " 2*recall < session_recall "
1854 " (2*" << recall << " < " << session_recall << ") &&"
1855 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1856 " Skipping because we are unlikely to get more released." << dendl;
1857 continue;
1858 }
1859 }
1860
1861 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1862
1863 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
1864 m->head.max_caps = newlim;
1865 mds->send_message_client(m, session);
1866 if (gather) {
1867 flush_session(session, *gather);
1868 }
1869 caps_recalled += session->notify_recall_sent(newlim);
1870 recall_throttle.hit(recall);
1871 }
1872 }
1873
1874 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1875
1876 return result;
1877 }
1878
1879 void Server::force_clients_readonly()
1880 {
1881 dout(10) << "force_clients_readonly" << dendl;
1882 set<Session*> sessions;
1883 mds->sessionmap.get_client_session_set(sessions);
1884 for (set<Session*>::const_iterator p = sessions.begin();
1885 p != sessions.end();
1886 ++p) {
1887 Session *session = *p;
1888 if (!session->info.inst.name.is_client() ||
1889 !(session->is_open() || session->is_stale()))
1890 continue;
1891 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1892 }
1893 }
1894
1895 /*******
1896 * some generic stuff for finishing off requests
1897 */
1898 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1899 {
1900 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1901 ceph_assert(!mdr->has_completed);
1902
1903 // note trace items for eventual reply.
1904 mdr->tracei = in;
1905 if (in)
1906 mdr->pin(in);
1907
1908 mdr->tracedn = dn;
1909 if (dn)
1910 mdr->pin(dn);
1911
1912 early_reply(mdr, in, dn);
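// if an early (unsafe) reply was sent, the safe reply follows once the log
// event below has been journaled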
1913
1914 mdr->committing = true;
1915 submit_mdlog_entry(le, fin, mdr, __func__);
1916
1917 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1918 if (mds->queue_one_replay()) {
1919 dout(10) << " queued next replay op" << dendl;
1920 } else {
1921 dout(10) << " journaled last replay op" << dendl;
1922 }
1923 } else if (mdr->did_early_reply)
1924 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1925 else
1926 mdlog->flush();
1927 }
1928
1929 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1930 std::string_view event)
1931 {
1932 if (mdr) {
1933 string event_str("submit entry: ");
1934 event_str += event;
1935 mdr->mark_event(event_str);
1936 }
1937 mdlog->submit_entry(le, fin);
1938 }
1939
1940 /*
1941 * send response built from mdr contents and error code; clean up mdr
1942 */
1943 void Server::respond_to_request(MDRequestRef& mdr, int r)
1944 {
1945 if (mdr->client_request) {
1946 if (mdr->is_batch_head()) {
1947 dout(20) << __func__ << " batch head " << *mdr << dendl;
1948 mdr->release_batch_op()->respond(r);
1949 } else {
1950 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
1951 }
1952 } else if (mdr->internal_op > -1) {
1953 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1954 if (!mdr->internal_op_finish)
1955 ceph_abort_msg("trying to respond to internal op without finisher");
1956 mdr->internal_op_finish->complete(r);
1957 mdcache->request_finish(mdr);
1958 }
1959 }
1960
1961 // record per-op request latency in the MDS perf counters
1962 void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
1963 {
1964 int code = l_mdss_first;
1965 switch(req->get_op()) {
1966 case CEPH_MDS_OP_LOOKUPHASH:
1967 code = l_mdss_req_lookuphash_latency;
1968 break;
1969 case CEPH_MDS_OP_LOOKUPINO:
1970 code = l_mdss_req_lookupino_latency;
1971 break;
1972 case CEPH_MDS_OP_LOOKUPPARENT:
1973 code = l_mdss_req_lookupparent_latency;
1974 break;
1975 case CEPH_MDS_OP_LOOKUPNAME:
1976 code = l_mdss_req_lookupname_latency;
1977 break;
1978 case CEPH_MDS_OP_LOOKUP:
1979 code = l_mdss_req_lookup_latency;
1980 break;
1981 case CEPH_MDS_OP_LOOKUPSNAP:
1982 code = l_mdss_req_lookupsnap_latency;
1983 break;
1984 case CEPH_MDS_OP_GETATTR:
1985 code = l_mdss_req_getattr_latency;
1986 break;
1987 case CEPH_MDS_OP_SETATTR:
1988 code = l_mdss_req_setattr_latency;
1989 break;
1990 case CEPH_MDS_OP_SETLAYOUT:
1991 code = l_mdss_req_setlayout_latency;
1992 break;
1993 case CEPH_MDS_OP_SETDIRLAYOUT:
1994 code = l_mdss_req_setdirlayout_latency;
1995 break;
1996 case CEPH_MDS_OP_SETXATTR:
1997 code = l_mdss_req_setxattr_latency;
1998 break;
1999 case CEPH_MDS_OP_RMXATTR:
2000 code = l_mdss_req_rmxattr_latency;
2001 break;
2002 case CEPH_MDS_OP_READDIR:
2003 code = l_mdss_req_readdir_latency;
2004 break;
2005 case CEPH_MDS_OP_SETFILELOCK:
2006 code = l_mdss_req_setfilelock_latency;
2007 break;
2008 case CEPH_MDS_OP_GETFILELOCK:
2009 code = l_mdss_req_getfilelock_latency;
2010 break;
2011 case CEPH_MDS_OP_CREATE:
2012 code = l_mdss_req_create_latency;
2013 break;
2014 case CEPH_MDS_OP_OPEN:
2015 code = l_mdss_req_open_latency;
2016 break;
2017 case CEPH_MDS_OP_MKNOD:
2018 code = l_mdss_req_mknod_latency;
2019 break;
2020 case CEPH_MDS_OP_LINK:
2021 code = l_mdss_req_link_latency;
2022 break;
2023 case CEPH_MDS_OP_UNLINK:
2024 code = l_mdss_req_unlink_latency;
2025 break;
2026 case CEPH_MDS_OP_RMDIR:
2027 code = l_mdss_req_rmdir_latency;
2028 break;
2029 case CEPH_MDS_OP_RENAME:
2030 code = l_mdss_req_rename_latency;
2031 break;
2032 case CEPH_MDS_OP_MKDIR:
2033 code = l_mdss_req_mkdir_latency;
2034 break;
2035 case CEPH_MDS_OP_SYMLINK:
2036 code = l_mdss_req_symlink_latency;
2037 break;
2038 case CEPH_MDS_OP_LSSNAP:
2039 code = l_mdss_req_lssnap_latency;
2040 break;
2041 case CEPH_MDS_OP_MKSNAP:
2042 code = l_mdss_req_mksnap_latency;
2043 break;
2044 case CEPH_MDS_OP_RMSNAP:
2045 code = l_mdss_req_rmsnap_latency;
2046 break;
2047 case CEPH_MDS_OP_RENAMESNAP:
2048 code = l_mdss_req_renamesnap_latency;
2049 break;
2050 default: ceph_abort();
2051 }
2052 logger->tinc(code, lat);
2053 }
2054
2055 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
2056 {
2057 if (!g_conf()->mds_early_reply)
2058 return;
2059
2060 if (mdr->no_early_reply) {
2061 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
2062 return;
2063 }
2064
2065 if (mdr->has_more() && mdr->more()->has_journaled_peers) {
2066 dout(10) << "early_reply - there are journaled peers, not allowed." << dendl;
2067 return;
2068 }
2069
2070 if (mdr->alloc_ino) {
2071 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2072 return;
2073 }
2074
2075 const cref_t<MClientRequest> &req = mdr->client_request;
2076 entity_inst_t client_inst = req->get_source_inst();
2077 if (client_inst.name.is_mds())
2078 return;
2079
2080 if (req->is_replay()) {
2081 dout(10) << " no early reply on replay op" << dendl;
2082 return;
2083 }
2084
2085
2086 auto reply = make_message<MClientReply>(*req, 0);
2087 reply->set_unsafe();
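// an unsafe reply lets the client proceed immediately, but the client must keep
// the request around until the final (safe) reply arrives after journaling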
2088
2089 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2090 //
2091 // _rename_finish() does not send dentry link/unlink messages to replicas,
2092 // so do not mark xlocks on dentries "done"; the xlocks prevent dentries
2093 // that have projected linkages from getting new replicas.
2094 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2095
2096 dout(10) << "early_reply " << reply->get_result()
2097 << " (" << cpp_strerror(reply->get_result())
2098 << ") " << *req << dendl;
2099
2100 if (tracei || tracedn) {
2101 if (tracei)
2102 mdr->cap_releases.erase(tracei->vino());
2103 if (tracedn)
2104 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2105
2106 set_trace_dist(reply, tracei, tracedn, mdr);
2107 }
2108
2109 reply->set_extra_bl(mdr->reply_extra_bl);
2110 mds->send_message_client(reply, mdr->session);
2111
2112 mdr->did_early_reply = true;
2113
2114 mds->logger->inc(l_mds_reply);
2115 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2116 mds->logger->tinc(l_mds_reply_latency, lat);
2117 if (client_inst.name.is_client()) {
2118 mds->sessionmap.hit_session(mdr->session);
2119 }
2120 perf_gather_op_latency(req, lat);
2121 dout(20) << "lat " << lat << dendl;
2122
2123 mdr->mark_event("early_replied");
2124 }
2125
2126 /*
2127 * send given reply
2128 * include a trace to tracei
2129 * Clean up mdr
2130 */
2131 void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
2132 {
2133 ceph_assert(mdr.get());
2134 const cref_t<MClientRequest> &req = mdr->client_request;
2135
2136 dout(7) << "reply_client_request " << reply->get_result()
2137 << " (" << cpp_strerror(reply->get_result())
2138 << ") " << *req << dendl;
2139
2140 mdr->mark_event("replying");
2141
2142 Session *session = mdr->session;
2143
2144 // note successful request in session map?
2145 //
2146 // setfilelock requests are special: they only modify state in MDS memory,
2147 // and that state is lost when the MDS fails. If a client re-sends a completed
2148 // setfilelock request, it means the client did not receive the corresponding
2149 // setfilelock reply, so the MDS should re-execute the request.
2150 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2151 reply->get_result() == 0 && session) {
2152 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2153 session->add_completed_request(mdr->reqid.tid, created);
2154 if (mdr->ls) {
2155 mdr->ls->touched_sessions.insert(session->info.inst.name);
2156 }
2157 }
2158
2159 // give any preallocated inos to the session
2160 apply_allocated_inos(mdr, session);
2161
2162 // get tracei/tracedn from mdr?
2163 CInode *tracei = mdr->tracei;
2164 CDentry *tracedn = mdr->tracedn;
2165
2166 bool is_replay = mdr->client_request->is_replay();
2167 bool did_early_reply = mdr->did_early_reply;
2168 entity_inst_t client_inst = req->get_source_inst();
2169
2170 if (!did_early_reply && !is_replay) {
2171
2172 mds->logger->inc(l_mds_reply);
2173 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2174 mds->logger->tinc(l_mds_reply_latency, lat);
2175 if (session && client_inst.name.is_client()) {
2176 mds->sessionmap.hit_session(session);
2177 }
2178 perf_gather_op_latency(req, lat);
2179 dout(20) << "lat " << lat << dendl;
2180
2181 if (tracei)
2182 mdr->cap_releases.erase(tracei->vino());
2183 if (tracedn)
2184 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2185 }
2186
2187 // drop non-rdlocks before replying, so that we can issue leases
2188 mdcache->request_drop_non_rdlocks(mdr);
2189
2190 // reply at all?
2191 if (session && !client_inst.name.is_mds()) {
2192 // send reply.
2193 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2194 (tracei || tracedn)) {
2195 if (is_replay) {
2196 if (tracei)
2197 mdcache->try_reconnect_cap(tracei, session);
2198 } else {
2199 // include metadata in reply
2200 set_trace_dist(reply, tracei, tracedn, mdr);
2201 }
2202 }
2203
2204 // We can set the extra bl unconditionally: if it's already been sent in the
2205 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2206 reply->set_extra_bl(mdr->reply_extra_bl);
2207
2208 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
2209 mds->send_message_client(reply, session);
2210 }
2211
2212 if (req->is_queued_for_replay() &&
2213 (mdr->has_completed || reply->get_result() < 0)) {
2214 if (reply->get_result() < 0) {
2215 int r = reply->get_result();
2216 derr << "reply_client_request: failed to replay " << *req
2217 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2218 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2219 }
2220 mds->queue_one_replay();
2221 }
2222
2223 // clean up request
2224 mdcache->request_finish(mdr);
2225
2226 // take a closer look at tracei, if it happens to be a remote link
2227 if (tracei &&
2228 tracedn &&
2229 tracedn->get_projected_linkage()->is_remote()) {
2230 mdcache->eval_remote(tracedn);
2231 }
2232 }
2233
2234 /*
2235 * pass the target inode and/or the dentry used to reach it
2236 *
2237 * the trace is encoded parent-first: parent inode, dirstat and dentry (with lease) come before the target inode, which is last
2238 */
2239 void Server::set_trace_dist(const ref_t<MClientReply> &reply,
2240 CInode *in, CDentry *dn,
2241 MDRequestRef& mdr)
2242 {
2243 // skip doing this for debugging purposes?
2244 if (g_conf()->mds_inject_traceless_reply_probability &&
2245 mdr->ls && !mdr->o_trunc &&
2246 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2247 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2248 return;
2249 }
2250
2251 // inode, dentry, dir, ..., inode
2252 bufferlist bl;
2253 mds_rank_t whoami = mds->get_nodeid();
2254 Session *session = mdr->session;
2255 snapid_t snapid = mdr->snapid;
2256 utime_t now = ceph_clock_now();
2257
2258 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2259
2260 // realm
2261 if (snapid == CEPH_NOSNAP) {
2262 SnapRealm *realm;
2263 if (in)
2264 realm = in->find_snaprealm();
2265 else
2266 realm = dn->get_dir()->get_inode()->find_snaprealm();
2267 reply->snapbl = realm->get_snap_trace();
2268 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2269 }
2270
2271 // dir + dentry?
2272 if (dn) {
2273 reply->head.is_dentry = 1;
2274 CDir *dir = dn->get_dir();
2275 CInode *diri = dir->get_inode();
2276
2277 diri->encode_inodestat(bl, session, NULL, snapid);
2278 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2279
2280 #ifdef MDS_VERIFY_FRAGSTAT
2281 if (dir->is_complete())
2282 dir->verify_fragstat();
2283 #endif
2284 DirStat ds;
2285 ds.frag = dir->get_frag();
2286 ds.auth = dir->get_dir_auth().first;
2287 if (dir->is_auth() && !forward_all_requests_to_auth)
2288 dir->get_dist_spec(ds.dist, whoami);
2289
2290 dir->encode_dirstat(bl, session->info, ds);
2291 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2292
2293 encode(dn->get_name(), bl);
2294
2295 int lease_mask = 0;
2296 CDentry::linkage_t *dnl = dn->get_linkage(mdr->get_client(), mdr);
2297 if (dnl->is_primary()) {
2298 ceph_assert(dnl->get_inode() == in);
2299 lease_mask = CEPH_LEASE_PRIMARY_LINK;
2300 } else {
2301 if (dnl->is_remote())
2302 ceph_assert(dnl->get_remote_ino() == in->ino());
2303 else
2304 ceph_assert(!in);
2305 }
2306 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, bl);
2307 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
2308 } else
2309 reply->head.is_dentry = 0;
2310
2311 // inode
2312 if (in) {
2313 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2314 dout(20) << "set_trace_dist added in " << *in << dendl;
2315 reply->head.is_target = 1;
2316 } else
2317 reply->head.is_target = 0;
2318
2319 reply->set_trace(bl);
2320 }
2321
2322 void Server::handle_client_request(const cref_t<MClientRequest> &req)
2323 {
2324 dout(4) << "handle_client_request " << *req << dendl;
2325
2326 if (mds->logger)
2327 mds->logger->inc(l_mds_request);
2328 if (logger)
2329 logger->inc(l_mdss_handle_client_request);
2330
2331 if (!mdcache->is_open()) {
2332 dout(5) << "waiting for root" << dendl;
2333 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2334 return;
2335 }
2336
2337 bool sessionclosed_isok = replay_unsafe_with_closed_session;
2338 // active session?
2339 Session *session = 0;
2340 if (req->get_source().is_client()) {
2341 session = mds->get_session(req);
2342 if (!session) {
2343 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
2344 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
2345 session->is_closing() ||
2346 session->is_killing()) {
2347 dout(5) << "session closed|closing|killing, dropping" << dendl;
2348 session = NULL;
2349 }
2350 if (!session) {
2351 if (req->is_queued_for_replay())
2352 mds->queue_one_replay();
2353 return;
2354 }
2355 }
2356
2357 // old mdsmap?
2358 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2359 // send it? hrm, this isn't ideal; they may get a lot of copies if
2360 // they have a high request rate.
2361 }
2362
2363 // completed request?
2364 bool has_completed = false;
2365 if (req->is_replay() || req->get_retry_attempt()) {
2366 ceph_assert(session);
2367 inodeno_t created;
2368 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2369 has_completed = true;
2370 if (!session->is_open())
2371 return;
2372 // Don't send a traceless reply if the completed request created a
2373 // new inode; treat the request as a lookup request instead.
2374 if (req->is_replay() ||
2375 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2376 req->get_op() != CEPH_MDS_OP_OPEN &&
2377 req->get_op() != CEPH_MDS_OP_CREATE)) {
2378 dout(5) << "already completed " << req->get_reqid() << dendl;
2379 auto reply = make_message<MClientReply>(*req, 0);
2380 if (created != inodeno_t()) {
2381 bufferlist extra;
2382 encode(created, extra);
2383 reply->set_extra_bl(extra);
2384 }
2385 mds->send_message_client(reply, session);
2386
2387 if (req->is_queued_for_replay())
2388 mds->queue_one_replay();
2389
2390 return;
2391 }
2392 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2393 req->get_op() != CEPH_MDS_OP_CREATE) {
2394 dout(10) << " completed request which created new inode " << created
2395 << ", convert it to lookup request" << dendl;
2396 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2397 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2398 }
2399 }
2400 }
2401
2402 // trim completed_request list
2403 if (req->get_oldest_client_tid() > 0) {
2404 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
2405 ceph_assert(session);
2406 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2407 // The session's 'completed_requests' was dirtied; mark it to be
2408 // potentially flushed at segment expiry.
2409 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2410
2411 if (session->get_num_trim_requests_warnings() > 0 &&
2412 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2413 session->reset_num_trim_requests_warnings();
2414 } else {
2415 if (session->get_num_completed_requests() >=
2416 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
2417 session->inc_num_trim_requests_warnings();
2418 CachedStackStringStream css;
2419 *css << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2420 << req->get_oldest_client_tid() << "), "
2421 << session->get_num_completed_requests()
2422 << " completed requests recorded in session\n";
2423 mds->clog->warn() << css->strv();
2424 dout(20) << __func__ << " " << css->strv() << dendl;
2425 }
2426 }
2427 }
2428
2429 // register + dispatch
2430 MDRequestRef mdr = mdcache->request_start(req);
2431 if (!mdr.get())
2432 return;
2433
2434 if (session) {
2435 mdr->session = session;
2436 session->requests.push_back(&mdr->item_session_request);
2437 }
2438
2439 if (has_completed)
2440 mdr->has_completed = true;
2441
2442 // process embedded cap releases?
2443 // (only if NOT replay!)
2444 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2445 client_t client = req->get_source().num();
2446 for (const auto &r : req->releases) {
2447 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2448 }
2449 req->releases.clear();
2450 }
2451
2452 dispatch_client_request(mdr);
2453 return;
2454 }
2455
2456 void Server::handle_osd_map()
2457 {
2458 /* Note that we check the OSDMAP_FULL flag directly rather than
2459 * using osdmap_full_flag(), because we want to know "is the flag set"
2460 * rather than "does the flag apply to us?" */
2461 mds->objecter->with_osdmap([this](const OSDMap& o) {
2462 auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool());
2463 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2464 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2465 << o.get_epoch() << dendl;
2466 });
2467 }
2468
2469 void Server::dispatch_client_request(MDRequestRef& mdr)
2470 {
2471 // we shouldn't be waiting on anyone.
2472 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_peer.empty());
2473
2474 if (mdr->killed) {
2475 dout(10) << "request " << *mdr << " was killed" << dendl;
2476 // if the mdr is a "batch_op" and it has followers, pick a follower as
2477 // the new "head of the batch ops" and go on processing the new one.
2478 if (mdr->is_batch_head()) {
2479 int mask = mdr->client_request->head.args.getattr.mask;
2480 auto it = mdr->batch_op_map->find(mask);
2481 auto new_batch_head = it->second->find_new_head();
2482 if (!new_batch_head) {
2483 mdr->batch_op_map->erase(it);
2484 return;
2485 }
2486 mdr = std::move(new_batch_head);
2487 } else {
2488 return;
2489 }
2490 } else if (mdr->aborted) {
2491 mdr->aborted = false;
2492 mdcache->request_kill(mdr);
2493 return;
2494 }
2495
2496 const cref_t<MClientRequest> &req = mdr->client_request;
2497
2498 if (logger) logger->inc(l_mdss_dispatch_client_request);
2499
2500 dout(7) << "dispatch_client_request " << *req << dendl;
2501
2502 if (req->may_write() && mdcache->is_readonly()) {
2503 dout(10) << " read-only FS" << dendl;
2504 respond_to_request(mdr, -CEPHFS_EROFS);
2505 return;
2506 }
2507 if (mdr->has_more() && mdr->more()->peer_error) {
2508 dout(10) << " got error from peers" << dendl;
2509 respond_to_request(mdr, mdr->more()->peer_error);
2510 return;
2511 }
2512
2513 if (is_full) {
2514 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2515 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2517 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2518 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2519 req->get_op() == CEPH_MDS_OP_CREATE ||
2520 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2521 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2522 ((req->get_op() == CEPH_MDS_OP_LINK ||
2523 req->get_op() == CEPH_MDS_OP_RENAME) &&
2524 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started peer request
2525 ) {
2526
2527 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2528 respond_to_request(mdr, -CEPHFS_ENOSPC);
2529 return;
2530 } else {
2531 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2532 }
2533 }
2534
2535 switch (req->get_op()) {
2536 case CEPH_MDS_OP_LOOKUPHASH:
2537 case CEPH_MDS_OP_LOOKUPINO:
2538 handle_client_lookup_ino(mdr, false, false);
2539 break;
2540 case CEPH_MDS_OP_LOOKUPPARENT:
2541 handle_client_lookup_ino(mdr, true, false);
2542 break;
2543 case CEPH_MDS_OP_LOOKUPNAME:
2544 handle_client_lookup_ino(mdr, false, true);
2545 break;
2546
2547 // inodes ops.
2548 case CEPH_MDS_OP_LOOKUP:
2549 handle_client_getattr(mdr, true);
2550 break;
2551
2552 case CEPH_MDS_OP_LOOKUPSNAP:
2553 // lookupsnap does not reference a CDentry; treat it as a getattr
2554 case CEPH_MDS_OP_GETATTR:
2555 handle_client_getattr(mdr, false);
2556 break;
2557
2558 case CEPH_MDS_OP_SETATTR:
2559 handle_client_setattr(mdr);
2560 break;
2561 case CEPH_MDS_OP_SETLAYOUT:
2562 handle_client_setlayout(mdr);
2563 break;
2564 case CEPH_MDS_OP_SETDIRLAYOUT:
2565 handle_client_setdirlayout(mdr);
2566 break;
2567 case CEPH_MDS_OP_SETXATTR:
2568 handle_client_setxattr(mdr);
2569 break;
2570 case CEPH_MDS_OP_RMXATTR:
2571 handle_client_removexattr(mdr);
2572 break;
2573
2574 case CEPH_MDS_OP_READDIR:
2575 handle_client_readdir(mdr);
2576 break;
2577
2578 case CEPH_MDS_OP_SETFILELOCK:
2579 handle_client_file_setlock(mdr);
2580 break;
2581
2582 case CEPH_MDS_OP_GETFILELOCK:
2583 handle_client_file_readlock(mdr);
2584 break;
2585
2586 // funky.
2587 case CEPH_MDS_OP_CREATE:
2588 if (mdr->has_completed)
2589 handle_client_open(mdr); // already created.. just open
2590 else
2591 handle_client_openc(mdr);
2592 break;
2593
2594 case CEPH_MDS_OP_OPEN:
2595 handle_client_open(mdr);
2596 break;
2597
2598 // namespace.
2599 // no prior locks.
2600 case CEPH_MDS_OP_MKNOD:
2601 handle_client_mknod(mdr);
2602 break;
2603 case CEPH_MDS_OP_LINK:
2604 handle_client_link(mdr);
2605 break;
2606 case CEPH_MDS_OP_UNLINK:
2607 case CEPH_MDS_OP_RMDIR:
2608 handle_client_unlink(mdr);
2609 break;
2610 case CEPH_MDS_OP_RENAME:
2611 handle_client_rename(mdr);
2612 break;
2613 case CEPH_MDS_OP_MKDIR:
2614 handle_client_mkdir(mdr);
2615 break;
2616 case CEPH_MDS_OP_SYMLINK:
2617 handle_client_symlink(mdr);
2618 break;
2619
2620
2621 // snaps
2622 case CEPH_MDS_OP_LSSNAP:
2623 handle_client_lssnap(mdr);
2624 break;
2625 case CEPH_MDS_OP_MKSNAP:
2626 handle_client_mksnap(mdr);
2627 break;
2628 case CEPH_MDS_OP_RMSNAP:
2629 handle_client_rmsnap(mdr);
2630 break;
2631 case CEPH_MDS_OP_RENAMESNAP:
2632 handle_client_renamesnap(mdr);
2633 break;
2634
2635 default:
2636 dout(1) << " unknown client op " << req->get_op() << dendl;
2637 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
2638 }
2639 }
2640
2641
2642 // ---------------------------------------
2643 // PEER REQUESTS
2644
2645 void Server::handle_peer_request(const cref_t<MMDSPeerRequest> &m)
2646 {
2647 dout(4) << "handle_peer_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2648 mds_rank_t from = mds_rank_t(m->get_source().num());
2649
2650 if (logger) logger->inc(l_mdss_handle_peer_request);
2651
2652 // reply?
2653 if (m->is_reply())
2654 return handle_peer_request_reply(m);
2655
2656 // The purpose of rename notify is to enforce causal message ordering: making sure
2657 // bystanders have received all messages from the rename srcdn's auth MDS.
2658 if (m->get_op() == MMDSPeerRequest::OP_RENAMENOTIFY) {
2659 auto reply = make_message<MMDSPeerRequest>(m->get_reqid(), m->get_attempt(), MMDSPeerRequest::OP_RENAMENOTIFYACK);
2660 mds->send_message(reply, m->get_connection());
2661 return;
2662 }
2663
2664 CDentry *straydn = NULL;
2665 if (m->straybl.length() > 0) {
2666 mdcache->decode_replica_stray(straydn, m->straybl, from);
2667 ceph_assert(straydn);
2668 m->straybl.clear();
2669 }
2670
2671 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2672 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2673 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2674 return;
2675 }
2676
2677 // am i a new peer?
2678 MDRequestRef mdr;
2679 if (mdcache->have_request(m->get_reqid())) {
2680 // existing?
2681 mdr = mdcache->request_get(m->get_reqid());
2682
2683 // is my request newer?
2684 if (mdr->attempt > m->get_attempt()) {
2685 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2686 << ", dropping " << *m << dendl;
2687 return;
2688 }
2689
2690 if (mdr->attempt < m->get_attempt()) {
2691 // mine is old, close it out
2692 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2693 << ", closing out" << dendl;
2694 mdcache->request_finish(mdr);
2695 mdr.reset();
2696 } else if (mdr->peer_to_mds != from) {
2697 dout(10) << "local request " << *mdr << " not peer to mds." << from << dendl;
2698 return;
2699 }
2700
2701 // may get these while mdr->peer_request is non-null
2702 if (m->get_op() == MMDSPeerRequest::OP_DROPLOCKS) {
2703 mds->locker->drop_locks(mdr.get());
2704 return;
2705 }
2706 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2707 if (m->is_abort()) {
2708 mdr->aborted = true;
2709 if (mdr->peer_request) {
2710 // only abort on-going xlock, wrlock and auth pin
2711 ceph_assert(!mdr->peer_did_prepare());
2712 } else {
2713 mdcache->request_finish(mdr);
2714 }
2715 } else {
2716 if (m->inode_export.length() > 0)
2717 mdr->more()->inode_import = m->inode_export;
2718 // finish off request.
2719 mdcache->request_finish(mdr);
2720 }
2721 return;
2722 }
2723 }
2724 if (!mdr.get()) {
2725 // new?
2726 if (m->get_op() == MMDSPeerRequest::OP_FINISH) {
2727 dout(10) << "missing peer request for " << m->get_reqid()
2728 << " OP_FINISH, must have lost race with a forward" << dendl;
2729 return;
2730 }
2731 mdr = mdcache->request_start_peer(m->get_reqid(), m->get_attempt(), m);
2732 mdr->set_op_stamp(m->op_stamp);
2733 }
2734 ceph_assert(mdr->peer_request == 0); // only one at a time, please!
2735
2736 if (straydn) {
2737 mdr->pin(straydn);
2738 mdr->straydn = straydn;
2739 }
2740
2741 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2742 mdr->locks.empty()) {
2743 dout(3) << "not active yet, waiting" << dendl;
2744 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2745 return;
2746 }
2747
2748 mdr->reset_peer_request(m);
2749
2750 dispatch_peer_request(mdr);
2751 }
2752
2753 void Server::handle_peer_request_reply(const cref_t<MMDSPeerRequest> &m)
2754 {
2755 mds_rank_t from = mds_rank_t(m->get_source().num());
2756
2757 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2758 metareqid_t r = m->get_reqid();
2759 if (!mdcache->have_uncommitted_leader(r, from)) {
2760 dout(10) << "handle_peer_request_reply ignoring peer reply from mds."
2761 << from << " reqid " << r << dendl;
2762 return;
2763 }
2764 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2765 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2766 return;
2767 }
2768
2769 if (m->get_op() == MMDSPeerRequest::OP_COMMITTED) {
2770 metareqid_t r = m->get_reqid();
2771 mdcache->committed_leader_peer(r, from);
2772 return;
2773 }
2774
2775 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2776 if (m->get_attempt() != mdr->attempt) {
2777 dout(10) << "handle_peer_request_reply " << *mdr << " ignoring reply from other attempt "
2778 << m->get_attempt() << dendl;
2779 return;
2780 }
2781
2782 switch (m->get_op()) {
2783 case MMDSPeerRequest::OP_XLOCKACK:
2784 {
2785 // identify lock, leader request
2786 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2787 m->get_object_info());
2788 mdr->more()->peers.insert(from);
2789 lock->decode_locked_state(m->get_lock_data());
2790 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2791 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
2792 mdr->finish_locking(lock);
2793 lock->get_xlock(mdr, mdr->get_client());
2794
2795 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2796 mdr->more()->waiting_on_peer.erase(from);
2797 ceph_assert(mdr->more()->waiting_on_peer.empty());
2798 mdcache->dispatch_request(mdr);
2799 }
2800 break;
2801
2802 case MMDSPeerRequest::OP_WRLOCKACK:
2803 {
2804 // identify lock, leader request
2805 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2806 m->get_object_info());
2807 mdr->more()->peers.insert(from);
2808 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2809 auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2810 ceph_assert(it->is_remote_wrlock());
2811 ceph_assert(it->wrlock_target == from);
2812
2813 mdr->finish_locking(lock);
2814
2815 ceph_assert(mdr->more()->waiting_on_peer.count(from));
2816 mdr->more()->waiting_on_peer.erase(from);
2817 ceph_assert(mdr->more()->waiting_on_peer.empty());
2818 mdcache->dispatch_request(mdr);
2819 }
2820 break;
2821
2822 case MMDSPeerRequest::OP_AUTHPINACK:
2823 handle_peer_auth_pin_ack(mdr, m);
2824 break;
2825
2826 case MMDSPeerRequest::OP_LINKPREPACK:
2827 handle_peer_link_prep_ack(mdr, m);
2828 break;
2829
2830 case MMDSPeerRequest::OP_RMDIRPREPACK:
2831 handle_peer_rmdir_prep_ack(mdr, m);
2832 break;
2833
2834 case MMDSPeerRequest::OP_RENAMEPREPACK:
2835 handle_peer_rename_prep_ack(mdr, m);
2836 break;
2837
2838 case MMDSPeerRequest::OP_RENAMENOTIFYACK:
2839 handle_peer_rename_notify_ack(mdr, m);
2840 break;
2841
2842 default:
2843 ceph_abort();
2844 }
2845 }
2846
2847 void Server::dispatch_peer_request(MDRequestRef& mdr)
2848 {
2849 dout(7) << "dispatch_peer_request " << *mdr << " " << *mdr->peer_request << dendl;
2850
2851 if (mdr->aborted) {
2852 dout(7) << " abort flag set, finishing" << dendl;
2853 mdcache->request_finish(mdr);
2854 return;
2855 }
2856
2857 if (logger) logger->inc(l_mdss_dispatch_peer_request);
2858
2859 int op = mdr->peer_request->get_op();
2860 switch (op) {
2861 case MMDSPeerRequest::OP_XLOCK:
2862 case MMDSPeerRequest::OP_WRLOCK:
2863 {
2864 // identify object
2865 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2866 mdr->peer_request->get_object_info());
2867
2868 if (!lock) {
2869 dout(10) << "don't have object, dropping" << dendl;
2870 ceph_abort(); // can this happen if we auth pinned properly?
2871 }
2872 if (op == MMDSPeerRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2873 dout(10) << "not auth for remote xlock attempt, dropping on "
2874 << *lock << " on " << *lock->get_parent() << dendl;
2875 } else {
2876 // use acquire_locks so that we get auth_pinning.
2877 MutationImpl::LockOpVec lov;
2878 for (const auto& p : mdr->locks) {
2879 if (p.is_xlock())
2880 lov.add_xlock(p.lock);
2881 else if (p.is_wrlock())
2882 lov.add_wrlock(p.lock);
2883 }
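// re-list the locks this peer request already holds so acquire_locks() retains
// them while taking the new lock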
2884
2885 int replycode = 0;
2886 switch (op) {
2887 case MMDSPeerRequest::OP_XLOCK:
2888 lov.add_xlock(lock);
2889 replycode = MMDSPeerRequest::OP_XLOCKACK;
2890 break;
2891 case MMDSPeerRequest::OP_WRLOCK:
2892 lov.add_wrlock(lock);
2893 replycode = MMDSPeerRequest::OP_WRLOCKACK;
2894 break;
2895 }
2896
2897 if (!mds->locker->acquire_locks(mdr, lov))
2898 return;
2899
2900 // ack
2901 auto r = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, replycode);
2902 r->set_lock_type(lock->get_type());
2903 lock->get_parent()->set_object_info(r->get_object_info());
2904 if (replycode == MMDSPeerRequest::OP_XLOCKACK)
2905 lock->encode_locked_state(r->get_lock_data());
2906 mds->send_message(r, mdr->peer_request->get_connection());
2907 }
2908
2909 // done.
2910 mdr->reset_peer_request();
2911 }
2912 break;
2913
2914 case MMDSPeerRequest::OP_UNXLOCK:
2915 case MMDSPeerRequest::OP_UNWRLOCK:
2916 {
2917 SimpleLock *lock = mds->locker->get_lock(mdr->peer_request->get_lock_type(),
2918 mdr->peer_request->get_object_info());
2919 ceph_assert(lock);
2920 auto it = mdr->locks.find(lock);
2921 ceph_assert(it != mdr->locks.end());
2922 bool need_issue = false;
2923 switch (op) {
2924 case MMDSPeerRequest::OP_UNXLOCK:
2925 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
2926 break;
2927 case MMDSPeerRequest::OP_UNWRLOCK:
2928 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
2929 break;
2930 }
2931 if (need_issue)
2932 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2933
2934 // done. no ack necessary.
2935 mdr->reset_peer_request();
2936 }
2937 break;
2938
2939 case MMDSPeerRequest::OP_AUTHPIN:
2940 handle_peer_auth_pin(mdr);
2941 break;
2942
2943 case MMDSPeerRequest::OP_LINKPREP:
2944 case MMDSPeerRequest::OP_UNLINKPREP:
2945 handle_peer_link_prep(mdr);
2946 break;
2947
2948 case MMDSPeerRequest::OP_RMDIRPREP:
2949 handle_peer_rmdir_prep(mdr);
2950 break;
2951
2952 case MMDSPeerRequest::OP_RENAMEPREP:
2953 handle_peer_rename_prep(mdr);
2954 break;
2955
2956 default:
2957 ceph_abort();
2958 }
2959 }
2960
2961 void Server::handle_peer_auth_pin(MDRequestRef& mdr)
2962 {
2963 dout(10) << "handle_peer_auth_pin " << *mdr << dendl;
2964
2965 // build list of objects
2966 list<MDSCacheObject*> objects;
2967 CInode *auth_pin_freeze = NULL;
2968 bool nonblocking = mdr->peer_request->is_nonblocking();
2969 bool fail = false, wouldblock = false, readonly = false;
2970 ref_t<MMDSPeerRequest> reply;
2971
2972 if (mdcache->is_readonly()) {
2973 dout(10) << " read-only FS" << dendl;
2974 readonly = true;
2975 fail = true;
2976 }
2977
2978 if (!fail) {
2979 for (const auto &oi : mdr->peer_request->get_authpins()) {
2980 MDSCacheObject *object = mdcache->get_object(oi);
2981 if (!object) {
2982 dout(10) << " don't have " << oi << dendl;
2983 fail = true;
2984 break;
2985 }
2986
2987 objects.push_back(object);
2988 if (oi == mdr->peer_request->get_authpin_freeze())
2989 auth_pin_freeze = static_cast<CInode*>(object);
2990 }
2991 }
2992
2993 // can we auth pin them?
2994 if (!fail) {
2995 for (const auto& obj : objects) {
2996 if (!obj->is_auth()) {
2997 dout(10) << " not auth for " << *obj << dendl;
2998 fail = true;
2999 break;
3000 }
3001 if (mdr->is_auth_pinned(obj))
3002 continue;
3003 if (!mdr->can_auth_pin(obj)) {
3004 if (nonblocking) {
3005 dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
3006 fail = true;
3007 wouldblock = true;
3008 break;
3009 }
3010 // wait
3011 dout(10) << " waiting for authpinnable on " << *obj << dendl;
3012 obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3013 mdr->drop_local_auth_pins();
3014
3015 mds->locker->notify_freeze_waiter(obj);
3016 goto blocked;
3017 }
3018 }
3019 }
3020
3021 if (!fail) {
3022 /* we froze an auth pin on the wrong inode; unfreeze it */
3023 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
3024 mdr->more()->rename_inode != auth_pin_freeze)
3025 mdr->unfreeze_auth_pin(true);
3026
3027 /* handle_peer_rename_prep() calls freeze_inode() to wait for all other operations
3028 * on the source inode to complete. This happens after all locks for the rename
3029 * operation are acquired. But to acquire locks, we need to auth pin the locks'
3030 * parent objects first. So there is an ABBA deadlock if someone auth pins the source
3031 * inode after locks are acquired and before Server::handle_peer_rename_prep() is called.
3032 * The solution is to freeze the inode and prevent other MDRequests from getting new
3033 * auth pins.
3034 */
3035 if (auth_pin_freeze) {
3036 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
3037 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
3038 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
3039 mds->mdlog->flush();
3040 goto blocked;
3041 }
3042 }
3043 }
3044
3045 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3046
3047 if (fail) {
3048 mdr->drop_local_auth_pins(); // just in case
3049 if (readonly)
3050 reply->mark_error_rofs();
3051 if (wouldblock)
3052 reply->mark_error_wouldblock();
3053 } else {
3054 // auth pin!
3055 for (const auto& obj : objects) {
3056 dout(10) << "auth_pinning " << *obj << dendl;
3057 mdr->auth_pin(obj);
3058 }
3059 // return list of my auth_pins (if any)
3060 for (const auto &p : mdr->object_states) {
3061 if (!p.second.auth_pinned)
3062 continue;
3063 MDSCacheObjectInfo info;
3064 p.first->set_object_info(info);
3065 reply->get_authpins().push_back(info);
3066 if (p.first == (MDSCacheObject*)auth_pin_freeze)
3067 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
3068 }
3069 }
3070
3071 mds->send_message_mds(reply, mdr->peer_to_mds);
3072
3073 // clean up this request
3074 mdr->reset_peer_request();
3075 return;
3076
3077 blocked:
3078 if (mdr->peer_request->should_notify_blocking()) {
3079 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_AUTHPINACK);
3080 reply->mark_req_blocked();
3081 mds->send_message_mds(reply, mdr->peer_to_mds);
3082 mdr->peer_request->clear_notify_blocking();
3083 }
3084 return;
3085 }
3086
3087 void Server::handle_peer_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
3088 {
3089 dout(10) << "handle_peer_auth_pin_ack on " << *mdr << " " << *ack << dendl;
3090 mds_rank_t from = mds_rank_t(ack->get_source().num());
3091
3092 if (ack->is_req_blocked()) {
3093 mdr->disable_lock_cache();
3094 // peer auth pin is blocked, drop locks to avoid deadlock
3095 mds->locker->drop_locks(mdr.get(), nullptr);
3096 return;
3097 }
3098
3099 // added auth pins?
3100 set<MDSCacheObject*> pinned;
3101 for (const auto &oi : ack->get_authpins()) {
3102 MDSCacheObject *object = mdcache->get_object(oi);
3103 ceph_assert(object); // we pinned it
3104 dout(10) << " remote has pinned " << *object << dendl;
3105 mdr->set_remote_auth_pinned(object, from);
3106 if (oi == ack->get_authpin_freeze())
3107 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
3108 pinned.insert(object);
3109 }
3110
3111 // removed frozen auth pin ?
3112 if (mdr->more()->is_remote_frozen_authpin &&
3113 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
3114 auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
3115 ceph_assert(stat_p);
3116 if (stat_p->remote_auth_pinned == from) {
3117 mdr->more()->is_remote_frozen_authpin = false;
3118 }
3119 }
3120
3121 // removed auth pins?
3122 for (auto& p : mdr->object_states) {
3123 if (p.second.remote_auth_pinned == MDS_RANK_NONE)
3124 continue;
3125 MDSCacheObject* object = p.first;
3126 if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
3127 dout(10) << " remote has unpinned " << *object << dendl;
3128 mdr->_clear_remote_auth_pinned(p.second);
3129 }
3130 }
3131
3132 // note peer
3133 mdr->more()->peers.insert(from);
3134
3135 // clear from waiting list
3136 auto ret = mdr->more()->waiting_on_peer.erase(from);
3137 ceph_assert(ret);
3138
3139 if (ack->is_error_rofs()) {
3140 mdr->more()->peer_error = -CEPHFS_EROFS;
3141 } else if (ack->is_error_wouldblock()) {
3142 mdr->more()->peer_error = -CEPHFS_EWOULDBLOCK;
3143 }
3144
3145 // go again?
3146 if (mdr->more()->waiting_on_peer.empty())
3147 mdcache->dispatch_request(mdr);
3148 else
3149 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
3150 }
3151
3152
3153 // ---------------------------------------
3154 // HELPERS
3155
3156
3157 /**
3158 * check whether we are permitted to complete a request
3159 *
3160 * Check whether we have permission to perform the operation specified
3161 * by mask on the given inode, based on the capability in the mdr's
3162 * session.
3163 */
3164 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
3165 {
3166 if (mdr->session) {
3167 int r = mdr->session->check_access(
3168 in, mask,
3169 mdr->client_request->get_caller_uid(),
3170 mdr->client_request->get_caller_gid(),
3171 &mdr->client_request->get_caller_gid_list(),
3172 mdr->client_request->head.args.setattr.uid,
3173 mdr->client_request->head.args.setattr.gid);
3174 if (r < 0) {
3175 respond_to_request(mdr, r);
3176 return false;
3177 }
3178 }
3179 return true;
3180 }
3181
3182 /**
3183 * check whether fragment has reached maximum size
3184 *
3185 */
3186 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *in)
3187 {
3188 const auto size = in->get_frag_size();
3189 if (size >= g_conf()->mds_bal_fragment_size_max) {
3190 dout(10) << "fragment " << *in << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (CEPHFS_ENOSPC)" << dendl;
3191 respond_to_request(mdr, -CEPHFS_ENOSPC);
3192 return false;
3193 }
3194
3195 return true;
3196 }
3197
3198 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3199 {
3200 string straydname;
3201 in->name_stray_dentry(straydname);
3202
3203 CDentry *straydn = mdr->straydn;
3204 if (straydn) {
3205 ceph_assert(straydn->get_name() == straydname);
3206 return straydn;
3207 }
3208 CDir *straydir = mdcache->get_stray_dir(in);
3209
3210 if (!mdr->client_request->is_replay() &&
3211 !check_fragment_space(mdr, straydir))
3212 return nullptr;
3213
3214 straydn = straydir->lookup(straydname);
3215 if (!straydn) {
3216 if (straydir->is_frozen_dir()) {
3217 dout(10) << __func__ << ": " << *straydir << " is frozen, waiting" << dendl;
3218 mds->locker->drop_locks(mdr.get());
3219 mdr->drop_local_auth_pins();
3220 straydir->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3221 return nullptr;
3222 }
3223 straydn = straydir->add_null_dentry(straydname);
3224 straydn->mark_new();
3225 } else {
3226 ceph_assert(straydn->get_projected_linkage()->is_null());
3227 }
3228
3229 straydn->state_set(CDentry::STATE_STRAY);
3230 mdr->straydn = straydn;
3231 mdr->pin(straydn);
3232
3233 return straydn;
3234 }
3235
3236 /** prepare_new_inode
3237 *
3238 * create a new in-memory inode: assign an ino (session prealloc or InoTable), set mode/uid/gid, b/c/m/atime and layout.
3239 */
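// Typical usage (for illustration only), mirroring the create paths below: the
// caller passes the parent dirfrag, any client-requested ino, and a mode with
// the file type bits already set, e.g.
//
//   CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
//                                    req->head.args.open.mode | S_IFREG, &layout);
//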
3240 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3241 const file_layout_t *layout)
3242 {
3243 CInode *in = new CInode(mdcache);
3244 auto _inode = in->_get_inode();
3245
3246 // Server::prepare_force_open_sessions() can re-open session in closing
3247 // state. In that corner case, session's prealloc_inos are being freed.
3248 // To simplify the code, we disallow using/refilling session's prealloc_ino
3249 // while session is opening.
3250 bool allow_prealloc_inos = mdr->session->is_open();
3251
3252 // assign ino
3253 if (allow_prealloc_inos && (mdr->used_prealloc_ino = _inode->ino = mdr->session->take_ino(useino))) {
3254 mds->sessionmap.mark_projected(mdr->session);
3255 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3256 << " (" << mdr->session->info.prealloc_inos.size() << " left)"
3257 << dendl;
3258 } else {
3259 mdr->alloc_ino =
3260 _inode->ino = mds->inotable->project_alloc_id(useino);
3261 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3262 }
3263
3264 if (useino && useino != _inode->ino) {
3265 dout(0) << "WARNING: client specified " << useino << " and i allocated " << _inode->ino << dendl;
3266 mds->clog->error() << mdr->client_request->get_source()
3267 << " specified ino " << useino
3268 << " but mds." << mds->get_nodeid() << " allocated " << _inode->ino;
3269 //ceph_abort(); // just for now.
3270 }
3271
3272 if (allow_prealloc_inos &&
3273 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3274 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3275 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3276 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3277 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3278 mds->sessionmap.mark_projected(mdr->session);
3279 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3280 }
3281
3282 _inode->version = 1;
3283 _inode->xattr_version = 1;
3284 _inode->nlink = 1; // FIXME
3285
3286 _inode->mode = mode;
3287
3288 // FIPS zeroization audit 20191117: this memset is not security related.
3289 memset(&_inode->dir_layout, 0, sizeof(_inode->dir_layout));
3290 if (_inode->is_dir()) {
3291 _inode->dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3292 } else if (layout) {
3293 _inode->layout = *layout;
3294 } else {
3295 _inode->layout = mdcache->default_file_layout;
3296 }
3297
3298 _inode->truncate_size = -1ull; // not truncated, yet!
3299 _inode->truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3300
3301 CInode *diri = dir->get_inode();
3302
3303 dout(10) << oct << " dir mode 0" << diri->get_inode()->mode << " new mode 0" << mode << dec << dendl;
3304
3305 if (diri->get_inode()->mode & S_ISGID) {
3306 dout(10) << " dir is sticky" << dendl;
3307 _inode->gid = diri->get_inode()->gid;
3308 if (S_ISDIR(mode)) {
3309 dout(10) << " new dir also sticky" << dendl;
3310 _inode->mode |= S_ISGID;
3311 }
3312 } else
3313 _inode->gid = mdr->client_request->get_caller_gid();
3314
3315 _inode->uid = mdr->client_request->get_caller_uid();
3316
3317 _inode->btime = _inode->ctime = _inode->mtime = _inode->atime =
3318 mdr->get_op_stamp();
3319
3320 _inode->change_attr = 0;
3321
3322 const cref_t<MClientRequest> &req = mdr->client_request;
3323 if (req->get_data().length()) {
3324 auto p = req->get_data().cbegin();
3325
3326 // xattrs on new inode?
3327 auto _xattrs = CInode::allocate_xattr_map();
3328 decode_noshare(*_xattrs, p);
3329 dout(10) << "prepare_new_inode setting xattrs " << *_xattrs << dendl;
3330 in->reset_xattrs(std::move(_xattrs));
3331 }
3332
3333 if (!mds->mdsmap->get_inline_data_enabled() ||
3334 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3335 _inode->inline_data.version = CEPH_INLINE_NONE;
3336
3337 mdcache->add_inode(in); // add
3338 dout(10) << "prepare_new_inode " << *in << dendl;
3339 return in;
3340 }
3341
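// journal_allocated_inos records this request's ino allocations (alloc_ino,
// prealloc_inos, used_prealloc_ino) plus the projected sessionmap/inotable
// versions in the EMetaBlob; apply_allocated_inos (below) commits the same
// allocations to the InoTable and the session, typically once the
// corresponding log event has been journaled.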
3342 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3343 {
3344 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3345 << " inotablev " << mds->inotable->get_projected_version()
3346 << dendl;
3347 blob->set_ino_alloc(mdr->alloc_ino,
3348 mdr->used_prealloc_ino,
3349 mdr->prealloc_inos,
3350 mdr->client_request->get_source(),
3351 mds->sessionmap.get_projected(),
3352 mds->inotable->get_projected_version());
3353 }
3354
3355 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3356 {
3357 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3358 << " / " << mdr->prealloc_inos
3359 << " / " << mdr->used_prealloc_ino << dendl;
3360
3361 if (mdr->alloc_ino) {
3362 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3363 }
3364 if (mdr->prealloc_inos.size()) {
3365 ceph_assert(session);
3366 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3367 session->free_prealloc_inos.insert(mdr->prealloc_inos);
3368 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3369 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
3370 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3371 }
3372 if (mdr->used_prealloc_ino) {
3373 ceph_assert(session);
3374 session->info.prealloc_inos.erase(mdr->used_prealloc_ino);
3375 mds->sessionmap.mark_dirty(session);
3376 }
3377 }
3378
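// Retry context used after an ESTALE path traversal: if find_ino_peers() also
// fails with ESTALE, reply with the error; otherwise re-dispatch the request.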
3379 class C_MDS_TryFindInode : public ServerContext {
3380 MDRequestRef mdr;
3381 public:
3382 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3383 void finish(int r) override {
3384 if (r == -CEPHFS_ESTALE) // :( find_ino_peers failed
3385 server->respond_to_request(mdr, r);
3386 else
3387 server->dispatch_client_request(mdr);
3388 }
3389 };
3390
3391 /* If this returns null, the request has been handled
3392 * as appropriate: forwarded on, or the client's been replied to */
3393 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
3394 bool want_auth,
3395 bool no_want_auth)
3396 {
3397 const filepath& refpath = mdr->get_filepath();
3398 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3399
3400 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3401 return mdr->in[0];
3402
3403 // traverse
3404 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3405 int flags = 0;
3406 if (refpath.is_last_snap()) {
3407 if (!no_want_auth)
3408 want_auth = true;
3409 } else {
3410 if (!no_want_auth && forward_all_requests_to_auth)
3411 want_auth = true;
3412 flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
3413 }
3414 if (want_auth)
3415 flags |= MDS_TRAVERSE_WANT_AUTH;
3416 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
3417 if (r > 0)
3418 return nullptr; // delayed
3419 if (r < 0) { // error
3420 if (r == -CEPHFS_ENOENT && !mdr->dn[0].empty()) {
3421 if (mdr->client_request &&
3422 mdr->client_request->get_dentry_wanted())
3423 mdr->tracedn = mdr->dn[0].back();
3424 respond_to_request(mdr, r);
3425 } else if (r == -CEPHFS_ESTALE) {
3426 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3427 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
3428 mdcache->find_ino_peers(refpath.get_ino(), c);
3429 } else {
3430 dout(10) << "FAIL on error " << r << dendl;
3431 respond_to_request(mdr, r);
3432 }
3433 return nullptr;
3434 }
3435 CInode *ref = mdr->in[0];
3436 dout(10) << "ref is " << *ref << dendl;
3437
3438 if (want_auth) {
3439 // auth_pin?
3440 // do NOT proceed if freezing, as cap release may defer in that case, and
3441 // we could deadlock when we try to lock @ref.
3442 // if we're already auth_pinned, continue; the release has already been processed.
3443 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3444 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3445 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3446 ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
3447 if (mdr->is_any_remote_auth_pin())
3448 mds->locker->notify_freeze_waiter(ref);
3449 return nullptr;
3450 }
3451 mdr->auth_pin(ref);
3452 }
3453
3454 // set and pin ref
3455 mdr->pin(ref);
3456 return ref;
3457 }
3458
3459
3460 /** rdlock_path_xlock_dentry
3461 * traverse path to the directory that could/would contain dentry.
3462 * make sure i am auth for that dentry, forward as necessary.
3463 * create null dentry in place (or use existing if okexist).
3464 * get rdlocks on traversed dentries, xlock on new dentry.
3465 */
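// Typical usage (for illustration only), as in handle_client_openc() below:
//
//   bool excl = req->head.args.open.flags & CEPH_O_EXCL;
//   CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
//   if (!dn)
//     return;   // forwarded, queued, or already replied to
//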
3466 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
3467 bool create, bool okexist, bool want_layout)
3468 {
3469 const filepath& refpath = mdr->get_filepath();
3470 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3471
3472 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3473 return mdr->dn[0].back();
3474
3475 // figure parent dir vs dname
3476 if (refpath.depth() == 0) {
3477 dout(7) << "invalid path (zero length)" << dendl;
3478 respond_to_request(mdr, -CEPHFS_EINVAL);
3479 return nullptr;
3480 }
3481
3482 if (refpath.is_last_snap()) {
3483 respond_to_request(mdr, -CEPHFS_EROFS);
3484 return nullptr;
3485 }
3486
3487 if (refpath.is_last_dot_or_dotdot()) {
3488 dout(7) << "invalid path (last dot or dot_dot)" << dendl;
3489 if (create)
3490 respond_to_request(mdr, -CEPHFS_EEXIST);
3491 else
3492 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
3493 return nullptr;
3494 }
3495
3496 // traverse to parent dir
3497 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3498 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
3499 MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
3500 MDS_TRAVERSE_WANT_AUTH;
3501 if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
3502 flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
3503 if (create)
3504 flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
3505 if (want_layout)
3506 flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
3507 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3508 if (r > 0)
3509 return nullptr; // delayed
3510 if (r < 0) {
3511 if (r == -CEPHFS_ESTALE) {
3512 dout(10) << "FAIL on CEPHFS_ESTALE but attempting recovery" << dendl;
3513 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3514 return nullptr;
3515 }
3516 respond_to_request(mdr, r);
3517 return nullptr;
3518 }
3519
3520 CDentry *dn = mdr->dn[0].back();
3521 CDir *dir = dn->get_dir();
3522 CInode *diri = dir->get_inode();
3523
3524 if (!mdr->reqid.name.is_mds()) {
3525 if (diri->is_system() && !diri->is_root()) {
3526 respond_to_request(mdr, -CEPHFS_EROFS);
3527 return nullptr;
3528 }
3529 }
3530
3531 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3532 respond_to_request(mdr, -CEPHFS_ENOENT);
3533 return nullptr;
3534 }
3535
3536 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3537 if (dnl->is_null()) {
3538 if (!create && okexist) {
3539 respond_to_request(mdr, -CEPHFS_ENOENT);
3540 return nullptr;
3541 }
3542
3543 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3544 dn->first = std::max(dn->first, next_snap);
3545 } else {
3546 if (!okexist) {
3547 respond_to_request(mdr, -CEPHFS_EEXIST);
3548 return nullptr;
3549 }
3550 mdr->in[0] = dnl->get_inode();
3551 }
3552
3553 return dn;
3554 }
3555
3556 /** rdlock_two_paths_xlock_destdn
3557 * traverse two paths and lock the two paths in proper order.
3558 * The order of taking locks is:
3559 * 1. Lock directory inodes or dentries according to which trees they
3560 * are under. Lock objects under fs root before objects under mdsdir.
3561 * 2. Lock directory inodes or dentries according to their depth, in
3562 * ascending order.
3563 * 3. Lock directory inodes or dentries according to inode numbers or
3564 * dentries' parent inode numbers, in ascending order.
3565 * 4. Lock dentries in the same directory in order of their keys.
3566 * 5. Lock non-directory inodes according to inode numbers, in ascending
3567 * order.
3568 */
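// For illustration: with both dentries in the same directory, rule 4 applies,
// so the code below xlocks the two dentries in key (name) order -- renaming
// "b" onto "a" takes a's xlock before b's. Across directories, compare_paths()
// and the ino comparison decide which parent dir/dentry pair is locked first
// (rules 2 and 3).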
3569 std::pair<CDentry*, CDentry*>
3570 Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
3571 {
3572
3573 const filepath& refpath = mdr->get_filepath();
3574 const filepath& refpath2 = mdr->get_filepath2();
3575
3576 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
3577
3578 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3579 return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
3580
3581 if (refpath.depth() != 1 || refpath2.depth() != 1) {
3582 respond_to_request(mdr, -CEPHFS_EINVAL);
3583 return std::pair<CDentry*, CDentry*>(nullptr, nullptr);
3584 }
3585
3586 if (refpath.is_last_snap() || refpath2.is_last_snap()) {
3587 respond_to_request(mdr, -CEPHFS_EROFS);
3588 return std::make_pair(nullptr, nullptr);
3589 }
3590
3591 // traverse to parent dir
3592 CF_MDS_RetryRequestFactory cf(mdcache, mdr, true);
3593 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
3594 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3595 if (r != 0) {
3596 if (r == -CEPHFS_ESTALE) {
3597 dout(10) << "CEPHFS_ESTALE on path, attempting recovery" << dendl;
3598 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3599 } else if (r < 0) {
3600 respond_to_request(mdr, r);
3601 }
3602 return std::make_pair(nullptr, nullptr);
3603 }
3604
3605 flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
3606 r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
3607 if (r != 0) {
3608 if (r == -CEPHFS_ESTALE) {
3609 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
3610 mdcache->find_ino_peers(refpath2.get_ino(), new C_MDS_TryFindInode(this, mdr));
3611 } else if (r < 0) {
3612 respond_to_request(mdr, r);
3613 }
3614 return std::make_pair(nullptr, nullptr);
3615 }
3616
3617 CDentry *srcdn = mdr->dn[1].back();
3618 CDir *srcdir = srcdn->get_dir();
3619 CDentry *destdn = mdr->dn[0].back();
3620 CDir *destdir = destdn->get_dir();
3621
3622 if (!mdr->reqid.name.is_mds()) {
3623 if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
3624 (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
3625 respond_to_request(mdr, -CEPHFS_EROFS);
3626 return std::make_pair(nullptr, nullptr);
3627 }
3628 }
3629
3630 if (!destdir->get_inode()->is_base() &&
3631 destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
3632 respond_to_request(mdr, -CEPHFS_ENOENT);
3633 return std::make_pair(nullptr, nullptr);
3634 }
3635
3636 MutationImpl::LockOpVec lov;
3637 if (srcdir->get_inode() == destdir->get_inode()) {
3638 lov.add_wrlock(&destdir->inode->filelock);
3639 lov.add_wrlock(&destdir->inode->nestlock);
3640 if (xlock_srcdn && srcdir != destdir) {
3641 mds_rank_t srcdir_auth = srcdir->authority().first;
3642 if (srcdir_auth != mds->get_nodeid()) {
3643 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3644 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3645 }
3646 }
3647
3648 if (srcdn->get_name() > destdn->get_name())
3649 lov.add_xlock(&destdn->lock);
3650
3651 if (xlock_srcdn)
3652 lov.add_xlock(&srcdn->lock);
3653 else
3654 lov.add_rdlock(&srcdn->lock);
3655
3656 if (srcdn->get_name() < destdn->get_name())
3657 lov.add_xlock(&destdn->lock);
3658 } else {
3659 int cmp = mdr->compare_paths();
3660 bool lock_destdir_first =
3661 (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
3662
3663 if (lock_destdir_first) {
3664 lov.add_wrlock(&destdir->inode->filelock);
3665 lov.add_wrlock(&destdir->inode->nestlock);
3666 lov.add_xlock(&destdn->lock);
3667 }
3668
3669 if (xlock_srcdn) {
3670 mds_rank_t srcdir_auth = srcdir->authority().first;
3671 if (srcdir_auth == mds->get_nodeid()) {
3672 lov.add_wrlock(&srcdir->inode->filelock);
3673 lov.add_wrlock(&srcdir->inode->nestlock);
3674 } else {
3675 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3676 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3677 }
3678 lov.add_xlock(&srcdn->lock);
3679 } else {
3680 lov.add_rdlock(&srcdn->lock);
3681 }
3682
3683 if (!lock_destdir_first) {
3684 lov.add_wrlock(&destdir->inode->filelock);
3685 lov.add_wrlock(&destdir->inode->nestlock);
3686 lov.add_xlock(&destdn->lock);
3687 }
3688 }
3689
3690 CInode *auth_pin_freeze = nullptr;
3691 // XXX any better way to do this?
3692 if (xlock_srcdn && !srcdn->is_auth()) {
3693 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
3694 auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
3695 }
3696 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
3697 return std::make_pair(nullptr, nullptr);
3698
3699 if (srcdn->get_projected_linkage()->is_null()) {
3700 respond_to_request(mdr, -CEPHFS_ENOENT);
3701 return std::make_pair(nullptr, nullptr);
3702 }
3703
3704 if (destdn->get_projected_linkage()->is_null()) {
3705 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3706 destdn->first = std::max(destdn->first, next_snap);
3707 }
3708
3709 mdr->locking_state |= MutationImpl::PATH_LOCKED;
3710
3711 return std::make_pair(destdn, srcdn);
3712 }
3713
3714 /**
3715 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3716 *
3717 * @param diri base inode
3718 * @param fg the exact frag we want
3719 * @param mdr request
3720 * @returns the pointer, or NULL if it had to be delayed (but mdr is taken care of)
3721 */
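// Typical usage (for illustration only), as in handle_client_readdir() below:
//
//   CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
//   if (!dir)
//     return;   // request was forwarded or queued on a waiter
//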
3722 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3723 {
3724 CDir *dir = diri->get_dirfrag(fg);
3725
3726 if (dir) {
3727 // am i auth for the dirfrag?
3728 if (!dir->is_auth()) {
3729 mds_rank_t auth = dir->authority().first;
3730 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3731 << ", fw to mds." << auth << dendl;
3732 mdcache->request_forward(mdr, auth);
3733 return nullptr;
3734 }
3735 } else {
3736 // not open and inode not mine?
3737 if (!diri->is_auth()) {
3738 mds_rank_t inauth = diri->authority().first;
3739 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3740 mdcache->request_forward(mdr, inauth);
3741 return nullptr;
3742 }
3743
3744 // not open and inode frozen?
3745 if (diri->is_frozen()) {
3746 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3747 ceph_assert(diri->get_parent_dir());
3748 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3749 return nullptr;
3750 }
3751
3752 // not open yet, and we are auth: open (or create in memory) the dirfrag
3753 dir = diri->get_or_open_dirfrag(mdcache, fg);
3754 }
3755
3756 return dir;
3757 }
3758
3759
3760 // ===============================================================================
3761 // STAT
3762
3763 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3764 {
3765 const cref_t<MClientRequest> &req = mdr->client_request;
3766
3767 if (req->get_filepath().depth() == 0 && is_lookup) {
3768 // refpath can't be empty for lookup but it can for
3769 // getattr (we do getattr with empty refpath for mount of '/')
3770 respond_to_request(mdr, -CEPHFS_EINVAL);
3771 return;
3772 }
3773
3774 bool want_auth = false;
3775 int mask = req->head.args.getattr.mask;
3776 if (mask & CEPH_STAT_RSTAT)
3777 want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
3778
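// Batch handling: identical getattr/lookup requests (same target, same mask)
// are queued behind a single "batch head"; only the head takes locks and does
// the work, and the queued requests are answered when it responds (see the
// batch_ops maps used below).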
3779 if (!mdr->is_batch_head() && mdr->can_batch()) {
3780 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
3781 int r = mdcache->path_traverse(mdr, cf, mdr->get_filepath(),
3782 (want_auth ? MDS_TRAVERSE_WANT_AUTH : 0),
3783 &mdr->dn[0], &mdr->in[0]);
3784 if (r > 0)
3785 return; // delayed
3786
3787 if (r < 0) {
3788 // fall-thru. let rdlock_path_pin_ref() check again.
3789 } else if (is_lookup) {
3790 CDentry* dn = mdr->dn[0].back();
3791 mdr->pin(dn);
3792 auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3793 if (em.second) {
3794 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3795 } else {
3796 dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3797 em.first->second->add_request(mdr);
3798 return;
3799 }
3800 } else {
3801 CInode *in = mdr->in[0];
3802 mdr->pin(in);
3803 auto em = in->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3804 if (em.second) {
3805 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr);
3806 } else {
3807 dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3808 em.first->second->add_request(mdr);
3809 return;
3810 }
3811 }
3812 }
3813
3814 CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
3815 if (!ref)
3816 return;
3817
3818 mdr->getattr_caps = mask;
3819
3820 /*
3821 * if client currently holds the EXCL cap on a field, do not rdlock
3822 * it; client's stat() will result in valid info if _either_ EXCL
3823 * cap is held or MDS rdlocks and reads the value here.
3824 *
3825 * handling this case here is easier than weakening rdlock
3826 * semantics... that would cause problems elsewhere.
3827 */
3828 client_t client = mdr->get_client();
3829 int issued = 0;
3830 Capability *cap = ref->get_client_cap(client);
3831 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3832 mdr->snapid <= cap->client_follows))
3833 issued = cap->issued();
3834
3835 // FIXME
3836 MutationImpl::LockOpVec lov;
3837 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3838 lov.add_rdlock(&ref->linklock);
3839 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3840 lov.add_rdlock(&ref->authlock);
3841 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3842 lov.add_rdlock(&ref->xattrlock);
3843 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3844 // Don't wait on unstable filelock if client is allowed to read file size.
3845 // This can reduce the response time of getattr in the case that multiple
3846 // clients do stat(2) and there are writers.
3847 // The downside of this optimization is that mds may not issue Fs caps along
3848 // with getattr reply. Client may need to send more getattr requests.
3849 if (mdr->is_rdlocked(&ref->filelock)) {
3850 lov.add_rdlock(&ref->filelock);
3851 } else if (ref->filelock.is_stable() ||
3852 ref->filelock.get_num_wrlocks() > 0 ||
3853 !ref->filelock.can_read(mdr->get_client())) {
3854 lov.add_rdlock(&ref->filelock);
3855 mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
3856 }
3857 }
3858
3859 if (!mds->locker->acquire_locks(mdr, lov))
3860 return;
3861
3862 if (!check_access(mdr, ref, MAY_READ))
3863 return;
3864
3865 utime_t now = ceph_clock_now();
3866 mdr->set_mds_stamp(now);
3867
3868 // note which caps are requested, so we return at least a snapshot
3869 // value for them. (currently this matters for xattrs and inline data)
3870 mdr->getattr_caps = mask;
3871
3872 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
3873
3874 // reply
3875 dout(10) << "reply to stat on " << *req << dendl;
3876 mdr->tracei = ref;
3877 if (is_lookup)
3878 mdr->tracedn = mdr->dn[0].back();
3879 respond_to_request(mdr, 0);
3880 }
3881
3882 struct C_MDS_LookupIno2 : public ServerContext {
3883 MDRequestRef mdr;
3884 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3885 void finish(int r) override {
3886 server->_lookup_ino_2(mdr, r);
3887 }
3888 };
3889
3890 /*
3891 * filepath: ino
3892 */
3893 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3894 bool want_parent, bool want_dentry)
3895 {
3896 const cref_t<MClientRequest> &req = mdr->client_request;
3897
3898 if ((uint64_t)req->head.args.lookupino.snapid > 0)
3899 return _lookup_snap_ino(mdr);
3900
3901 inodeno_t ino = req->get_filepath().get_ino();
3902 CInode *in = mdcache->get_inode(ino);
3903 if (in && in->state_test(CInode::STATE_PURGING)) {
3904 respond_to_request(mdr, -CEPHFS_ESTALE);
3905 return;
3906 }
3907 if (!in) {
3908 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3909 return;
3910 }
3911
3912 // check with a zero mask (neither read nor write); this still enforces
3913 // the session's path restriction.
3914 if (!check_access(mdr, in, 0))
3915 return;
3916
3917 CDentry *dn = in->get_projected_parent_dn();
3918 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3919
3920 MutationImpl::LockOpVec lov;
3921 if (dn && (want_parent || want_dentry)) {
3922 mdr->pin(dn);
3923 lov.add_rdlock(&dn->lock);
3924 }
3925
3926 unsigned mask = req->head.args.lookupino.mask;
3927 if (mask) {
3928 Capability *cap = in->get_client_cap(mdr->get_client());
3929 int issued = 0;
3930 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3931 issued = cap->issued();
3932 // FIXME
3933 // permission bits, ACL/security xattrs
3934 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3935 lov.add_rdlock(&in->authlock);
3936 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3937 lov.add_rdlock(&in->xattrlock);
3938
3939 mdr->getattr_caps = mask;
3940 }
3941
3942 if (!lov.empty()) {
3943 if (!mds->locker->acquire_locks(mdr, lov))
3944 return;
3945
3946 if (diri != NULL) {
3947 // need read access to directory inode
3948 if (!check_access(mdr, diri, MAY_READ))
3949 return;
3950 }
3951 }
3952
3953 if (want_parent) {
3954 if (in->is_base()) {
3955 respond_to_request(mdr, -CEPHFS_EINVAL);
3956 return;
3957 }
3958 if (!diri || diri->is_stray()) {
3959 respond_to_request(mdr, -CEPHFS_ESTALE);
3960 return;
3961 }
3962 dout(10) << "reply to lookup_parent " << *in << dendl;
3963 mdr->tracei = diri;
3964 respond_to_request(mdr, 0);
3965 } else {
3966 if (want_dentry) {
3967 inodeno_t dirino = req->get_filepath2().get_ino();
3968 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3969 respond_to_request(mdr, -CEPHFS_ENOENT);
3970 return;
3971 }
3972 dout(10) << "reply to lookup_name " << *in << dendl;
3973 } else
3974 dout(10) << "reply to lookup_ino " << *in << dendl;
3975
3976 mdr->tracei = in;
3977 if (want_dentry)
3978 mdr->tracedn = dn;
3979 respond_to_request(mdr, 0);
3980 }
3981 }
3982
3983 void Server::_lookup_snap_ino(MDRequestRef& mdr)
3984 {
3985 const cref_t<MClientRequest> &req = mdr->client_request;
3986
3987 vinodeno_t vino;
3988 vino.ino = req->get_filepath().get_ino();
3989 vino.snapid = (__u64)req->head.args.lookupino.snapid;
3990 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
3991 __u32 hash = req->head.args.lookupino.hash;
3992
3993 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
3994
3995 CInode *in = mdcache->lookup_snap_inode(vino);
3996 if (!in) {
3997 in = mdcache->get_inode(vino.ino);
3998 if (in) {
3999 if (in->state_test(CInode::STATE_PURGING) ||
4000 !in->has_snap_data(vino.snapid)) {
4001 if (in->is_dir() || !parent_ino) {
4002 respond_to_request(mdr, -CEPHFS_ESTALE);
4003 return;
4004 }
4005 in = NULL;
4006 }
4007 }
4008 }
4009
4010 if (in) {
4011 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
4012 mdr->snapid = vino.snapid;
4013 mdr->tracei = in;
4014 respond_to_request(mdr, 0);
4015 return;
4016 }
4017
4018 CInode *diri = NULL;
4019 if (parent_ino) {
4020 diri = mdcache->get_inode(parent_ino);
4021 if (!diri) {
4022 mdcache->open_ino(parent_ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
4023 return;
4024 }
4025
4026 if (!diri->is_dir()) {
4027 respond_to_request(mdr, -CEPHFS_EINVAL);
4028 return;
4029 }
4030
4031 MutationImpl::LockOpVec lov;
4032 lov.add_rdlock(&diri->dirfragtreelock);
4033 if (!mds->locker->acquire_locks(mdr, lov))
4034 return;
4035
4036 frag_t frag = diri->dirfragtree[hash];
4037 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
4038 if (!dir)
4039 return;
4040
4041 if (!dir->is_complete()) {
4042 if (dir->is_frozen()) {
4043 mds->locker->drop_locks(mdr.get());
4044 mdr->drop_local_auth_pins();
4045 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4046 return;
4047 }
4048 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4049 return;
4050 }
4051
4052 respond_to_request(mdr, -CEPHFS_ESTALE);
4053 } else {
4054 mdcache->open_ino(vino.ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
4055 }
4056 }
4057
4058 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4059 {
4060 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4061 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4062
4063 // `r` is a rank if >=0, else an error code
4064 if (r >= 0) {
4065 mds_rank_t dest_rank(r);
4066 if (dest_rank == mds->get_nodeid())
4067 dispatch_client_request(mdr);
4068 else
4069 mdcache->request_forward(mdr, dest_rank);
4070 return;
4071 }
4072
4073 // give up
4074 if (r == -CEPHFS_ENOENT || r == -CEPHFS_ENODATA)
4075 r = -CEPHFS_ESTALE;
4076 respond_to_request(mdr, r);
4077 }
4078
4079
4080 /* This function takes responsibility for the passed mdr */
4081 void Server::handle_client_open(MDRequestRef& mdr)
4082 {
4083 const cref_t<MClientRequest> &req = mdr->client_request;
4084 dout(7) << "open on " << req->get_filepath() << dendl;
4085
4086 int flags = req->head.args.open.flags;
4087 int cmode = ceph_flags_to_mode(flags);
4088 if (cmode < 0) {
4089 respond_to_request(mdr, -CEPHFS_EINVAL);
4090 return;
4091 }
4092
4093 bool need_auth = !file_mode_is_readonly(cmode) ||
4094 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
4095
4096 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4097 dout(7) << "read-only FS" << dendl;
4098 respond_to_request(mdr, -CEPHFS_EROFS);
4099 return;
4100 }
4101
4102 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
4103 if (!cur)
4104 return;
4105
4106 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
4107 ceph_assert(!need_auth);
4108 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4109 cur = rdlock_path_pin_ref(mdr, true); // re-traverse with want_auth; reuse the outer cur rather than shadowing it
4110 if (!cur)
4111 return;
4112 }
4113
4114 if (!cur->is_file()) {
4115 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4116 cmode = CEPH_FILE_MODE_PIN;
4117 // if the inode is a symlink and the client wants to follow it, ignore the O_TRUNC flag.
4118 if (cur->is_symlink() && !(flags & CEPH_O_NOFOLLOW))
4119 flags &= ~CEPH_O_TRUNC;
4120 }
4121
4122 dout(10) << "open flags = " << flags
4123 << ", filemode = " << cmode
4124 << ", need_auth = " << need_auth
4125 << dendl;
4126
4127 // regular file?
4128 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4129 dout(7) << "not a file or dir " << *cur << dendl;
4130 respond_to_request(mdr, -CEPHFS_ENXIO); // FIXME what error do we want?
4131 return;
4132 }*/
4133 if ((flags & CEPH_O_DIRECTORY) && !cur->is_dir() && !cur->is_symlink()) {
4134 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
4135 respond_to_request(mdr, -CEPHFS_EINVAL);
4136 return;
4137 }
4138
4139 if ((flags & CEPH_O_TRUNC) && !cur->is_file()) {
4140 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
4141 // we should return -CEPHFS_EISDIR for directory, return -CEPHFS_EINVAL for other non-regular
4142 respond_to_request(mdr, cur->is_dir() ? -CEPHFS_EISDIR : -CEPHFS_EINVAL);
4143 return;
4144 }
4145
4146 if (cur->get_inode()->inline_data.version != CEPH_INLINE_NONE &&
4147 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4148 dout(7) << "old client cannot open inline data file " << *cur << dendl;
4149 respond_to_request(mdr, -CEPHFS_EPERM);
4150 return;
4151 }
4152
4153 // snapped data is read only
4154 if (mdr->snapid != CEPH_NOSNAP &&
4155 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4156 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
4157 respond_to_request(mdr, -CEPHFS_EROFS);
4158 return;
4159 }
4160
4161 MutationImpl::LockOpVec lov;
4162
4163 unsigned mask = req->head.args.open.mask;
4164 if (mask) {
4165 Capability *cap = cur->get_client_cap(mdr->get_client());
4166 int issued = 0;
4167 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4168 issued = cap->issued();
4169 // permission bits, ACL/security xattrs
4170 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4171 lov.add_rdlock(&cur->authlock);
4172 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4173 lov.add_rdlock(&cur->xattrlock);
4174
4175 mdr->getattr_caps = mask;
4176 }
4177
4178 // O_TRUNC
4179 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
4180 ceph_assert(cur->is_auth());
4181
4182 lov.add_xlock(&cur->filelock);
4183 if (!mds->locker->acquire_locks(mdr, lov))
4184 return;
4185
4186 if (!check_access(mdr, cur, MAY_WRITE))
4187 return;
4188
4189 // wait for pending truncate?
4190 const auto& pi = cur->get_projected_inode();
4191 if (pi->is_truncating()) {
4192 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4193 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4194 mds->locker->drop_locks(mdr.get());
4195 mdr->drop_local_auth_pins();
4196 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4197 return;
4198 }
4199
4200 do_open_truncate(mdr, cmode);
4201 return;
4202 }
4203
4204 // sync filelock if snapped.
4205 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4206 // and that data itself is flushed so that we can read the snapped data off disk.
4207 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
4208 lov.add_rdlock(&cur->filelock);
4209 }
4210
4211 if (!mds->locker->acquire_locks(mdr, lov))
4212 return;
4213
4214 mask = MAY_READ;
4215 if (cmode & CEPH_FILE_MODE_WR)
4216 mask |= MAY_WRITE;
4217 if (!check_access(mdr, cur, mask))
4218 return;
4219
4220 utime_t now = ceph_clock_now();
4221 mdr->set_mds_stamp(now);
4222
4223 if (cur->is_file() || cur->is_dir()) {
4224 if (mdr->snapid == CEPH_NOSNAP) {
4225 // register new cap
4226 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
4227 if (cap)
4228 dout(12) << "open issued caps " << ccap_string(cap->pending())
4229 << " for " << req->get_source()
4230 << " on " << *cur << dendl;
4231 } else {
4232 int caps = ceph_caps_for_mode(cmode);
4233 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4234 << " for " << req->get_source()
4235 << " snapid " << mdr->snapid
4236 << " on " << *cur << dendl;
4237 mdr->snap_caps = caps;
4238 }
4239 }
4240
4241 // increase max_size?
4242 if (cmode & CEPH_FILE_MODE_WR)
4243 mds->locker->check_inode_max_size(cur);
4244
4245 // make sure this inode gets into the journal
4246 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
4247 mdcache->open_file_table.should_log_open(cur)) {
4248 EOpen *le = new EOpen(mds->mdlog);
4249 mdlog->start_entry(le);
4250 le->add_clean_inode(cur);
4251 mdlog->submit_entry(le);
4252 }
4253
4254 // hit pop
4255 if (cmode & CEPH_FILE_MODE_WR)
4256 mds->balancer->hit_inode(cur, META_POP_IWR);
4257 else
4258 mds->balancer->hit_inode(cur, META_POP_IRD,
4259 mdr->client_request->get_source().num());
4260
4261 CDentry *dn = 0;
4262 if (req->get_dentry_wanted()) {
4263 ceph_assert(mdr->dn[0].size());
4264 dn = mdr->dn[0].back();
4265 }
4266
4267 mdr->tracei = cur;
4268 mdr->tracedn = dn;
4269 respond_to_request(mdr, 0);
4270 }
4271
4272 class C_MDS_openc_finish : public ServerLogContext {
4273 CDentry *dn;
4274 CInode *newi;
4275 public:
4276 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4277 ServerLogContext(s, r), dn(d), newi(ni) {}
4278 void finish(int r) override {
4279 ceph_assert(r == 0);
4280
4281 dn->pop_projected_linkage();
4282
4283 // dirty inode, dn, dir
4284 newi->mark_dirty(mdr->ls);
4285 newi->mark_dirty_parent(mdr->ls, true);
4286
4287 mdr->apply();
4288
4289 get_mds()->locker->share_inode_max_size(newi);
4290
4291 MDRequestRef null_ref;
4292 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4293
4294 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4295
4296 server->respond_to_request(mdr, 0);
4297
4298 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4299 }
4300 };
4301
4302 /* This function takes responsibility for the passed mdr */
4303 void Server::handle_client_openc(MDRequestRef& mdr)
4304 {
4305 const cref_t<MClientRequest> &req = mdr->client_request;
4306 client_t client = mdr->get_client();
4307
4308 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4309
4310 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4311 if (cmode < 0) {
4312 respond_to_request(mdr, -CEPHFS_EINVAL);
4313 return;
4314 }
4315
4316 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4317 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
4318 if (!dn)
4319 return;
4320
4321 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4322 if (!excl && !dnl->is_null()) {
4323 // it existed.
4324 mds->locker->xlock_downgrade(&dn->lock, mdr.get());
4325
4326 MutationImpl::LockOpVec lov;
4327 lov.add_rdlock(&dnl->get_inode()->snaplock);
4328 if (!mds->locker->acquire_locks(mdr, lov))
4329 return;
4330
4331 handle_client_open(mdr);
4332 return;
4333 }
4334
4335 ceph_assert(dnl->is_null());
4336
4337 if (req->get_alternate_name().size() > alternate_name_max) {
4338 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
4339 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
4340 return;
4341 }
4342 dn->set_alternate_name(req->get_alternate_name());
4343
4344 // set layout
4345 file_layout_t layout;
4346 if (mdr->dir_layout != file_layout_t())
4347 layout = mdr->dir_layout;
4348 else
4349 layout = mdcache->default_file_layout;
4350
4351 // What kind of client caps are required to complete this operation
4352 uint64_t access = MAY_WRITE;
4353
4354 const auto default_layout = layout;
4355
4356 // fill in any special params from client
4357 if (req->head.args.open.stripe_unit)
4358 layout.stripe_unit = req->head.args.open.stripe_unit;
4359 if (req->head.args.open.stripe_count)
4360 layout.stripe_count = req->head.args.open.stripe_count;
4361 if (req->head.args.open.object_size)
4362 layout.object_size = req->head.args.open.object_size;
4363 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4364 (__s32)req->head.args.open.pool >= 0) {
4365 layout.pool_id = req->head.args.open.pool;
4366
4367 // make sure we have as new a map as the client
4368 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4369 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4370 return;
4371 }
4372 }
4373
4374 // If client doesn't have capability to modify layout pools, then
4375 // only permit this request if the requested pool matches what the
4376 // file would have inherited anyway from its parent.
4377 if (default_layout != layout) {
4378 access |= MAY_SET_VXATTR;
4379 }
4380
4381 if (!layout.is_valid()) {
4382 dout(10) << " invalid initial file layout" << dendl;
4383 respond_to_request(mdr, -CEPHFS_EINVAL);
4384 return;
4385 }
4386 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4387 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4388 respond_to_request(mdr, -CEPHFS_EINVAL);
4389 return;
4390 }
4391
4392 // created null dn.
4393 CDir *dir = dn->get_dir();
4394 CInode *diri = dir->get_inode();
4395 if (!check_access(mdr, diri, access))
4396 return;
4397 if (!check_fragment_space(mdr, dir))
4398 return;
4399
4400 if (mdr->dn[0].size() == 1)
4401 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
4402
4403 // create inode.
4404 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4405 req->head.args.open.mode | S_IFREG, &layout);
4406 ceph_assert(newi);
4407
4408 // it's a file.
4409 dn->push_projected_linkage(newi);
4410
4411 auto _inode = newi->_get_inode();
4412 _inode->version = dn->pre_dirty();
4413 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4414 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
4415 _inode->update_backtrace();
4416 _inode->rstat.rfiles = 1;
4417 _inode->accounted_rstat = _inode->rstat;
4418
4419 SnapRealm *realm = diri->find_snaprealm();
4420 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4421 ceph_assert(follows >= realm->get_newest_seq());
4422
4423 ceph_assert(dn->first == follows+1);
4424 newi->first = dn->first;
4425
4426 // do the open
4427 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
4428 newi->authlock.set_state(LOCK_EXCL);
4429 newi->xattrlock.set_state(LOCK_EXCL);
4430
4431 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4432 _inode->client_ranges[client].range.first = 0;
4433 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
4434 _inode->client_ranges[client].follows = follows;
4435 newi->mark_clientwriteable();
4436 cap->mark_clientwriteable();
4437 }
4438
4439 // prepare finisher
4440 mdr->ls = mdlog->get_current_segment();
4441 EUpdate *le = new EUpdate(mdlog, "openc");
4442 mdlog->start_entry(le);
4443 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4444 journal_allocated_inos(mdr, &le->metablob);
4445 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4446 le->metablob.add_primary_dentry(dn, newi, true, true, true);
4447
4448 // make sure this inode gets into the journal
4449 le->metablob.add_opened_ino(newi->ino());
4450
4451 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, newi);
4452
4453 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4454 openc_response_t ocresp;
4455
4456 dout(10) << "adding created_ino and delegated_inos" << dendl;
4457 ocresp.created_ino = _inode->ino;
4458
4459 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4460 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4461 unsigned frac = 100 / delegate_inos_pct;
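// For illustration only (values are config-dependent): with delegate_inos_pct = 50
// and mds_client_prealloc_inos = 1000, frac = 2, so we hand the client 500 inos
// whenever it holds fewer than 250 delegated inos.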
4462 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4463 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4464 }
4465
4466 encode(ocresp, mdr->reply_extra_bl);
4467 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4468 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4469 // add the file created flag onto the reply if create_flags features is supported
4470 encode(newi->ino(), mdr->reply_extra_bl);
4471 }
4472
4473 journal_and_reply(mdr, newi, dn, le, fin);
4474
4475 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4476 // have overshot the split size (multiple opencs in flight), so here is
4477 // an early chance to split the dir if this openc makes it oversized.
4478 mds->balancer->maybe_fragment(dir, false);
4479 }
4480
4481
4482
4483 void Server::handle_client_readdir(MDRequestRef& mdr)
4484 {
4485 const cref_t<MClientRequest> &req = mdr->client_request;
4486 Session *session = mds->get_session(req);
4487 client_t client = req->get_source().num();
4488 MutationImpl::LockOpVec lov;
4489 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
4490 if (!diri) return;
4491
4492 // it's a directory, right?
4493 if (!diri->is_dir()) {
4494 // not a dir
4495 dout(10) << "reply to " << *req << " readdir -CEPHFS_ENOTDIR" << dendl;
4496 respond_to_request(mdr, -CEPHFS_ENOTDIR);
4497 return;
4498 }
4499
4500 auto num_caps = session->get_num_caps();
4501 auto session_cap_acquisition = session->get_cap_acquisition();
4502
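// Throttle cap growth from readdir: if this session already holds more than
// max_caps_per_client * max_caps_throttle_ratio caps and has been acquiring
// caps at or above cap_acquisition_throttle, back off and retry the request
// after caps_throttle_retry_request_timeout.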
4503 if (num_caps > static_cast<uint64_t>(max_caps_per_client * max_caps_throttle_ratio) && session_cap_acquisition >= cap_acquisition_throttle) {
4504 dout(20) << "readdir throttled. max_caps_per_client: " << max_caps_per_client << " num_caps: " << num_caps
4505 << " session_cap_acquistion: " << session_cap_acquisition << " cap_acquisition_throttle: " << cap_acquisition_throttle << dendl;
4506 if (logger)
4507 logger->inc(l_mdss_cap_acquisition_throttle);
4508
4509 mds->timer.add_event_after(caps_throttle_retry_request_timeout, new C_MDS_RetryRequest(mdcache, mdr));
4510 return;
4511 }
4512
4513 lov.add_rdlock(&diri->filelock);
4514 lov.add_rdlock(&diri->dirfragtreelock);
4515
4516 if (!mds->locker->acquire_locks(mdr, lov))
4517 return;
4518
4519 if (!check_access(mdr, diri, MAY_READ))
4520 return;
4521
4522 // which frag?
4523 frag_t fg = (__u32)req->head.args.readdir.frag;
4524 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4525 string offset_str = req->get_path2();
4526
4527 __u32 offset_hash = 0;
4528 if (!offset_str.empty())
4529 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4530 else
4531 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4532
4533 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4534 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4535
4536 // does the frag exist?
4537 if (diri->dirfragtree[fg.value()] != fg) {
4538 frag_t newfg;
4539 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4540 if (fg.contains((unsigned)offset_hash)) {
4541 newfg = diri->dirfragtree[offset_hash];
4542 } else {
4543 // client actually wants next frag
4544 newfg = diri->dirfragtree[fg.value()];
4545 }
4546 } else {
4547 offset_str.clear();
4548 newfg = diri->dirfragtree[fg.value()];
4549 }
4550 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4551 fg = newfg;
4552 }
4553
4554 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4555 if (!dir) return;
4556
4557 // ok!
4558 dout(10) << "handle_client_readdir on " << *dir << dendl;
4559 ceph_assert(dir->is_auth());
4560
4561 if (!dir->is_complete()) {
4562 if (dir->is_frozen()) {
4563 dout(7) << "dir is frozen " << *dir << dendl;
4564 mds->locker->drop_locks(mdr.get());
4565 mdr->drop_local_auth_pins();
4566 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4567 return;
4568 }
4569 // fetch
4570 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4571 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4572 return;
4573 }
4574
4575 #ifdef MDS_VERIFY_FRAGSTAT
4576 dir->verify_fragstat();
4577 #endif
4578
4579 utime_t now = ceph_clock_now();
4580 mdr->set_mds_stamp(now);
4581
4582 snapid_t snapid = mdr->snapid;
4583 dout(10) << "snapid " << snapid << dendl;
4584
4585 SnapRealm *realm = diri->find_snaprealm();
4586
4587 unsigned max = req->head.args.readdir.max_entries;
4588 if (!max)
4589 max = dir->get_num_any(); // whatever, something big.
4590 unsigned max_bytes = req->head.args.readdir.max_bytes;
4591 if (!max_bytes)
4592 // make sure at least one item can be encoded
4593 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4594
4595 // start final blob
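// Reply layout assembled below (for reference): DirStat, then a __u32 entry
// count and __u16 flags, then for each dentry its name, a LeaseStat and the
// encoded InodeStat.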
4596 bufferlist dirbl;
4597 DirStat ds;
4598 ds.frag = dir->get_frag();
4599 ds.auth = dir->get_dir_auth().first;
4600 if (dir->is_auth() && !forward_all_requests_to_auth)
4601 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4602
4603 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4604
4605 // count bytes available.
4606 // this isn't perfect, but we should capture the main variable/unbounded size items!
4607 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4608 int bytes_left = max_bytes - front_bytes;
4609 bytes_left -= realm->get_snap_trace().length();
4610
4611 // build dir contents
4612 bufferlist dnbl;
4613 __u32 numfiles = 0;
4614 bool start = !offset_hash && offset_str.empty();
4615 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4616 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4617 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4618 bool end = (it == dir->end());
4619 for (; !end && numfiles < max; end = (it == dir->end())) {
4620 CDentry *dn = it->second;
4621 ++it;
4622
4623 if (dn->state_test(CDentry::STATE_PURGING))
4624 continue;
4625
4626 bool dnp = dn->use_projected(client, mdr);
4627 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4628
4629 if (dnl->is_null())
4630 continue;
4631
4632 if (dn->last < snapid || dn->first > snapid) {
4633 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4634 continue;
4635 }
4636
4637 if (!start) {
4638 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4639 if (!(offset_key < dn->key()))
4640 continue;
4641 }
4642
4643 CInode *in = dnl->get_inode();
4644
4645 if (in && in->ino() == CEPH_INO_CEPH)
4646 continue;
4647
4648 // remote link?
4649 // better for the MDS to do the work, if we think the client will stat any of these files.
4650 if (dnl->is_remote() && !in) {
4651 in = mdcache->get_inode(dnl->get_remote_ino());
4652 if (in) {
4653 dn->link_remote(dnl, in);
4654 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4655 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4656 continue;
4657 } else {
4658 // touch everything i _do_ have
4659 for (auto &p : *dir) {
4660 if (!p.second->get_linkage()->is_null())
4661 mdcache->lru.lru_touch(p.second);
4662 }
4663
4664 // already issued caps and leases, reply immediately.
4665 if (dnbl.length() > 0) {
4666 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4667 dout(10) << " open remote dentry after caps were issued, stopping at "
4668 << dnbl.length() << " < " << bytes_left << dendl;
4669 break;
4670 }
4671
4672 mds->locker->drop_locks(mdr.get());
4673 mdr->drop_local_auth_pins();
4674 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4675 return;
4676 }
4677 }
4678 ceph_assert(in);
4679
4680 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4681 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4682 break;
4683 }
4684
4685 unsigned start_len = dnbl.length();
4686
4687 // dentry
4688 dout(12) << "including dn " << *dn << dendl;
4689 encode(dn->get_name(), dnbl);
4690 int lease_mask = dnl->is_primary() ? CEPH_LEASE_PRIMARY_LINK : 0;
4691 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, dnbl);
4692
4693 // inode
4694 dout(12) << "including inode " << *in << dendl;
4695 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4696 if (r < 0) {
4697 // chop off dn->name, lease
4698 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4699 bufferlist keep;
4700 keep.substr_of(dnbl, 0, start_len);
4701 dnbl.swap(keep);
4702 break;
4703 }
4704 ceph_assert(r >= 0);
4705 numfiles++;
4706
4707 // touch dn
4708 mdcache->lru.lru_touch(dn);
4709 }
4710
4711 session->touch_readdir_cap(numfiles);
4712
4713 __u16 flags = 0;
4714 if (end) {
4715 flags = CEPH_READDIR_FRAG_END;
4716 if (start)
4717 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4718 }
4719 // clients without CEPH_READDIR_REPLY_BITFLAGS only understand the END and COMPLETE flags
4720 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4721 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4722 }
4723
4724 // finish final blob
4725 encode(numfiles, dirbl);
4726 encode(flags, dirbl);
4727 dirbl.claim_append(dnbl);
4728
4729 // yay, reply
4730 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4731 << " bytes=" << dirbl.length()
4732 << " start=" << (int)start
4733 << " end=" << (int)end
4734 << dendl;
4735 mdr->reply_extra_bl = dirbl;
4736
4737 // bump popularity. NOTE: this doesn't quite capture it.
4738 mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles);
4739
4740 // reply
4741 mdr->tracei = diri;
4742 respond_to_request(mdr, 0);
4743 }
4744
4745
4746
4747 // ===============================================================================
4748 // INODE UPDATES
4749
4750
4751 /*
4752 * finisher for basic inode updates
4753 */
4754 class C_MDS_inode_update_finish : public ServerLogContext {
4755 CInode *in;
4756 bool truncating_smaller, changed_ranges, adjust_realm;
4757 public:
4758 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
4759 bool sm=false, bool cr=false, bool ar=false) :
4760 ServerLogContext(s, r), in(i),
4761 truncating_smaller(sm), changed_ranges(cr), adjust_realm(ar) { }
4762 void finish(int r) override {
4763 ceph_assert(r == 0);
4764
4765 int snap_op = (in->snaprealm ? CEPH_SNAP_OP_UPDATE : CEPH_SNAP_OP_SPLIT);
4766
4767 // apply
4768 mdr->apply();
4769
4770 MDSRank *mds = get_mds();
4771
4772 // notify any clients
4773 if (truncating_smaller && in->get_inode()->is_truncating()) {
4774 mds->locker->issue_truncate(in);
4775 mds->mdcache->truncate_inode(in, mdr->ls);
4776 }
4777
4778 if (adjust_realm) {
4779 mds->mdcache->send_snap_update(in, 0, snap_op);
4780 mds->mdcache->do_realm_invalidate_and_update_notify(in, snap_op);
4781 }
4782
4783 get_mds()->balancer->hit_inode(in, META_POP_IWR);
4784
4785 server->respond_to_request(mdr, 0);
4786
4787 if (changed_ranges)
4788 get_mds()->locker->share_inode_max_size(in);
4789 }
4790 };
4791
4792 void Server::handle_client_file_setlock(MDRequestRef& mdr)
4793 {
4794 const cref_t<MClientRequest> &req = mdr->client_request;
4795 MutationImpl::LockOpVec lov;
4796
4797 // get the inode to operate on, and set up any locks needed for that
4798 CInode *cur = rdlock_path_pin_ref(mdr, true);
4799 if (!cur)
4800 return;
4801
4802 lov.add_xlock(&cur->flocklock);
4803 /* acquire_locks will return true if it gets the locks. If it fails,
4804 it will redeliver this request at a later date, so drop the request.
4805 */
4806 if (!mds->locker->acquire_locks(mdr, lov)) {
4807 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4808 return;
4809 }
4810
4811 // copy the lock change into a ceph_filelock so we can store/apply it
4812 ceph_filelock set_lock;
4813 set_lock.start = req->head.args.filelock_change.start;
4814 set_lock.length = req->head.args.filelock_change.length;
4815 set_lock.client = req->get_orig_source().num();
4816 set_lock.owner = req->head.args.filelock_change.owner;
4817 set_lock.pid = req->head.args.filelock_change.pid;
4818 set_lock.type = req->head.args.filelock_change.type;
4819 bool will_wait = req->head.args.filelock_change.wait;
4820
4821 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4822
4823 ceph_lock_state_t *lock_state = NULL;
4824 bool interrupt = false;
4825
4826 // get the appropriate lock state
4827 switch (req->head.args.filelock_change.rule) {
4828 case CEPH_LOCK_FLOCK_INTR:
4829 interrupt = true;
4830 // fall-thru
4831 case CEPH_LOCK_FLOCK:
4832 lock_state = cur->get_flock_lock_state();
4833 break;
4834
4835 case CEPH_LOCK_FCNTL_INTR:
4836 interrupt = true;
4837 // fall-thru
4838 case CEPH_LOCK_FCNTL:
4839 lock_state = cur->get_fcntl_lock_state();
4840 break;
4841
4842 default:
4843 dout(10) << "got unknown lock type " << set_lock.type
4844 << ", dropping request!" << dendl;
4845 respond_to_request(mdr, -CEPHFS_EOPNOTSUPP);
4846 return;
4847 }
4848
4849 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4850 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4851 list<ceph_filelock> activated_locks;
4852 MDSContext::vec waiters;
4853 if (lock_state->is_waiting(set_lock)) {
4854 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4855 lock_state->remove_waiting(set_lock);
4856 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4857 } else if (!interrupt) {
4858 dout(10) << " unlock attempt on " << set_lock << dendl;
4859 lock_state->remove_lock(set_lock, activated_locks);
4860 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4861 }
4862 mds->queue_waiters(waiters);
4863
4864 respond_to_request(mdr, 0);
4865 } else {
4866 dout(10) << " lock attempt on " << set_lock << dendl;
4867 bool deadlock = false;
4868 if (mdr->more()->flock_was_waiting &&
4869 !lock_state->is_waiting(set_lock)) {
4870 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
4871 respond_to_request(mdr, -CEPHFS_EINTR);
4872 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4873 dout(10) << " it failed on this attempt" << dendl;
4874 // couldn't set lock right now
4875 if (deadlock) {
4876 respond_to_request(mdr, -CEPHFS_EDEADLK);
4877 } else if (!will_wait) {
4878 respond_to_request(mdr, -CEPHFS_EWOULDBLOCK);
4879 } else {
4880 dout(10) << " added to waiting list" << dendl;
4881 ceph_assert(lock_state->is_waiting(set_lock));
4882 mdr->more()->flock_was_waiting = true;
4883 mds->locker->drop_locks(mdr.get());
4884 mdr->drop_local_auth_pins();
4885 mdr->mark_event("failed to add lock, waiting");
4886 mdr->mark_nowarn();
4887 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4888 }
4889 } else
4890 respond_to_request(mdr, 0);
4891 }
4892 dout(10) << " state after lock change: " << *lock_state << dendl;
4893 }
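// Illustration (assumed values, not taken from the code above): the handler
// distinguishes whole-file flock()-style locks (CEPH_LOCK_FLOCK) from
// byte-range fcntl()-style locks (CEPH_LOCK_FCNTL) purely by the `rule` field
// the client fills in; the *_INTR variants are used to interrupt a lock the
// client is still waiting on. A rough sketch of the ceph_filelock a client
// might send for a blocking whole-file exclusive lock:
//
//   ceph_filelock l{};
//   l.start  = 0;                // whole file
//   l.length = 0;                // 0 treated as "to EOF" (assumption)
//   l.owner  = owner_token;      // hypothetical owner identifier
//   l.pid    = my_pid;           // hypothetical pid
//   l.type   = CEPH_LOCK_EXCL;
//   // with rule = CEPH_LOCK_FLOCK and wait = 1 in filelock_change
//
// If add_lock() cannot grant it and `wait` is set, the request parks on
// WAIT_FLOCK (as above) and is retried when a lock is released.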
4894
4895 void Server::handle_client_file_readlock(MDRequestRef& mdr)
4896 {
4897 const cref_t<MClientRequest> &req = mdr->client_request;
4898 MutationImpl::LockOpVec lov;
4899
4900 // get the inode to operate on, and set up any locks needed for that
4901 CInode *cur = rdlock_path_pin_ref(mdr, true);
4902 if (!cur)
4903 return;
4904
4905 /* acquire_locks will return true if it gets the locks. If it fails,
4906 it will retry this request later, so drop the request here.
4907 */
4908 lov.add_rdlock(&cur->flocklock);
4909 if (!mds->locker->acquire_locks(mdr, lov)) {
4910 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4911 return;
4912 }
4913
4914 // copy the lock change into a ceph_filelock so we can store/apply it
4915 ceph_filelock checking_lock;
4916 checking_lock.start = req->head.args.filelock_change.start;
4917 checking_lock.length = req->head.args.filelock_change.length;
4918 checking_lock.client = req->get_orig_source().num();
4919 checking_lock.owner = req->head.args.filelock_change.owner;
4920 checking_lock.pid = req->head.args.filelock_change.pid;
4921 checking_lock.type = req->head.args.filelock_change.type;
4922
4923 // get the appropriate lock state
4924 ceph_lock_state_t *lock_state = NULL;
4925 switch (req->head.args.filelock_change.rule) {
4926 case CEPH_LOCK_FLOCK:
4927 lock_state = cur->get_flock_lock_state();
4928 break;
4929
4930 case CEPH_LOCK_FCNTL:
4931 lock_state = cur->get_fcntl_lock_state();
4932 break;
4933
4934 default:
4935 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
4936 respond_to_request(mdr, -CEPHFS_EINVAL);
4937 return;
4938 }
4939 lock_state->look_for_lock(checking_lock);
4940
4941 bufferlist lock_bl;
4942 encode(checking_lock, lock_bl);
4943
4944 mdr->reply_extra_bl = lock_bl;
4945 respond_to_request(mdr, 0);
4946 }
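// Note: this is the query-only (GETLK-style) path. look_for_lock() updates
// `checking_lock` in place, roughly analogous to fcntl(F_GETLK) reporting a
// conflicting lock if one exists, and the encoded result is handed back to
// the client via reply_extra_bl.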
4947
4948 void Server::handle_client_setattr(MDRequestRef& mdr)
4949 {
4950 const cref_t<MClientRequest> &req = mdr->client_request;
4951 MutationImpl::LockOpVec lov;
4952 CInode *cur = rdlock_path_pin_ref(mdr, true);
4953 if (!cur) return;
4954
4955 if (mdr->snapid != CEPH_NOSNAP) {
4956 respond_to_request(mdr, -CEPHFS_EROFS);
4957 return;
4958 }
4959 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4960 respond_to_request(mdr, -CEPHFS_EPERM);
4961 return;
4962 }
4963
4964 __u32 mask = req->head.args.setattr.mask;
4965 __u32 access_mask = MAY_WRITE;
4966
4967 // xlock inode
4968 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
4969 lov.add_xlock(&cur->authlock);
4970 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
4971 lov.add_xlock(&cur->filelock);
4972 if (mask & CEPH_SETATTR_CTIME)
4973 lov.add_wrlock(&cur->versionlock);
4974
4975 if (!mds->locker->acquire_locks(mdr, lov))
4976 return;
4977
4978 if ((mask & CEPH_SETATTR_UID) && (cur->get_inode()->uid != req->head.args.setattr.uid))
4979 access_mask |= MAY_CHOWN;
4980
4981 if ((mask & CEPH_SETATTR_GID) && (cur->get_inode()->gid != req->head.args.setattr.gid))
4982 access_mask |= MAY_CHGRP;
4983
4984 if (!check_access(mdr, cur, access_mask))
4985 return;
4986
4987 // trunc from bigger -> smaller?
4988 const auto& pip = cur->get_projected_inode();
4989
4990 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
4991
4992 // CEPHFS_ENOSPC on growing file while full, but allow shrinks
4993 if (is_full && req->head.args.setattr.size > old_size) {
4994 dout(20) << __func__ << ": full, responding CEPHFS_ENOSPC to setattr with larger size" << dendl;
4995 respond_to_request(mdr, -CEPHFS_ENOSPC);
4996 return;
4997 }
4998
4999 bool truncating_smaller = false;
5000 if (mask & CEPH_SETATTR_SIZE) {
5001 truncating_smaller = req->head.args.setattr.size < old_size;
5002 if (truncating_smaller && pip->is_truncating()) {
5003 dout(10) << " waiting for pending truncate from " << pip->truncate_from
5004 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
5005 mds->locker->drop_locks(mdr.get());
5006 mdr->drop_local_auth_pins();
5007 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
5008 return;
5009 }
5010 }
5011
5012 bool changed_ranges = false;
5013
5014 // project update
5015 mdr->ls = mdlog->get_current_segment();
5016 EUpdate *le = new EUpdate(mdlog, "setattr");
5017 mdlog->start_entry(le);
5018
5019 auto pi = cur->project_inode(mdr);
5020
5021 if (mask & CEPH_SETATTR_UID)
5022 pi.inode->uid = req->head.args.setattr.uid;
5023 if (mask & CEPH_SETATTR_GID)
5024 pi.inode->gid = req->head.args.setattr.gid;
5025
5026 if (mask & CEPH_SETATTR_MODE)
5027 pi.inode->mode = (pi.inode->mode & ~07777) | (req->head.args.setattr.mode & 07777);
5028 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
5029 S_ISREG(pi.inode->mode) &&
5030 (pi.inode->mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
5031 pi.inode->mode &= ~(S_ISUID|S_ISGID);
5032 }
5033
5034 if (mask & CEPH_SETATTR_MTIME)
5035 pi.inode->mtime = req->head.args.setattr.mtime;
5036 if (mask & CEPH_SETATTR_ATIME)
5037 pi.inode->atime = req->head.args.setattr.atime;
5038 if (mask & CEPH_SETATTR_BTIME)
5039 pi.inode->btime = req->head.args.setattr.btime;
5040 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
5041 pi.inode->time_warp_seq++; // maybe not a timewarp, but still a serialization point.
5042 if (mask & CEPH_SETATTR_SIZE) {
5043 if (truncating_smaller) {
5044 pi.inode->truncate(old_size, req->head.args.setattr.size);
5045 le->metablob.add_truncate_start(cur->ino());
5046 } else {
5047 pi.inode->size = req->head.args.setattr.size;
5048 pi.inode->rstat.rbytes = pi.inode->size;
5049 }
5050 pi.inode->mtime = mdr->get_op_stamp();
5051
5052 // adjust client's max_size?
5053 if (mds->locker->calc_new_client_ranges(cur, pi.inode->size)) {
5054 dout(10) << " client_ranges " << cur->get_previous_projected_inode()->client_ranges
5055 << " -> " << pi.inode->client_ranges << dendl;
5056 changed_ranges = true;
5057 }
5058 }
5059
5060 pi.inode->version = cur->pre_dirty();
5061 pi.inode->ctime = mdr->get_op_stamp();
5062 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5063 pi.inode->rstat.rctime = mdr->get_op_stamp();
5064 pi.inode->change_attr++;
5065
5066 // log + wait
5067 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5068 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5069 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5070
5071 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5072 truncating_smaller, changed_ranges));
5073
5074 // flush immediately if there are readers/writers waiting
5075 if (mdr->is_xlocked(&cur->filelock) &&
5076 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5077 mds->mdlog->flush();
5078 }
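// Worked example (sizes assumed): for a shrinking CEPH_SETATTR_SIZE, say from
// a projected size of 8 MiB down to 1 MiB, truncating_smaller is true, so the
// projected inode records truncate(8 MiB, 1 MiB) and the EUpdate carries a
// truncate_start for this ino. The actual object truncation is only kicked
// off by C_MDS_inode_update_finish (via mdcache->truncate_inode(), defined
// earlier in this file) once the journal entry is safe.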
5079
5080 /* Takes responsibility for mdr */
5081 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5082 {
5083 CInode *in = mdr->in[0];
5084 client_t client = mdr->get_client();
5085 ceph_assert(in);
5086
5087 dout(10) << "do_open_truncate " << *in << dendl;
5088
5089 SnapRealm *realm = in->find_snaprealm();
5090 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
5091
5092 mdr->ls = mdlog->get_current_segment();
5093 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5094 mdlog->start_entry(le);
5095
5096 // prepare
5097 auto pi = in->project_inode(mdr);
5098 pi.inode->version = in->pre_dirty();
5099 pi.inode->mtime = pi.inode->ctime = mdr->get_op_stamp();
5100 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5101 pi.inode->rstat.rctime = mdr->get_op_stamp();
5102 pi.inode->change_attr++;
5103
5104 uint64_t old_size = std::max<uint64_t>(pi.inode->size, mdr->client_request->head.args.open.old_size);
5105 if (old_size > 0) {
5106 pi.inode->truncate(old_size, 0);
5107 le->metablob.add_truncate_start(in->ino());
5108 }
5109
5110 bool changed_ranges = false;
5111 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
5112 pi.inode->client_ranges[client].range.first = 0;
5113 pi.inode->client_ranges[client].range.last = pi.inode->get_layout_size_increment();
5114 pi.inode->client_ranges[client].follows = realm->get_newest_seq();
5115 changed_ranges = true;
5116 in->mark_clientwriteable();
5117 cap->mark_clientwriteable();
5118 }
5119
5120 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5121
5122 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5123 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5124
5125 // make sure ino gets into the journal
5126 le->metablob.add_opened_ino(in->ino());
5127
5128 mdr->o_trunc = true;
5129
5130 CDentry *dn = 0;
5131 if (mdr->client_request->get_dentry_wanted()) {
5132 ceph_assert(mdr->dn[0].size());
5133 dn = mdr->dn[0].back();
5134 }
5135
5136 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5137 changed_ranges));
5138 // Although the `open` part can give an early reply, the truncation won't
5139 // happen until our EUpdate is persistent; to give the client a prompt
5140 // response we must also flush that event.
5141 mdlog->flush();
5142 }
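// Note: unlike handle_client_setattr() above, this path always truncates to 0
// (pi.inode->truncate(old_size, 0)) and, for writable opens, seeds the
// client's writable range at one layout size increment. The mdlog->flush()
// above is what makes the truncation visible promptly, since it cannot start
// until the EUpdate is durable.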
5143
5144
5145 /* This function cleans up the passed mdr */
5146 void Server::handle_client_setlayout(MDRequestRef& mdr)
5147 {
5148 const cref_t<MClientRequest> &req = mdr->client_request;
5149 CInode *cur = rdlock_path_pin_ref(mdr, true);
5150 if (!cur) return;
5151
5152 if (mdr->snapid != CEPH_NOSNAP) {
5153 respond_to_request(mdr, -CEPHFS_EROFS);
5154 return;
5155 }
5156 if (!cur->is_file()) {
5157 respond_to_request(mdr, -CEPHFS_EINVAL);
5158 return;
5159 }
5160 if (cur->get_projected_inode()->size ||
5161 cur->get_projected_inode()->truncate_seq > 1) {
5162 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5163 return;
5164 }
5165
5166 // validate layout
5167 file_layout_t layout = cur->get_projected_inode()->layout;
5168 // save existing layout for later
5169 const auto old_layout = layout;
5170
5171 int access = MAY_WRITE;
5172
5173 if (req->head.args.setlayout.layout.fl_object_size > 0)
5174 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5175 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5176 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5177 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5178 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5179 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5180 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5181
5182 // make sure we have as new a map as the client
5183 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5184 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5185 return;
5186 }
5187 }
5188
5189 // Don't permit layout modifications without 'p' caps
5190 if (layout != old_layout) {
5191 access |= MAY_SET_VXATTR;
5192 }
5193
5194 if (!layout.is_valid()) {
5195 dout(10) << "bad layout" << dendl;
5196 respond_to_request(mdr, -CEPHFS_EINVAL);
5197 return;
5198 }
5199 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5200 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5201 respond_to_request(mdr, -CEPHFS_EINVAL);
5202 return;
5203 }
5204
5205 MutationImpl::LockOpVec lov;
5206 lov.add_xlock(&cur->filelock);
5207 if (!mds->locker->acquire_locks(mdr, lov))
5208 return;
5209
5210 if (!check_access(mdr, cur, access))
5211 return;
5212
5213 // project update
5214 auto pi = cur->project_inode(mdr);
5215 pi.inode->layout = layout;
5216 // add the old pool to the inode
5217 pi.inode->add_old_pool(old_layout.pool_id);
5218 pi.inode->version = cur->pre_dirty();
5219 pi.inode->ctime = mdr->get_op_stamp();
5220 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
5221 pi.inode->rstat.rctime = mdr->get_op_stamp();
5222 pi.inode->change_attr++;
5223
5224 // log + wait
5225 mdr->ls = mdlog->get_current_segment();
5226 EUpdate *le = new EUpdate(mdlog, "setlayout");
5227 mdlog->start_entry(le);
5228 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5229 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5230 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5231
5232 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5233 }
5234
5235 bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
5236 {
5237 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5238 return true;
5239
5240 MutationImpl::LockOpVec lov;
5241 lov.add_xlock(&in->policylock);
5242 if (xlock_snaplock)
5243 lov.add_xlock(&in->snaplock);
5244 else
5245 lov.add_rdlock(&in->snaplock);
5246 if (!mds->locker->acquire_locks(mdr, lov))
5247 return false;
5248
5249 if (want_layout && in->get_projected_inode()->has_layout()) {
5250 mdr->dir_layout = in->get_projected_inode()->layout;
5251 want_layout = false;
5252 }
5253 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5254 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5255 return false;
5256 }
5257
5258 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5259 return true;
5260 }
5261
5262 CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5263 {
5264 CInode *in = mdcache->get_inode(ino);
5265 if (!in || in->state_test(CInode::STATE_PURGING)) {
5266 respond_to_request(mdr, -CEPHFS_ESTALE);
5267 return nullptr;
5268 }
5269 if (!in->is_auth()) {
5270 mdcache->request_forward(mdr, in->authority().first);
5271 return nullptr;
5272 }
5273
5274 return in;
5275 }
5276
5277 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5278 {
5279 const cref_t<MClientRequest> &req = mdr->client_request;
5280
5281 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5282 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5283 if (!cur)
5284 return;
5285
5286 if (!cur->is_dir()) {
5287 respond_to_request(mdr, -CEPHFS_ENOTDIR);
5288 return;
5289 }
5290
5291 if (!xlock_policylock(mdr, cur, true))
5292 return;
5293
5294 // validate layout
5295 const auto& old_pi = cur->get_projected_inode();
5296 file_layout_t layout;
5297 if (old_pi->has_layout())
5298 layout = old_pi->layout;
5299 else if (mdr->dir_layout != file_layout_t())
5300 layout = mdr->dir_layout;
5301 else
5302 layout = mdcache->default_file_layout;
5303
5304 // Level of access required to complete
5305 int access = MAY_WRITE;
5306
5307 const auto old_layout = layout;
5308
5309 if (req->head.args.setlayout.layout.fl_object_size > 0)
5310 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5311 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5312 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5313 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5314 layout.stripe_count=req->head.args.setlayout.layout.fl_stripe_count;
5315 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5316 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5317 // make sure we have as new a map as the client
5318 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5319 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5320 return;
5321 }
5322 }
5323
5324 if (layout != old_layout) {
5325 access |= MAY_SET_VXATTR;
5326 }
5327
5328 if (!layout.is_valid()) {
5329 dout(10) << "bad layout" << dendl;
5330 respond_to_request(mdr, -CEPHFS_EINVAL);
5331 return;
5332 }
5333 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5334 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5335 respond_to_request(mdr, -CEPHFS_EINVAL);
5336 return;
5337 }
5338
5339 if (!check_access(mdr, cur, access))
5340 return;
5341
5342 auto pi = cur->project_inode(mdr);
5343 pi.inode->layout = layout;
5344 pi.inode->version = cur->pre_dirty();
5345
5346 // log + wait
5347 mdr->ls = mdlog->get_current_segment();
5348 EUpdate *le = new EUpdate(mdlog, "setlayout");
5349 mdlog->start_entry(le);
5350 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5351 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5352 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5353
5354 mdr->no_early_reply = true;
5355 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5356 }
5357
5358 // XATTRS
5359
5360 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5361 file_layout_t *layout, bool validate)
5362 {
5363 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
5364 try {
5365 if (name == "layout") {
5366 string::iterator begin = value.begin();
5367 string::iterator end = value.end();
5368 keys_and_values<string::iterator> p; // create instance of parser
5369 std::map<string, string> m; // map to receive results
5370 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5371 return -CEPHFS_EINVAL;
5372 }
5373 string left(begin, end);
5374 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5375 if (begin != end)
5376 return -CEPHFS_EINVAL;
5377 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5378 // Skip validation on each attr; we do it once at the end (to avoid
5379 // rejecting intermediate states if the overall result is ok)
5380 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
5381 osdmap, layout, false);
5382 if (r < 0)
5383 return r;
5384 }
5385 } else if (name == "layout.object_size") {
5386 layout->object_size = boost::lexical_cast<unsigned>(value);
5387 } else if (name == "layout.stripe_unit") {
5388 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5389 } else if (name == "layout.stripe_count") {
5390 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5391 } else if (name == "layout.pool") {
5392 try {
5393 layout->pool_id = boost::lexical_cast<unsigned>(value);
5394 } catch (boost::bad_lexical_cast const&) {
5395 int64_t pool = osdmap.lookup_pg_pool_name(value);
5396 if (pool < 0) {
5397 dout(10) << " unknown pool " << value << dendl;
5398 return -CEPHFS_ENOENT;
5399 }
5400 layout->pool_id = pool;
5401 }
5402 } else if (name == "layout.pool_namespace") {
5403 layout->pool_ns = value;
5404 } else {
5405 dout(10) << " unknown layout vxattr " << name << dendl;
5406 return -CEPHFS_EINVAL;
5407 }
5408 } catch (boost::bad_lexical_cast const&) {
5409 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5410 return -CEPHFS_EINVAL;
5411 }
5412
5413 if (validate && !layout->is_valid()) {
5414 dout(10) << "bad layout" << dendl;
5415 return -CEPHFS_EINVAL;
5416 }
5417 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5418 dout(10) << " invalid data pool " << layout->pool_id << dendl;
5419 return -CEPHFS_EINVAL;
5420 }
5421 return 0;
5422 }
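// A minimal standalone sketch of the key=value tokenization used by the
// composite "layout" vxattr above (the real parser is the Boost.Spirit-based
// keys_and_values<> used above; the example value is an assumption):
#if 0
#include <iostream>
#include <map>
#include <sstream>
#include <string>

int main() {
  // e.g. what might be stored in ceph.dir.layout (values are made up)
  std::string value = "stripe_unit=4194304 stripe_count=1 object_size=4194304 pool=cephfs_data";
  std::map<std::string, std::string> m;
  std::istringstream is(value);
  std::string tok;
  while (is >> tok) {
    auto eq = tok.find('=');
    if (eq == std::string::npos)
      return 1;                              // malformed token, comparable to -CEPHFS_EINVAL above
    m[tok.substr(0, eq)] = tok.substr(eq + 1);
  }
  for (const auto& [k, v] : m)
    std::cout << "layout." << k << " = " << v << "\n";  // re-dispatched as "layout.<key>" above
  return 0;
}
#endif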
5423
5424 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5425 {
5426 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5427 try {
5428 if (name == "quota") {
5429 string::iterator begin = value.begin();
5430 string::iterator end = value.end();
5431 if (begin == end) {
5432 // keep quota unchanged. (for create_quota_realm())
5433 return 0;
5434 }
5435 keys_and_values<string::iterator> p; // create instance of parser
5436 std::map<string, string> m; // map to receive results
5437 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5438 return -CEPHFS_EINVAL;
5439 }
5440 string left(begin, end);
5441 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5442 if (begin != end)
5443 return -CEPHFS_EINVAL;
5444 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5445 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5446 if (r < 0)
5447 return r;
5448 }
5449 } else if (name == "quota.max_bytes") {
5450 int64_t q = boost::lexical_cast<int64_t>(value);
5451 if (q < 0)
5452 return -CEPHFS_EINVAL;
5453 quota->max_bytes = q;
5454 } else if (name == "quota.max_files") {
5455 int64_t q = boost::lexical_cast<int64_t>(value);
5456 if (q < 0)
5457 return -CEPHFS_EINVAL;
5458 quota->max_files = q;
5459 } else {
5460 dout(10) << " unknown quota vxattr " << name << dendl;
5461 return -CEPHFS_EINVAL;
5462 }
5463 } catch (boost::bad_lexical_cast const&) {
5464 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5465 return -CEPHFS_EINVAL;
5466 }
5467
5468 if (!quota->is_valid()) {
5469 dout(10) << "bad quota" << dendl;
5470 return -CEPHFS_EINVAL;
5471 }
5472 return 0;
5473 }
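// The composite "quota" vxattr uses the same key=value form, for example
// (values are made up):
//
//   "max_bytes=10737418240 max_files=10000"
//
// Each key is re-dispatched as "quota.max_bytes" / "quota.max_files"; negative
// values are rejected with -CEPHFS_EINVAL, and an empty value deliberately
// leaves the quota untouched so create_quota_realm() can reuse this path.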
5474
5475 void Server::create_quota_realm(CInode *in)
5476 {
5477 dout(10) << __func__ << " " << *in << dendl;
5478
5479 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
5480 req->set_filepath(filepath(in->ino()));
5481 req->set_string2("ceph.quota");
5482 // empty vxattr value
5483 req->set_tid(mds->issue_tid());
5484
5485 mds->send_message_mds(req, in->authority().first);
5486 }
5487
5488 /*
5489 * Verify that the file layout attribute carried by the client
5490 * is well formed.
5491 * Return 0 on success; otherwise this function takes
5492 * responsibility for the passed mdr.
5493 */
5494 int Server::check_layout_vxattr(MDRequestRef& mdr,
5495 string name,
5496 string value,
5497 file_layout_t *layout)
5498 {
5499 const cref_t<MClientRequest> &req = mdr->client_request;
5500 epoch_t epoch;
5501 int r;
5502
5503 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5504 r = parse_layout_vxattr(name, value, osdmap, layout);
5505 epoch = osdmap.get_epoch();
5506 });
5507
5508 if (r == -CEPHFS_ENOENT) {
5509
5510 // we don't have the specified pool; make sure our map
5511 // is at least as new as the client's.
5512 epoch_t req_epoch = req->get_osdmap_epoch();
5513
5514 if (req_epoch > epoch) {
5515
5516 // well, our map is older; wait for a newer osdmap before retrying.
5517 auto fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5518
5519 mds->objecter->wait_for_map(req_epoch, lambdafy(fin));
5520 return r;
5521 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5522
5523 // For compatibility with clients running old code, we still need to get the
5524 // latest map. One day, once COMPACT_VERSION of MClientRequest is >= 3,
5525 // we can remove this code.
5526 mdr->waited_for_osdmap = true;
5527 mds->objecter->wait_for_latest_osdmap(std::ref(*new C_IO_Wrapper(
5528 mds, new C_MDS_RetryRequest(mdcache, mdr))));
5529 return r;
5530 }
5531 }
5532
5533 if (r < 0) {
5534
5535 if (r == -CEPHFS_ENOENT)
5536 r = -CEPHFS_EINVAL;
5537
5538 respond_to_request(mdr, r);
5539 return r;
5540 }
5541
5542 // all is well
5543 return 0;
5544 }
5545
5546 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
5547 {
5548 const cref_t<MClientRequest> &req = mdr->client_request;
5549 string name(req->get_path2());
5550 bufferlist bl = req->get_data();
5551 string value (bl.c_str(), bl.length());
5552 dout(10) << "handle_set_vxattr " << name
5553 << " val " << value.length()
5554 << " bytes on " << *cur
5555 << dendl;
5556
5557 CInode::mempool_inode *pip = nullptr;
5558 string rest;
5559
5560 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5561 return;
5562 }
5563
5564 bool adjust_realm = false;
5565 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5566 if (!cur->is_dir()) {
5567 respond_to_request(mdr, -CEPHFS_EINVAL);
5568 return;
5569 }
5570
5571 if (!xlock_policylock(mdr, cur, true))
5572 return;
5573
5574 file_layout_t layout;
5575 if (cur->get_projected_inode()->has_layout())
5576 layout = cur->get_projected_inode()->layout;
5577 else if (mdr->dir_layout != file_layout_t())
5578 layout = mdr->dir_layout;
5579 else
5580 layout = mdcache->default_file_layout;
5581
5582 rest = name.substr(name.find("layout"));
5583 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5584 return;
5585
5586 auto pi = cur->project_inode(mdr);
5587 pi.inode->layout = layout;
5588 mdr->no_early_reply = true;
5589 pip = pi.inode.get();
5590 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5591 if (!cur->is_file()) {
5592 respond_to_request(mdr, -CEPHFS_EINVAL);
5593 return;
5594 }
5595 if (cur->get_projected_inode()->size ||
5596 cur->get_projected_inode()->truncate_seq > 1) {
5597 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
5598 return;
5599 }
5600 file_layout_t layout = cur->get_projected_inode()->layout;
5601 rest = name.substr(name.find("layout"));
5602 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5603 return;
5604
5605 MutationImpl::LockOpVec lov;
5606 lov.add_xlock(&cur->filelock);
5607 if (!mds->locker->acquire_locks(mdr, lov))
5608 return;
5609
5610 auto pi = cur->project_inode(mdr);
5611 int64_t old_pool = pi.inode->layout.pool_id;
5612 pi.inode->add_old_pool(old_pool);
5613 pi.inode->layout = layout;
5614 pip = pi.inode.get();
5615 } else if (name.compare(0, 10, "ceph.quota") == 0) {
5616 if (!cur->is_dir()) {
5617 respond_to_request(mdr, -CEPHFS_EINVAL);
5618 return;
5619 }
5620
5621 quota_info_t quota = cur->get_projected_inode()->quota;
5622
5623 rest = name.substr(name.find("quota"));
5624 int r = parse_quota_vxattr(rest, value, &quota);
5625 if (r < 0) {
5626 respond_to_request(mdr, r);
5627 return;
5628 }
5629
5630 if (quota.is_enable() && !cur->get_projected_srnode())
5631 adjust_realm = true;
5632
5633 if (!xlock_policylock(mdr, cur, false, adjust_realm))
5634 return;
5635
5636 if (cur->get_projected_inode()->quota == quota) {
5637 respond_to_request(mdr, 0);
5638 return;
5639 }
5640
5641 auto pi = cur->project_inode(mdr, false, adjust_realm);
5642 pi.inode->quota = quota;
5643
5644 if (adjust_realm)
5645 pi.snapnode->created = pi.snapnode->seq = cur->find_snaprealm()->get_newest_seq();
5646
5647 mdr->no_early_reply = true;
5648 pip = pi.inode.get();
5649
5650 client_t exclude_ct = mdr->get_client();
5651 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
5652 } else if (name == "ceph.dir.subvolume"sv) {
5653 if (!cur->is_dir()) {
5654 respond_to_request(mdr, -CEPHFS_EINVAL);
5655 return;
5656 }
5657
5658 bool val;
5659 try {
5660 val = boost::lexical_cast<bool>(value);
5661 } catch (boost::bad_lexical_cast const&) {
5662 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
5663 respond_to_request(mdr, -CEPHFS_EINVAL);
5664 return;
5665 }
5666
5667 if (!xlock_policylock(mdr, cur, false, true))
5668 return;
5669
5670 SnapRealm *realm = cur->find_snaprealm();
5671 if (val) {
5672 inodeno_t subvol_ino = realm->get_subvolume_ino();
5673 // can't create subvolume inside another subvolume
5674 if (subvol_ino && subvol_ino != cur->ino()) {
5675 respond_to_request(mdr, -CEPHFS_EINVAL);
5676 return;
5677 }
5678 }
5679
5680 const auto srnode = cur->get_projected_srnode();
5681 if (val == (srnode && srnode->is_subvolume())) {
5682 respond_to_request(mdr, 0);
5683 return;
5684 }
5685
5686 auto pi = cur->project_inode(mdr, false, true);
5687 if (!srnode)
5688 pi.snapnode->created = pi.snapnode->seq = realm->get_newest_seq();
5689 if (val)
5690 pi.snapnode->mark_subvolume();
5691 else
5692 pi.snapnode->clear_subvolume();
5693
5694 mdr->no_early_reply = true;
5695 pip = pi.inode.get();
5696 adjust_realm = true;
5697 } else if (name == "ceph.dir.pin"sv) {
5698 if (!cur->is_dir() || cur->is_root()) {
5699 respond_to_request(mdr, -CEPHFS_EINVAL);
5700 return;
5701 }
5702
5703 mds_rank_t rank;
5704 try {
5705 rank = boost::lexical_cast<mds_rank_t>(value);
5706 if (rank < 0) rank = MDS_RANK_NONE;
5707 } catch (boost::bad_lexical_cast const&) {
5708 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5709 respond_to_request(mdr, -CEPHFS_EINVAL);
5710 return;
5711 }
5712
5713 if (!xlock_policylock(mdr, cur))
5714 return;
5715
5716 auto pi = cur->project_inode(mdr);
5717 cur->set_export_pin(rank);
5718 pip = pi.inode.get();
5719 } else if (name == "ceph.dir.pin.random"sv) {
5720 if (!cur->is_dir() || cur->is_root()) {
5721 respond_to_request(mdr, -CEPHFS_EINVAL);
5722 return;
5723 }
5724
5725 double val;
5726 try {
5727 val = boost::lexical_cast<double>(value);
5728 } catch (boost::bad_lexical_cast const&) {
5729 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
5730 respond_to_request(mdr, -CEPHFS_EINVAL);
5731 return;
5732 }
5733
5734 if (val < 0.0 || 1.0 < val) {
5735 respond_to_request(mdr, -CEPHFS_EDOM);
5736 return;
5737 } else if (mdcache->export_ephemeral_random_max < val) {
5738 respond_to_request(mdr, -CEPHFS_EINVAL);
5739 return;
5740 }
5741
5742 if (!xlock_policylock(mdr, cur))
5743 return;
5744
5745 auto pi = cur->project_inode(mdr);
5746 cur->setxattr_ephemeral_rand(val);
5747 pip = pi.inode.get();
5748 } else if (name == "ceph.dir.pin.distributed"sv) {
5749 if (!cur->is_dir() || cur->is_root()) {
5750 respond_to_request(mdr, -CEPHFS_EINVAL);
5751 return;
5752 }
5753
5754 bool val;
5755 try {
5756 val = boost::lexical_cast<bool>(value);
5757 } catch (boost::bad_lexical_cast const&) {
5758 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
5759 respond_to_request(mdr, -CEPHFS_EINVAL);
5760 return;
5761 }
5762
5763 if (!xlock_policylock(mdr, cur))
5764 return;
5765
5766 auto pi = cur->project_inode(mdr);
5767 cur->setxattr_ephemeral_dist(val);
5768 pip = pi.inode.get();
5769 } else {
5770 dout(10) << " unknown vxattr " << name << dendl;
5771 respond_to_request(mdr, -CEPHFS_EINVAL);
5772 return;
5773 }
5774
5775 pip->change_attr++;
5776 pip->ctime = mdr->get_op_stamp();
5777 if (mdr->get_op_stamp() > pip->rstat.rctime)
5778 pip->rstat.rctime = mdr->get_op_stamp();
5779 pip->version = cur->pre_dirty();
5780 if (cur->is_file())
5781 pip->update_backtrace();
5782
5783 // log + wait
5784 mdr->ls = mdlog->get_current_segment();
5785 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
5786 mdlog->start_entry(le);
5787 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5788 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5789 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5790
5791 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5792 false, false, adjust_realm));
5793 return;
5794 }
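// The vxattrs handled above are normally driven from a client mount via
// setxattr, e.g. (paths and values assumed for illustration):
//
//   setfattr -n ceph.quota.max_bytes  -v 10737418240 /mnt/cephfs/dir
//   setfattr -n ceph.dir.pin          -v 1           /mnt/cephfs/dir
//   setfattr -n ceph.dir.pin.random   -v 0.01        /mnt/cephfs/dir
//   setfattr -n ceph.file.layout.pool -v cephfs_data /mnt/cephfs/file
//
// Those arrive here as CEPH_MDS_OP_SETXATTR requests whose names are
// recognized by is_ceph_vxattr() (see handle_client_setxattr() below).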
5795
5796 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
5797 {
5798 const cref_t<MClientRequest> &req = mdr->client_request;
5799 string name(req->get_path2());
5800
5801 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
5802
5803 if (name == "ceph.dir.layout") {
5804 if (!cur->is_dir()) {
5805 respond_to_request(mdr, -CEPHFS_ENODATA);
5806 return;
5807 }
5808 if (cur->is_root()) {
5809 dout(10) << "can't remove layout policy on the root directory" << dendl;
5810 respond_to_request(mdr, -CEPHFS_EINVAL);
5811 return;
5812 }
5813
5814 if (!cur->get_projected_inode()->has_layout()) {
5815 respond_to_request(mdr, -CEPHFS_ENODATA);
5816 return;
5817 }
5818
5819 MutationImpl::LockOpVec lov;
5820 lov.add_xlock(&cur->policylock);
5821 if (!mds->locker->acquire_locks(mdr, lov))
5822 return;
5823
5824 auto pi = cur->project_inode(mdr);
5825 pi.inode->clear_layout();
5826 pi.inode->version = cur->pre_dirty();
5827
5828 // log + wait
5829 mdr->ls = mdlog->get_current_segment();
5830 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
5831 mdlog->start_entry(le);
5832 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5833 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5834 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5835
5836 mdr->no_early_reply = true;
5837 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5838 return;
5839 } else if (name == "ceph.dir.layout.pool_namespace"
5840 || name == "ceph.file.layout.pool_namespace") {
5841 // Namespace is the only layout field that has a meaningful
5842 // null/none value (an empty string means the default layout). Removing it
5843 // is equivalent to a setxattr with an empty string: pass the empty payload
5844 // of the rmxattr request through to do this.
5845 handle_set_vxattr(mdr, cur);
5846 return;
5847 }
5848
5849 respond_to_request(mdr, -CEPHFS_ENODATA);
5850 }
5851
5852 const Server::XattrHandler Server::xattr_handlers[] = {
5853 {
5854 xattr_name: Server::DEFAULT_HANDLER,
5855 description: "default xattr handler",
5856 validate: &Server::default_xattr_validate,
5857 setxattr: &Server::default_setxattr_handler,
5858 removexattr: &Server::default_removexattr_handler,
5859 },
5860 {
5861 xattr_name: "ceph.mirror.info",
5862 description: "mirror info xattr handler",
5863 validate: &Server::mirror_info_xattr_validate,
5864 setxattr: &Server::mirror_info_setxattr_handler,
5865 removexattr: &Server::mirror_info_removexattr_handler
5866 },
5867 };
5868
5869 const Server::XattrHandler* Server::get_xattr_or_default_handler(std::string_view xattr_name) {
5870 const XattrHandler *default_xattr_handler = nullptr;
5871
5872 for (auto &handler : xattr_handlers) {
5873 if (handler.xattr_name == Server::DEFAULT_HANDLER) {
5874 ceph_assert(default_xattr_handler == nullptr);
5875 default_xattr_handler = &handler;
5876 }
5877 if (handler.xattr_name == xattr_name) {
5878 dout(20) << "handler=" << handler.description << dendl;
5879 return &handler;
5880 }
5881 }
5882
5883 ceph_assert(default_xattr_handler != nullptr);
5884 dout(20) << "handler=" << default_xattr_handler->description << dendl;
5885 return default_xattr_handler;
5886 }
5887
5888 int Server::xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
5889 const std::string &xattr_name, int op, int flags) {
5890 if (op == CEPH_MDS_OP_SETXATTR) {
5891 if (xattrs) {
5892 if ((flags & CEPH_XATTR_CREATE) && xattrs->count(mempool::mds_co::string(xattr_name))) {
5893 dout(10) << "setxattr '" << xattr_name << "' XATTR_CREATE and CEPHFS_EEXIST on " << *cur << dendl;
5894 return -CEPHFS_EEXIST;
5895 }
5896 }
5897 if ((flags & CEPH_XATTR_REPLACE) && !(xattrs && xattrs->count(mempool::mds_co::string(xattr_name)))) {
5898 dout(10) << "setxattr '" << xattr_name << "' XATTR_REPLACE and CEPHFS_ENODATA on " << *cur << dendl;
5899 return -CEPHFS_ENODATA;
5900 }
5901
5902 return 0;
5903 }
5904
5905 if (op == CEPH_MDS_OP_RMXATTR) {
5906 if (!xattrs || xattrs->count(mempool::mds_co::string(xattr_name)) == 0) {
5907 dout(10) << "removexattr '" << xattr_name << "' and CEPHFS_ENODATA on " << *cur << dendl;
5908 return -CEPHFS_ENODATA;
5909 }
5910
5911 return 0;
5912 }
5913
5914 derr << ": unhandled validation for: " << xattr_name << dendl;
5915 return -CEPHFS_EINVAL;
5916 }
5917
5918 void Server::xattr_set(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name,
5919 const bufferlist &xattr_value) {
5920 size_t len = xattr_value.length();
5921 bufferptr b = buffer::create(len);
5922 if (len) {
5923 xattr_value.begin().copy(len, b.c_str());
5924 }
5925 auto em = xattrs->emplace(std::piecewise_construct,
5926 std::forward_as_tuple(mempool::mds_co::string(xattr_name)),
5927 std::forward_as_tuple(b));
5928 if (!em.second) {
5929 em.first->second = b;
5930 }
5931 }
5932
5933 void Server::xattr_rm(InodeStoreBase::xattr_map_ptr xattrs, const std::string &xattr_name) {
5934 xattrs->erase(mempool::mds_co::string(xattr_name));
5935 }
5936
5937 int Server::default_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
5938 XattrOp *xattr_op) {
5939 return xattr_validate(cur, xattrs, xattr_op->xattr_name, xattr_op->op, xattr_op->flags);
5940 }
5941
5942 void Server::default_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
5943 const XattrOp &xattr_op) {
5944 xattr_set(xattrs, xattr_op.xattr_name, xattr_op.xattr_value);
5945 }
5946
5947 void Server::default_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
5948 const XattrOp &xattr_op) {
5949 xattr_rm(xattrs, xattr_op.xattr_name);
5950 }
5951
5952 // mirror info xattr handlers
5953 const std::string Server::MirrorXattrInfo::MIRROR_INFO_REGEX = "^cluster_id=([a-f0-9]{8}-" \
5954 "[a-f0-9]{4}-[a-f0-9]{4}-" \
5955 "[a-f0-9]{4}-[a-f0-9]{12})" \
5956 " fs_id=(\\d+)$";
5957 const std::string Server::MirrorXattrInfo::CLUSTER_ID = "ceph.mirror.info.cluster_id";
5958 const std::string Server::MirrorXattrInfo::FS_ID = "ceph.mirror.info.fs_id";
5959 int Server::parse_mirror_info_xattr(const std::string &name, const std::string &value,
5960 std::string &cluster_id, std::string &fs_id) {
5961 dout(20) << "parsing name=" << name << ", value=" << value << dendl;
5962
5963 static const std::regex regex(Server::MirrorXattrInfo::MIRROR_INFO_REGEX);
5964 std::smatch match;
5965
5966 std::regex_search(value, match, regex);
5967 if (match.size() != 3) {
5968 derr << "mirror info parse error" << dendl;
5969 return -CEPHFS_EINVAL;
5970 }
5971
5972 cluster_id = match[1];
5973 fs_id = match[2];
5974 dout(20) << " parsed cluster_id=" << cluster_id << ", fs_id=" << fs_id << dendl;
5975 return 0;
5976 }
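// A minimal standalone check of the value format accepted above (the UUID and
// fs id below are made-up examples):
#if 0
#include <iostream>
#include <regex>
#include <string>

int main() {
  static const std::regex re("^cluster_id=([a-f0-9]{8}-[a-f0-9]{4}-[a-f0-9]{4}-"
                             "[a-f0-9]{4}-[a-f0-9]{12}) fs_id=(\\d+)$");
  std::string value = "cluster_id=6b2b1d8e-0f4a-4c3e-9d2a-1a2b3c4d5e6f fs_id=1";
  std::smatch m;
  if (std::regex_search(value, m, re) && m.size() == 3)
    std::cout << "cluster_id=" << m[1] << " fs_id=" << m[2] << "\n";
  return 0;
}
#endif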
5977
5978 int Server::mirror_info_xattr_validate(CInode *cur, const InodeStoreBase::xattr_map_const_ptr xattrs,
5979 XattrOp *xattr_op) {
5980 if (!cur->is_root()) {
5981 return -CEPHFS_EINVAL;
5982 }
5983
5984 int v1 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::CLUSTER_ID, xattr_op->op, xattr_op->flags);
5985 int v2 = xattr_validate(cur, xattrs, Server::MirrorXattrInfo::FS_ID, xattr_op->op, xattr_op->flags);
5986 if (v1 != v2) {
5987 derr << "inconsistent mirror info state (" << v1 << "," << v2 << ")" << dendl;
5988 return -CEPHFS_EINVAL;
5989 }
5990
5991 if (v1 < 0) {
5992 return v1;
5993 }
5994
5995 if (xattr_op->op == CEPH_MDS_OP_RMXATTR) {
5996 return 0;
5997 }
5998
5999 std::string cluster_id;
6000 std::string fs_id;
6001 int r = parse_mirror_info_xattr(xattr_op->xattr_name, xattr_op->xattr_value.to_str(),
6002 cluster_id, fs_id);
6003 if (r < 0) {
6004 return r;
6005 }
6006
6007 xattr_op->xinfo = std::make_unique<MirrorXattrInfo>(cluster_id, fs_id);
6008 return 0;
6009 }
6010
6011 void Server::mirror_info_setxattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6012 const XattrOp &xattr_op) {
6013 auto mirror_info = dynamic_cast<MirrorXattrInfo&>(*(xattr_op.xinfo));
6014
6015 bufferlist bl;
6016 bl.append(mirror_info.cluster_id.c_str(), mirror_info.cluster_id.length());
6017 xattr_set(xattrs, Server::MirrorXattrInfo::CLUSTER_ID, bl);
6018
6019 bl.clear();
6020 bl.append(mirror_info.fs_id.c_str(), mirror_info.fs_id.length());
6021 xattr_set(xattrs, Server::MirrorXattrInfo::FS_ID, bl);
6022 }
6023
6024 void Server::mirror_info_removexattr_handler(CInode *cur, InodeStoreBase::xattr_map_ptr xattrs,
6025 const XattrOp &xattr_op) {
6026 xattr_rm(xattrs, Server::MirrorXattrInfo::CLUSTER_ID);
6027 xattr_rm(xattrs, Server::MirrorXattrInfo::FS_ID);
6028 }
6029
6030 void Server::handle_client_setxattr(MDRequestRef& mdr)
6031 {
6032 const cref_t<MClientRequest> &req = mdr->client_request;
6033 string name(req->get_path2());
6034
6035 // is a ceph virtual xattr?
6036 if (is_ceph_vxattr(name)) {
6037 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6038 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6039 if (!cur)
6040 return;
6041
6042 handle_set_vxattr(mdr, cur);
6043 return;
6044 }
6045
6046 if (!is_allowed_ceph_xattr(name)) {
6047 respond_to_request(mdr, -CEPHFS_EINVAL);
6048 return;
6049 }
6050
6051 CInode *cur = rdlock_path_pin_ref(mdr, true);
6052 if (!cur)
6053 return;
6054
6055 if (mdr->snapid != CEPH_NOSNAP) {
6056 respond_to_request(mdr, -CEPHFS_EROFS);
6057 return;
6058 }
6059
6060 int flags = req->head.args.setxattr.flags;
6061
6062 MutationImpl::LockOpVec lov;
6063 lov.add_xlock(&cur->xattrlock);
6064 if (!mds->locker->acquire_locks(mdr, lov))
6065 return;
6066
6067 if (!check_access(mdr, cur, MAY_WRITE))
6068 return;
6069
6070 size_t len = req->get_data().length();
6071 size_t inc = len + name.length();
6072
6073 auto handler = Server::get_xattr_or_default_handler(name);
6074 const auto& pxattrs = cur->get_projected_xattrs();
6075 if (pxattrs) {
6076 // check xattrs kv pairs size
6077 size_t cur_xattrs_size = 0;
6078 for (const auto& p : *pxattrs) {
6079 if ((flags & CEPH_XATTR_REPLACE) && name.compare(p.first) == 0) {
6080 continue;
6081 }
6082 cur_xattrs_size += p.first.length() + p.second.length();
6083 }
6084
6085 if (((cur_xattrs_size + inc) > g_conf()->mds_max_xattr_pairs_size)) {
6086 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
6087 << cur_xattrs_size << ", inc " << inc << dendl;
6088 respond_to_request(mdr, -CEPHFS_ENOSPC);
6089 return;
6090 }
6091 }
6092
6093 XattrOp xattr_op(CEPH_MDS_OP_SETXATTR, name, req->get_data(), flags);
6094 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6095 if (r < 0) {
6096 respond_to_request(mdr, r);
6097 return;
6098 }
6099
6100 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
6101
6102 // project update
6103 auto pi = cur->project_inode(mdr, true);
6104 pi.inode->version = cur->pre_dirty();
6105 pi.inode->ctime = mdr->get_op_stamp();
6106 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6107 pi.inode->rstat.rctime = mdr->get_op_stamp();
6108 if (name == "encryption.ctx"sv)
6109 pi.inode->fscrypt = true;
6110 pi.inode->change_attr++;
6111 pi.inode->xattr_version++;
6112
6113 if ((flags & CEPH_XATTR_REMOVE)) {
6114 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6115 } else {
6116 std::invoke(handler->setxattr, this, cur, pi.xattrs, xattr_op);
6117 }
6118
6119 // log + wait
6120 mdr->ls = mdlog->get_current_segment();
6121 EUpdate *le = new EUpdate(mdlog, "setxattr");
6122 mdlog->start_entry(le);
6123 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6124 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6125 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6126
6127 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6128 }
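// Worked example of the size check above (numbers assumed): with existing
// xattrs totalling cur_xattrs_size = 60000 bytes of key+value data and a new
// pair whose key and value lengths sum to inc = 8000, the request is refused
// with -CEPHFS_ENOSPC if 60000 + 8000 exceeds mds_max_xattr_pairs_size; a
// CEPH_XATTR_REPLACE of an existing key does not count that key's old pair
// against the total.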
6129
6130 void Server::handle_client_removexattr(MDRequestRef& mdr)
6131 {
6132 const cref_t<MClientRequest> &req = mdr->client_request;
6133 std::string name(req->get_path2());
6134
6135 // is a ceph virtual xattr?
6136 if (is_ceph_vxattr(name)) {
6137 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
6138 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
6139 if (!cur)
6140 return;
6141
6142 handle_remove_vxattr(mdr, cur);
6143 return;
6144 }
6145
6146 if (!is_allowed_ceph_xattr(name)) {
6147 respond_to_request(mdr, -CEPHFS_EINVAL);
6148 return;
6149 }
6150
6151 CInode* cur = rdlock_path_pin_ref(mdr, true);
6152 if (!cur)
6153 return;
6154
6155 if (mdr->snapid != CEPH_NOSNAP) {
6156 respond_to_request(mdr, -CEPHFS_EROFS);
6157 return;
6158 }
6159
6160 MutationImpl::LockOpVec lov;
6161 lov.add_xlock(&cur->xattrlock);
6162 if (!mds->locker->acquire_locks(mdr, lov))
6163 return;
6164
6165
6166 auto handler = Server::get_xattr_or_default_handler(name);
6167 bufferlist bl;
6168 XattrOp xattr_op(CEPH_MDS_OP_RMXATTR, name, bl, 0);
6169
6170 const auto& pxattrs = cur->get_projected_xattrs();
6171 int r = std::invoke(handler->validate, this, cur, pxattrs, &xattr_op);
6172 if (r < 0) {
6173 respond_to_request(mdr, r);
6174 return;
6175 }
6176
6177 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
6178
6179 // project update
6180 auto pi = cur->project_inode(mdr, true);
6181 pi.inode->version = cur->pre_dirty();
6182 pi.inode->ctime = mdr->get_op_stamp();
6183 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6184 pi.inode->rstat.rctime = mdr->get_op_stamp();
6185 pi.inode->change_attr++;
6186 pi.inode->xattr_version++;
6187 std::invoke(handler->removexattr, this, cur, pi.xattrs, xattr_op);
6188
6189 // log + wait
6190 mdr->ls = mdlog->get_current_segment();
6191 EUpdate *le = new EUpdate(mdlog, "removexattr");
6192 mdlog->start_entry(le);
6193 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6194 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
6195 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
6196
6197 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
6198 }
6199
6200
6201 // =================================================================
6202 // DIRECTORY and NAMESPACE OPS
6203
6204
6205 // ------------------------------------------------
6206
6207 // MKNOD
6208
6209 class C_MDS_mknod_finish : public ServerLogContext {
6210 CDentry *dn;
6211 CInode *newi;
6212 public:
6213 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
6214 ServerLogContext(s, r), dn(d), newi(ni) {}
6215 void finish(int r) override {
6216 ceph_assert(r == 0);
6217
6218 // link the inode
6219 dn->pop_projected_linkage();
6220
6221 // be a bit hacky with the inode version, here.. we decrement it
6222 // just to keep mark_dirty() happy. (we didn't bother projecting
6223 // a new version of the inode since it's just been created)
6224 newi->mark_dirty(mdr->ls);
6225 newi->mark_dirty_parent(mdr->ls, true);
6226
6227 // mkdir?
6228 if (newi->is_dir()) {
6229 CDir *dir = newi->get_dirfrag(frag_t());
6230 ceph_assert(dir);
6231 dir->mark_dirty(mdr->ls);
6232 dir->mark_new(mdr->ls);
6233 }
6234
6235 mdr->apply();
6236
6237 MDRequestRef null_ref;
6238 get_mds()->mdcache->send_dentry_link(dn, null_ref);
6239
6240 if (newi->is_file()) {
6241 get_mds()->locker->share_inode_max_size(newi);
6242 } else if (newi->is_dir()) {
6243 // We do this now so that the linkages on the new directory are stable.
6244 newi->maybe_ephemeral_rand();
6245 }
6246
6247 // hit pop
6248 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
6249
6250 // reply
6251 server->respond_to_request(mdr, 0);
6252 }
6253 };
6254
6255
6256 void Server::handle_client_mknod(MDRequestRef& mdr)
6257 {
6258 const cref_t<MClientRequest> &req = mdr->client_request;
6259 client_t client = mdr->get_client();
6260
6261 unsigned mode = req->head.args.mknod.mode;
6262 if ((mode & S_IFMT) == 0)
6263 mode |= S_IFREG;
6264
6265 mdr->disable_lock_cache();
6266 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
6267 if (!dn)
6268 return;
6269
6270 CDir *dir = dn->get_dir();
6271 CInode *diri = dir->get_inode();
6272 if (!check_access(mdr, diri, MAY_WRITE))
6273 return;
6274 if (!check_fragment_space(mdr, dn->get_dir()))
6275 return;
6276
6277 ceph_assert(dn->get_projected_linkage()->is_null());
6278 if (req->get_alternate_name().size() > alternate_name_max) {
6279 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6280 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6281 return;
6282 }
6283 dn->set_alternate_name(req->get_alternate_name());
6284
6285 // set layout
6286 file_layout_t layout;
6287 if (mdr->dir_layout != file_layout_t())
6288 layout = mdr->dir_layout;
6289 else
6290 layout = mdcache->default_file_layout;
6291
6292 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6293 ceph_assert(newi);
6294
6295 dn->push_projected_linkage(newi);
6296
6297 auto _inode = newi->_get_inode();
6298 _inode->version = dn->pre_dirty();
6299 _inode->rdev = req->head.args.mknod.rdev;
6300 _inode->rstat.rfiles = 1;
6301 _inode->accounted_rstat = _inode->rstat;
6302 if (layout.pool_id != mdcache->default_file_layout.pool_id)
6303 _inode->add_old_pool(mdcache->default_file_layout.pool_id);
6304 _inode->update_backtrace();
6305
6306 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6307 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6308 ceph_assert(follows >= realm->get_newest_seq());
6309
6310 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6311 // want to write to it (e.g., if they are reexporting NFS)
6312 if (S_ISREG(_inode->mode)) {
6313 // issue a cap on the file
6314 int cmode = CEPH_FILE_MODE_RDWR;
6315 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6316 if (cap) {
6317 cap->set_wanted(0);
6318
6319 // put locks in excl mode
6320 newi->filelock.set_state(LOCK_EXCL);
6321 newi->authlock.set_state(LOCK_EXCL);
6322 newi->xattrlock.set_state(LOCK_EXCL);
6323
6324 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
6325 _inode->client_ranges[client].range.first = 0;
6326 _inode->client_ranges[client].range.last = _inode->layout.stripe_unit;
6327 _inode->client_ranges[client].follows = follows;
6328 newi->mark_clientwriteable();
6329 cap->mark_clientwriteable();
6330 }
6331 }
6332
6333 ceph_assert(dn->first == follows + 1);
6334 newi->first = dn->first;
6335
6336 dout(10) << "mknod mode " << _inode->mode << " rdev " << _inode->rdev << dendl;
6337
6338 // prepare finisher
6339 mdr->ls = mdlog->get_current_segment();
6340 EUpdate *le = new EUpdate(mdlog, "mknod");
6341 mdlog->start_entry(le);
6342 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6343 journal_allocated_inos(mdr, &le->metablob);
6344
6345 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6346 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6347 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6348
6349 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6350 mds->balancer->maybe_fragment(dn->get_dir(), false);
6351 }
6352
6353
6354
6355 // MKDIR
6356 /* This function takes responsibility for the passed mdr*/
6357 void Server::handle_client_mkdir(MDRequestRef& mdr)
6358 {
6359 const cref_t<MClientRequest> &req = mdr->client_request;
6360
6361 mdr->disable_lock_cache();
6362 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6363 if (!dn)
6364 return;
6365
6366 CDir *dir = dn->get_dir();
6367 CInode *diri = dir->get_inode();
6368
6369 // mkdir check access
6370 if (!check_access(mdr, diri, MAY_WRITE))
6371 return;
6372
6373 if (!check_fragment_space(mdr, dir))
6374 return;
6375
6376 ceph_assert(dn->get_projected_linkage()->is_null());
6377 if (req->get_alternate_name().size() > alternate_name_max) {
6378 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6379 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6380 return;
6381 }
6382 dn->set_alternate_name(req->get_alternate_name());
6383
6384 // new inode
6385 unsigned mode = req->head.args.mkdir.mode;
6386 mode &= ~S_IFMT;
6387 mode |= S_IFDIR;
6388 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6389 ceph_assert(newi);
6390
6391 // it's a directory.
6392 dn->push_projected_linkage(newi);
6393
6394 auto _inode = newi->_get_inode();
6395 _inode->version = dn->pre_dirty();
6396 _inode->rstat.rsubdirs = 1;
6397 _inode->accounted_rstat = _inode->rstat;
6398 _inode->update_backtrace();
6399
6400 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6401 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6402 ceph_assert(follows >= realm->get_newest_seq());
6403
6404 dout(12) << " follows " << follows << dendl;
6405 ceph_assert(dn->first == follows + 1);
6406 newi->first = dn->first;
6407
6408 // ...and that new dir is empty.
6409 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
6410 newdir->state_set(CDir::STATE_CREATING);
6411 newdir->mark_complete();
6412 newdir->_get_fnode()->version = newdir->pre_dirty();
6413
6414 // prepare finisher
6415 mdr->ls = mdlog->get_current_segment();
6416 EUpdate *le = new EUpdate(mdlog, "mkdir");
6417 mdlog->start_entry(le);
6418 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6419 journal_allocated_inos(mdr, &le->metablob);
6420 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6421 le->metablob.add_primary_dentry(dn, newi, true, true);
6422 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
6423
6424 // issue a cap on the directory
6425 int cmode = CEPH_FILE_MODE_RDWR;
6426 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6427 if (cap) {
6428 cap->set_wanted(0);
6429
6430 // put locks in excl mode
6431 newi->filelock.set_state(LOCK_EXCL);
6432 newi->authlock.set_state(LOCK_EXCL);
6433 newi->xattrlock.set_state(LOCK_EXCL);
6434 }
6435
6436 // make sure this inode gets into the journal
6437 le->metablob.add_opened_ino(newi->ino());
6438
6439 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6440
6441 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6442 // have overshot the split size (multiple mkdir in flight), so here is
6443 // an early chance to split the dir if this mkdir makes it oversized.
6444 mds->balancer->maybe_fragment(dir, false);
6445 }
6446
6447
6448 // SYMLINK
6449
6450 void Server::handle_client_symlink(MDRequestRef& mdr)
6451 {
6452 const auto& req = mdr->client_request;
6453
6454 mdr->disable_lock_cache();
6455 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6456 if (!dn)
6457 return;
6458
6459 CDir *dir = dn->get_dir();
6460 CInode *diri = dir->get_inode();
6461
6462 if (!check_access(mdr, diri, MAY_WRITE))
6463 return;
6464 if (!check_fragment_space(mdr, dir))
6465 return;
6466
6467 ceph_assert(dn->get_projected_linkage()->is_null());
6468 if (req->get_alternate_name().size() > alternate_name_max) {
6469 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6470 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
return;
6471 }
6472 dn->set_alternate_name(req->get_alternate_name());
6473
6474 unsigned mode = S_IFLNK | 0777;
6475 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6476 ceph_assert(newi);
6477
6478 // it's a symlink
6479 dn->push_projected_linkage(newi);
6480
6481 newi->symlink = req->get_path2();
6482 auto _inode = newi->_get_inode();
6483 _inode->version = dn->pre_dirty();
6484 _inode->size = newi->symlink.length();
6485 _inode->rstat.rbytes = _inode->size;
6486 _inode->rstat.rfiles = 1;
6487 _inode->accounted_rstat = _inode->rstat;
6488 _inode->update_backtrace();
6489
6490 newi->first = dn->first;
6491
6492 // prepare finisher
6493 mdr->ls = mdlog->get_current_segment();
6494 EUpdate *le = new EUpdate(mdlog, "symlink");
6495 mdlog->start_entry(le);
6496 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6497 journal_allocated_inos(mdr, &le->metablob);
6498 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6499 le->metablob.add_primary_dentry(dn, newi, true, true);
6500
6501 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6502 mds->balancer->maybe_fragment(dir, false);
6503 }
6504
6505
6506
6507
6508
6509 // LINK
6510
6511 void Server::handle_client_link(MDRequestRef& mdr)
6512 {
6513 const cref_t<MClientRequest> &req = mdr->client_request;
6514
6515 dout(7) << "handle_client_link " << req->get_filepath()
6516 << " to " << req->get_filepath2()
6517 << dendl;
6518
6519 mdr->disable_lock_cache();
6520
6521 CDentry *destdn;
6522 CInode *targeti;
6523
6524 if (req->get_filepath2().depth() == 0) {
6525 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
6526 if (!targeti) {
6527 dout(10) << "CEPHFS_ESTALE on path2, attempting recovery" << dendl;
6528 mdcache->find_ino_peers(req->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr));
6529 return;
6530 }
6531 mdr->pin(targeti);
6532
6533 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
6534 CDentry *pdn = targeti->get_projected_parent_dn();
6535 if (!pdn) {
6536 dout(7) << "target has no parent dn, failing..." << dendl;
6537 respond_to_request(mdr, -CEPHFS_EINVAL);
6538 return;
6539 }
6540 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
6541 return;
6542 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
6543 }
6544
6545 destdn = rdlock_path_xlock_dentry(mdr, false);
6546 if (!destdn)
6547 return;
6548 } else {
6549 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
6550 destdn = ret.first;
6551 if (!destdn)
6552 return;
6553
6554 if (!destdn->get_projected_linkage()->is_null()) {
6555 respond_to_request(mdr, -CEPHFS_EEXIST);
6556 return;
6557 }
6558
6559 targeti = ret.second->get_projected_linkage()->get_inode();
6560 }
6561
6562 ceph_assert(destdn->get_projected_linkage()->is_null());
6563 if (req->get_alternate_name().size() > alternate_name_max) {
6564 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
6565 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
6566 return;
6567 }
6568 destdn->set_alternate_name(req->get_alternate_name());
6569
6570 if (targeti->is_dir()) {
6571 dout(7) << "target is a dir, failing..." << dendl;
6572 respond_to_request(mdr, -CEPHFS_EINVAL);
6573 return;
6574 }
6575
6576 CDir *dir = destdn->get_dir();
6577 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
6578 dout(7) << "target is " << *targeti << dendl;
6579
6580 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6581 MutationImpl::LockOpVec lov;
6582 lov.add_xlock(&targeti->snaplock);
6583 lov.add_xlock(&targeti->linklock);
6584
6585 if (!mds->locker->acquire_locks(mdr, lov))
6586 return;
6587
6588 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6589 }
6590
6591 if (targeti->get_projected_inode()->nlink == 0) {
6592 dout(7) << "target has no link, failing..." << dendl;
6593 respond_to_request(mdr, -CEPHFS_ENOENT);
return;
6594 }
6595
6596 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6597 if (!check_access(mdr, targeti, MAY_WRITE))
6598 return;
6599
6600 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
6601 return;
6602
6603 if (!check_fragment_space(mdr, dir))
6604 return;
6605 }
6606
6607 CInode* target_pin = targeti->get_projected_parent_dir()->inode;
6608 SnapRealm *target_realm = target_pin->find_snaprealm();
6609 if (target_pin != dir->inode &&
6610 target_realm->get_subvolume_ino() !=
6611 dir->inode->find_snaprealm()->get_subvolume_ino()) {
6612 dout(7) << "target is in different subvolume, failing..." << dendl;
6613 respond_to_request(mdr, -CEPHFS_EXDEV);
6614 return;
6615 }
6616
6617 // go!
6618 ceph_assert(g_conf()->mds_kill_link_at != 1);
6619
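// if we are auth for the target inode, bump its nlink in a single local
// journal event; otherwise the target's auth MDS must first witness and
// journal the change via the LINKPREP/UNLINKPREP peer protocol.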
6620 // local or remote?
6621 if (targeti->is_auth())
6622 _link_local(mdr, destdn, targeti, target_realm);
6623 else
6624 _link_remote(mdr, true, destdn, targeti);
6625 mds->balancer->maybe_fragment(dir, false);
6626 }
6627
6628
6629 class C_MDS_link_local_finish : public ServerLogContext {
6630 CDentry *dn;
6631 CInode *targeti;
6632 version_t dnpv;
6633 version_t tipv;
6634 bool adjust_realm;
6635 public:
6636 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
6637 version_t dnpv_, version_t tipv_, bool ar) :
6638 ServerLogContext(s, r), dn(d), targeti(ti),
6639 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
6640 void finish(int r) override {
6641 ceph_assert(r == 0);
6642 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
6643 }
6644 };
6645
6646
6647 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti, SnapRealm *target_realm)
6648 {
6649 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
6650
6651 mdr->ls = mdlog->get_current_segment();
6652
6653 // predirty NEW dentry
6654 version_t dnpv = dn->pre_dirty();
6655 version_t tipv = targeti->pre_dirty();
6656
6657 // project inode update
6658 auto pi = targeti->project_inode(mdr);
6659 pi.inode->nlink++;
6660 pi.inode->ctime = mdr->get_op_stamp();
6661 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
6662 pi.inode->rstat.rctime = mdr->get_op_stamp();
6663 pi.inode->change_attr++;
6664 pi.inode->version = tipv;
6665
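// a hard-linked inode can have parents in different snaprealms, so
// (outside a subvolume) mark its snaprealm "global" and remember the
// parent dentry; the realm split is propagated after the journal commit.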
6666 bool adjust_realm = false;
6667 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
6668 sr_t *newsnap = targeti->project_snaprealm();
6669 targeti->mark_snaprealm_global(newsnap);
6670 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, targeti->get_projected_parent_dn(), true);
6671 adjust_realm = true;
6672 }
6673
6674 // log + wait
6675 EUpdate *le = new EUpdate(mdlog, "link_local");
6676 mdlog->start_entry(le);
6677 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6678 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
6679 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
6680 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6681 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
6682
6683 // do this after predirty_*, to avoid funky extra dnl arg
6684 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6685
6686 journal_and_reply(mdr, targeti, dn, le,
6687 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
6688 }
6689
6690 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
6691 version_t dnpv, version_t tipv, bool adjust_realm)
6692 {
6693 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
6694
6695 // link and unlock the NEW dentry
6696 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6697 if (!dnl->get_inode())
6698 dn->link_remote(dnl, targeti);
6699 dn->mark_dirty(dnpv, mdr->ls);
6700
6701 // target inode
6702 mdr->apply();
6703
6704 MDRequestRef null_ref;
6705 mdcache->send_dentry_link(dn, null_ref);
6706
6707 if (adjust_realm) {
6708 int op = CEPH_SNAP_OP_SPLIT;
6709 mds->mdcache->send_snap_update(targeti, 0, op);
6710 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6711 }
6712
6713 // bump target popularity
6714 mds->balancer->hit_inode(targeti, META_POP_IWR);
6715 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6716
6717 // reply
6718 respond_to_request(mdr, 0);
6719 }
6720
6721
6722 // link / unlink remote
6723
6724 class C_MDS_link_remote_finish : public ServerLogContext {
6725 bool inc;
6726 CDentry *dn;
6727 CInode *targeti;
6728 version_t dpv;
6729 public:
6730 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
6731 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
6732 dpv(d->get_projected_version()) {}
6733 void finish(int r) override {
6734 ceph_assert(r == 0);
6735 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
6736 }
6737 };
6738
6739 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
6740 {
6741 dout(10) << "_link_remote "
6742 << (inc ? "link ":"unlink ")
6743 << *dn << " to " << *targeti << dendl;
6744
6745 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6746 mds_rank_t linkauth = targeti->authority().first;
6747 if (mdr->more()->witnessed.count(linkauth) == 0) {
6748 if (mds->is_cluster_degraded() &&
6749 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
6750 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
6751 if (mdr->more()->waiting_on_peer.empty())
6752 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
6753 return;
6754 }
6755
6756 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
6757 int op;
6758 if (inc)
6759 op = MMDSPeerRequest::OP_LINKPREP;
6760 else
6761 op = MMDSPeerRequest::OP_UNLINKPREP;
6762 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, op);
6763 targeti->set_object_info(req->get_object_info());
6764 req->op_stamp = mdr->get_op_stamp();
6765 if (auto& desti_srnode = mdr->more()->desti_srnode)
6766 encode(*desti_srnode, req->desti_snapbl);
6767 mds->send_message_mds(req, linkauth);
6768
6769 ceph_assert(mdr->more()->waiting_on_peer.count(linkauth) == 0);
6770 mdr->more()->waiting_on_peer.insert(linkauth);
6771 return;
6772 }
6773 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
6774
6775 ceph_assert(g_conf()->mds_kill_link_at != 2);
6776
6777 if (auto& desti_srnode = mdr->more()->desti_srnode) {
6778 delete desti_srnode;
6779 desti_srnode = NULL;
6780 }
6781
6782 mdr->set_mds_stamp(ceph_clock_now());
6783
6784 // add to event
6785 mdr->ls = mdlog->get_current_segment();
6786 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
6787 mdlog->start_entry(le);
6788 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6789 if (!mdr->more()->witnessed.empty()) {
6790 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
6791 le->reqid = mdr->reqid;
6792 le->had_peers = true;
6793 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6794 }
6795
6796 if (inc) {
6797 dn->pre_dirty();
6798 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
6799 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6800 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6801 } else {
6802 dn->pre_dirty();
6803 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
6804 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6805 le->metablob.add_null_dentry(dn, true);
6806 dn->push_projected_linkage();
6807 }
6808
6809 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
6810 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
6811 }
6812
6813 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
6814 CDentry *dn, CInode *targeti,
6815 version_t dpv)
6816 {
6817 dout(10) << "_link_remote_finish "
6818 << (inc ? "link ":"unlink ")
6819 << *dn << " to " << *targeti << dendl;
6820
6821 ceph_assert(g_conf()->mds_kill_link_at != 3);
6822
6823 if (!mdr->more()->witnessed.empty())
6824 mdcache->logged_leader_update(mdr->reqid);
6825
6826 if (inc) {
6827 // link the new dentry
6828 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6829 if (!dnl->get_inode())
6830 dn->link_remote(dnl, targeti);
6831 dn->mark_dirty(dpv, mdr->ls);
6832 } else {
6833 // unlink main dentry
6834 dn->get_dir()->unlink_inode(dn);
6835 dn->pop_projected_linkage();
6836 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
6837 }
6838
6839 mdr->apply();
6840
6841 MDRequestRef null_ref;
6842 if (inc)
6843 mdcache->send_dentry_link(dn, null_ref);
6844 else
6845 mdcache->send_dentry_unlink(dn, NULL, null_ref);
6846
6847 // bump target popularity
6848 mds->balancer->hit_inode(targeti, META_POP_IWR);
6849 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6850
6851 // reply
6852 respond_to_request(mdr, 0);
6853
6854 if (!inc)
6855 // removing a new dn?
6856 dn->get_dir()->try_remove_unlinked_dn(dn);
6857 }
6858
6859
6860 // remote linking/unlinking
6861
6862 class C_MDS_PeerLinkPrep : public ServerLogContext {
6863 CInode *targeti;
6864 bool adjust_realm;
6865 public:
6866 C_MDS_PeerLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
6867 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
6868 void finish(int r) override {
6869 ceph_assert(r == 0);
6870 server->_logged_peer_link(mdr, targeti, adjust_realm);
6871 }
6872 };
6873
6874 class C_MDS_PeerLinkCommit : public ServerContext {
6875 MDRequestRef mdr;
6876 CInode *targeti;
6877 public:
6878 C_MDS_PeerLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
6879 ServerContext(s), mdr(r), targeti(t) { }
6880 void finish(int r) override {
6881 server->_commit_peer_link(mdr, r, targeti);
6882 }
6883 };
6884
6885 void Server::handle_peer_link_prep(MDRequestRef& mdr)
6886 {
6887 dout(10) << "handle_peer_link_prep " << *mdr
6888 << " on " << mdr->peer_request->get_object_info()
6889 << dendl;
6890
6891 ceph_assert(g_conf()->mds_kill_link_at != 4);
6892
6893 CInode *targeti = mdcache->get_inode(mdr->peer_request->get_object_info().ino);
6894 ceph_assert(targeti);
6895 dout(10) << "targeti " << *targeti << dendl;
6896 CDentry *dn = targeti->get_parent_dn();
6897 CDentry::linkage_t *dnl = dn->get_linkage();
6898 ceph_assert(dnl->is_primary());
6899
6900 mdr->set_op_stamp(mdr->peer_request->op_stamp);
6901
6902 mdr->auth_pin(targeti);
6903
6904 //ceph_abort(); // test hack: make sure leader can handle a peer that fails to prepare...
6905 ceph_assert(g_conf()->mds_kill_link_at != 5);
6906
6907 // journal it
6908 mdr->ls = mdlog->get_current_segment();
6909 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_prep", mdr->reqid, mdr->peer_to_mds,
6910 EPeerUpdate::OP_PREPARE, EPeerUpdate::LINK);
6911 mdlog->start_entry(le);
6912
6913 auto pi = dnl->get_inode()->project_inode(mdr);
6914
6915 // update journaled target inode
6916 bool inc;
6917 bool adjust_realm = false;
6918 bool realm_projected = false;
6919 if (mdr->peer_request->get_op() == MMDSPeerRequest::OP_LINKPREP) {
6920 inc = true;
6921 pi.inode->nlink++;
6922
6923 CDentry *target_pdn = targeti->get_projected_parent_dn();
6924 SnapRealm *target_realm = target_pdn->get_dir()->inode->find_snaprealm();
6925 if (!target_realm->get_subvolume_ino() && !targeti->is_projected_snaprealm_global()) {
6926 sr_t *newsnap = targeti->project_snaprealm();
6927 targeti->mark_snaprealm_global(newsnap);
6928 targeti->record_snaprealm_parent_dentry(newsnap, target_realm, target_pdn, true);
6929 adjust_realm = true;
6930 realm_projected = true;
6931 }
6932 } else {
6933 inc = false;
6934 pi.inode->nlink--;
6935 if (targeti->is_projected_snaprealm_global()) {
6936 ceph_assert(mdr->peer_request->desti_snapbl.length());
6937 auto p = mdr->peer_request->desti_snapbl.cbegin();
6938
6939 sr_t *newsnap = targeti->project_snaprealm();
6940 decode(*newsnap, p);
6941
6942 if (pi.inode->nlink == 0)
6943 ceph_assert(!newsnap->is_parent_global());
6944
6945 realm_projected = true;
6946 } else {
6947 ceph_assert(mdr->peer_request->desti_snapbl.length() == 0);
6948 }
6949 }
6950
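// record enough pre-op state (ctime, parent dir mtime/rctime, optional
// snaprealm blob) to undo this prepare if the leader later aborts.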
6951 link_rollback rollback;
6952 rollback.reqid = mdr->reqid;
6953 rollback.ino = targeti->ino();
6954 rollback.old_ctime = targeti->get_inode()->ctime; // we hold versionlock xlock; no concurrent projections
6955 const auto& pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
6956 rollback.old_dir_mtime = pf->fragstat.mtime;
6957 rollback.old_dir_rctime = pf->rstat.rctime;
6958 rollback.was_inc = inc;
6959 if (realm_projected) {
6960 if (targeti->snaprealm) {
6961 encode(true, rollback.snapbl);
6962 targeti->encode_snap_blob(rollback.snapbl);
6963 } else {
6964 encode(false, rollback.snapbl);
6965 }
6966 }
6967 encode(rollback, le->rollback);
6968 mdr->more()->rollback_bl = le->rollback;
6969
6970 pi.inode->ctime = mdr->get_op_stamp();
6971 pi.inode->version = targeti->pre_dirty();
6972
6973 dout(10) << " projected inode " << pi.inode->ino << " v " << pi.inode->version << dendl;
6974
6975 // commit case
6976 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
6977 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
6978 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
6979
6980 // set up commit waiter
6981 mdr->more()->peer_commit = new C_MDS_PeerLinkCommit(this, mdr, targeti);
6982
6983 mdr->more()->peer_update_journaled = true;
6984 submit_mdlog_entry(le, new C_MDS_PeerLinkPrep(this, mdr, targeti, adjust_realm),
6985 mdr, __func__);
6986 mdlog->flush();
6987 }
6988
6989 void Server::_logged_peer_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
6990 {
6991 dout(10) << "_logged_peer_link " << *mdr
6992 << " " << *targeti << dendl;
6993
6994 ceph_assert(g_conf()->mds_kill_link_at != 6);
6995
6996 // update the target
6997 mdr->apply();
6998
6999 // hit pop
7000 mds->balancer->hit_inode(targeti, META_POP_IWR);
7001
7002 // done.
7003 mdr->reset_peer_request();
7004
7005 if (adjust_realm) {
7006 int op = CEPH_SNAP_OP_SPLIT;
7007 mds->mdcache->send_snap_update(targeti, 0, op);
7008 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
7009 }
7010
7011 // ack
7012 if (!mdr->aborted) {
7013 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_LINKPREPACK);
7014 mds->send_message_mds(reply, mdr->peer_to_mds);
7015 } else {
7016 dout(10) << " abort flag set, finishing" << dendl;
7017 mdcache->request_finish(mdr);
7018 }
7019 }
7020
7021
7022 struct C_MDS_CommittedPeer : public ServerLogContext {
7023 C_MDS_CommittedPeer(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
7024 void finish(int r) override {
7025 server->_committed_peer(mdr);
7026 }
7027 };
7028
7029 void Server::_commit_peer_link(MDRequestRef& mdr, int r, CInode *targeti)
7030 {
7031 dout(10) << "_commit_peer_link " << *mdr
7032 << " r=" << r
7033 << " " << *targeti << dendl;
7034
7035 ceph_assert(g_conf()->mds_kill_link_at != 7);
7036
7037 if (r == 0) {
7038 // drop our pins, etc.
7039 mdr->cleanup();
7040
7041 // write a commit to the journal
7042 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_commit", mdr->reqid, mdr->peer_to_mds,
7043 EPeerUpdate::OP_COMMIT, EPeerUpdate::LINK);
7044 mdlog->start_entry(le);
7045 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7046 mdlog->flush();
7047 } else {
7048 do_link_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7049 }
7050 }
7051
7052 void Server::_committed_peer(MDRequestRef& mdr)
7053 {
7054 dout(10) << "_committed_peer " << *mdr << dendl;
7055
7056 ceph_assert(g_conf()->mds_kill_link_at != 8);
7057
7058 bool assert_exist = mdr->more()->peer_update_journaled;
7059 mdcache->finish_uncommitted_peer(mdr->reqid, assert_exist);
7060 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_COMMITTED);
7061 mds->send_message_mds(req, mdr->peer_to_mds);
7062 mdcache->request_finish(mdr);
7063 }
7064
7065 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
7066 MutationRef mut;
7067 map<client_t,ref_t<MClientSnap>> splits;
7068 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
7069 map<client_t,ref_t<MClientSnap>>&& _splits) :
7070 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
7071 }
7072 void finish(int r) override {
7073 server->_link_rollback_finish(mut, mdr, splits);
7074 }
7075 };
7076
7077 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7078 {
7079 link_rollback rollback;
7080 auto p = rbl.cbegin();
7081 decode(rollback, p);
7082
7083 dout(10) << "do_link_rollback on " << rollback.reqid
7084 << (rollback.was_inc ? " inc":" dec")
7085 << " ino " << rollback.ino
7086 << dendl;
7087
7088 ceph_assert(g_conf()->mds_kill_link_at != 9);
7089
7090 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
7091 ceph_assert(mdr || mds->is_resolve());
7092
7093 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
7094 mut->ls = mds->mdlog->get_current_segment();
7095
7096 CInode *in = mdcache->get_inode(rollback.ino);
7097 ceph_assert(in);
7098 dout(10) << " target is " << *in << dendl;
7099 ceph_assert(!in->is_projected()); // live peer request holds versionlock xlock.
7100
7101 auto pi = in->project_inode(mut);
7102 pi.inode->version = in->pre_dirty();
7103
7104 // parent dir rctime
7105 CDir *parent = in->get_projected_parent_dn()->get_dir();
7106 auto pf = parent->project_fnode(mut);
7107 pf->version = parent->pre_dirty();
7108 if (pf->fragstat.mtime == pi.inode->ctime) {
7109 pf->fragstat.mtime = rollback.old_dir_mtime;
7110 if (pf->rstat.rctime == pi.inode->ctime)
7111 pf->rstat.rctime = rollback.old_dir_rctime;
7112 mut->add_updated_lock(&parent->get_inode()->filelock);
7113 mut->add_updated_lock(&parent->get_inode()->nestlock);
7114 }
7115
7116 // inode
7117 pi.inode->ctime = rollback.old_ctime;
7118 if (rollback.was_inc)
7119 pi.inode->nlink--;
7120 else
7121 pi.inode->nlink++;
7122
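// restore the pre-link snaprealm state: either re-project the saved
// srnode, or drop the realm created by the link and merge back into the
// parent's realm (notifying clients via 'splits').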
7123 map<client_t,ref_t<MClientSnap>> splits;
7124 if (rollback.snapbl.length() && in->snaprealm) {
7125 bool hadrealm;
7126 auto p = rollback.snapbl.cbegin();
7127 decode(hadrealm, p);
7128 if (hadrealm) {
7129 if (!mds->is_resolve()) {
7130 sr_t *new_srnode = new sr_t();
7131 decode(*new_srnode, p);
7132 in->project_snaprealm(new_srnode);
7133 } else {
7134 decode(in->snaprealm->srnode, p);
7135 }
7136 } else {
7137 SnapRealm *realm = parent->get_inode()->find_snaprealm();
7138 if (!mds->is_resolve())
7139 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
7140 in->project_snaprealm(NULL);
7141 }
7142 }
7143
7144 // journal it
7145 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_link_rollback", rollback.reqid, leader,
7146 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::LINK);
7147 mdlog->start_entry(le);
7148 le->commit.add_dir_context(parent);
7149 le->commit.add_dir(parent, true);
7150 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
7151
7152 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
7153 mdr, __func__);
7154 mdlog->flush();
7155 }
7156
7157 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
7158 map<client_t,ref_t<MClientSnap>>& splits)
7159 {
7160 dout(10) << "_link_rollback_finish" << dendl;
7161
7162 ceph_assert(g_conf()->mds_kill_link_at != 10);
7163
7164 mut->apply();
7165
7166 if (!mds->is_resolve())
7167 mdcache->send_snaps(splits);
7168
7169 if (mdr)
7170 mdcache->request_finish(mdr);
7171
7172 mdcache->finish_rollback(mut->reqid, mdr);
7173
7174 mut->cleanup();
7175 }
7176
7177
7178 void Server::handle_peer_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &m)
7179 {
7180 dout(10) << "handle_peer_link_prep_ack " << *mdr
7181 << " " << *m << dendl;
7182 mds_rank_t from = mds_rank_t(m->get_source().num());
7183
7184 ceph_assert(g_conf()->mds_kill_link_at != 11);
7185
7186 // note peer
7187 mdr->more()->peers.insert(from);
7188
7189 // witnessed!
7190 ceph_assert(mdr->more()->witnessed.count(from) == 0);
7191 mdr->more()->witnessed.insert(from);
7192 ceph_assert(!m->is_not_journaled());
7193 mdr->more()->has_journaled_peers = true;
7194
7195 // remove from waiting list
7196 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7197 mdr->more()->waiting_on_peer.erase(from);
7198
7199 ceph_assert(mdr->more()->waiting_on_peer.empty());
7200
7201 dispatch_client_request(mdr); // go again!
7202 }
7203
7204
7205
7206
7207
7208 // UNLINK
7209
7210 void Server::handle_client_unlink(MDRequestRef& mdr)
7211 {
7212 const cref_t<MClientRequest> &req = mdr->client_request;
7213 client_t client = mdr->get_client();
7214
7215 // rmdir or unlink?
7216 bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
7217
7218 if (rmdir)
7219 mdr->disable_lock_cache();
7220 CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
7221 if (!dn)
7222 return;
7223
7224 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
7225 ceph_assert(!dnl->is_null());
7226 CInode *in = dnl->get_inode();
7227
7228 if (rmdir) {
7229 dout(7) << "handle_client_rmdir on " << *dn << dendl;
7230 } else {
7231 dout(7) << "handle_client_unlink on " << *dn << dendl;
7232 }
7233 dout(7) << "dn links to " << *in << dendl;
7234
7235 // rmdir vs is_dir
7236 if (in->is_dir()) {
7237 if (rmdir) {
7238 // do empty directory checks
7239 if (_dir_is_nonempty_unlocked(mdr, in)) {
7240 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7241 return;
7242 }
7243 } else {
7244 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
7245 respond_to_request(mdr, -CEPHFS_EISDIR);
7246 return;
7247 }
7248 } else {
7249 if (rmdir) {
7250 // unlink
7251 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
7252 respond_to_request(mdr, -CEPHFS_ENOTDIR);
7253 return;
7254 }
7255 }
7256
7257 CInode *diri = dn->get_dir()->get_inode();
7258 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7259 if (!check_access(mdr, diri, MAY_WRITE))
7260 return;
7261 }
7262
7263 // -- create stray dentry? --
7264 CDentry *straydn = NULL;
7265 if (dnl->is_primary()) {
7266 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
7267 if (!straydn)
7268 return;
7269 dout(10) << " straydn is " << *straydn << dendl;
7270 } else if (mdr->straydn) {
7271 mdr->unpin(mdr->straydn);
7272 mdr->straydn = NULL;
7273 }
7274
7275 // lock
7276 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7277 MutationImpl::LockOpVec lov;
7278
7279 lov.add_xlock(&in->linklock);
7280 lov.add_xlock(&in->snaplock);
7281 if (in->is_dir())
7282 lov.add_rdlock(&in->filelock); // to verify it's empty
7283
7284 if (straydn) {
7285 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7286 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7287 lov.add_xlock(&straydn->lock);
7288 }
7289
7290 if (!mds->locker->acquire_locks(mdr, lov))
7291 return;
7292
7293 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7294 }
7295
7296 if (in->is_dir() &&
7297 _dir_is_nonempty(mdr, in)) {
7298 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
7299 return;
7300 }
7301
7302 if (straydn)
7303 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7304
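// prepare the projected snaprealm for the unlinked inode: detach it from
// the global snaprealm if this drops its last (remote) linkage, or, for a
// primary link, record the stray dir's realm as a past parent so
// snapshotted data remains reachable.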
7305 if (!mdr->more()->desti_srnode) {
7306 if (in->is_projected_snaprealm_global()) {
7307 sr_t *new_srnode = in->prepare_new_srnode(0);
7308 in->record_snaprealm_parent_dentry(new_srnode, nullptr, dn, dnl->is_primary());
7309 // dropping the last linkage or dropping the last remote linkage,
7310 // detach the inode from the global snaprealm
7311 auto nlink = in->get_projected_inode()->nlink;
7312 if (nlink == 1 ||
7313 (nlink == 2 && !dnl->is_primary() &&
7314 !in->get_projected_parent_dir()->inode->is_stray()))
7315 in->clear_snaprealm_global(new_srnode);
7316 mdr->more()->desti_srnode = new_srnode;
7317 } else if (dnl->is_primary()) {
7318 // prepare snaprealm blob for peer request
7319 SnapRealm *realm = in->find_snaprealm();
7320 snapid_t follows = realm->get_newest_seq();
7321 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
7322 sr_t *new_srnode = in->prepare_new_srnode(follows);
7323 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7324 mdr->more()->desti_srnode = new_srnode;
7325 }
7326 }
7327 }
7328
7329 // yay!
7330 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
7331 // subtree root auths need to be witnesses
7332 set<mds_rank_t> witnesses;
7333 in->list_replicas(witnesses);
7334 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7335
7336 for (set<mds_rank_t>::iterator p = witnesses.begin();
7337 p != witnesses.end();
7338 ++p) {
7339 if (mdr->more()->witnessed.count(*p)) {
7340 dout(10) << " already witnessed by mds." << *p << dendl;
7341 } else if (mdr->more()->waiting_on_peer.count(*p)) {
7342 dout(10) << " already waiting on witness mds." << *p << dendl;
7343 } else {
7344 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
7345 return;
7346 }
7347 }
7348 if (!mdr->more()->waiting_on_peer.empty())
7349 return; // we're waiting for a witness.
7350 }
7351
7352 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
7353 mds->locker->create_lock_cache(mdr, diri);
7354
7355 // ok!
7356 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
7357 _link_remote(mdr, false, dn, dnl->get_inode());
7358 else
7359 _unlink_local(mdr, dn, straydn);
7360 }
7361
7362 class C_MDS_unlink_local_finish : public ServerLogContext {
7363 CDentry *dn;
7364 CDentry *straydn;
7365 version_t dnpv; // deleted dentry
7366 public:
7367 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
7368 ServerLogContext(s, r), dn(d), straydn(sd),
7369 dnpv(d->get_projected_version()) {}
7370 void finish(int r) override {
7371 ceph_assert(r == 0);
7372 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
7373 }
7374 };
7375
7376 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7377 {
7378 dout(10) << "_unlink_local " << *dn << dendl;
7379
7380 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7381 CInode *in = dnl->get_inode();
7382
7383
7384 // ok, let's do it.
7385 mdr->ls = mdlog->get_current_segment();
7386
7387 // prepare log entry
7388 EUpdate *le = new EUpdate(mdlog, "unlink_local");
7389 mdlog->start_entry(le);
7390 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7391 if (!mdr->more()->witnessed.empty()) {
7392 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
7393 le->reqid = mdr->reqid;
7394 le->had_peers = true;
7395 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7396 }
7397
7398 if (straydn) {
7399 ceph_assert(dnl->is_primary());
7400 straydn->push_projected_linkage(in);
7401 }
7402
7403 // the unlinked dentry
7404 dn->pre_dirty();
7405
7406 auto pi = in->project_inode(mdr);
7407 {
7408 std::string t;
7409 dn->make_path_string(t, true);
7410 pi.inode->stray_prior_path = std::move(t);
7411 }
7412 pi.inode->version = in->pre_dirty();
7413 pi.inode->ctime = mdr->get_op_stamp();
7414 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
7415 pi.inode->rstat.rctime = mdr->get_op_stamp();
7416 pi.inode->change_attr++;
7417 pi.inode->nlink--;
7418 if (pi.inode->nlink == 0)
7419 in->state_set(CInode::STATE_ORPHAN);
7420
7421 if (mdr->more()->desti_srnode) {
7422 auto& desti_srnode = mdr->more()->desti_srnode;
7423 in->project_snaprealm(desti_srnode);
7424 desti_srnode = NULL;
7425 }
7426
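// a primary link moves the inode onto the stray dentry (where it can be
// purged or reintegrated later); a remote link only needs the remote
// inode's metadata updated.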
7427 if (straydn) {
7428 // will manually pop projected inode
7429
7430 // primary link. add stray dentry.
7431 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
7432 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7433
7434 pi.inode->update_backtrace();
7435 le->metablob.add_primary_dentry(straydn, in, true, true);
7436 } else {
7437 // remote link. update remote inode.
7438 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
7439 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
7440 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
7441 }
7442
7443 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7444 le->metablob.add_null_dentry(dn, true);
7445
7446 if (in->is_dir()) {
7447 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7448 le->metablob.renamed_dirino = in->ino();
7449 }
7450
7451 dn->push_projected_linkage();
7452
7453 if (straydn) {
7454 ceph_assert(in->first <= straydn->first);
7455 in->first = straydn->first;
7456 }
7457
7458 if (in->is_dir()) {
7459 ceph_assert(straydn);
7460 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7461 }
7462
7463 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
7464 }
7465
7466 void Server::_unlink_local_finish(MDRequestRef& mdr,
7467 CDentry *dn, CDentry *straydn,
7468 version_t dnpv)
7469 {
7470 dout(10) << "_unlink_local_finish " << *dn << dendl;
7471
7472 if (!mdr->more()->witnessed.empty())
7473 mdcache->logged_leader_update(mdr->reqid);
7474
7475 CInode *strayin = NULL;
7476 bool hadrealm = false;
7477 if (straydn) {
7478 // if there is newly created snaprealm, need to split old snaprealm's
7479 // inodes_with_caps. So pop snaprealm before linkage changes.
7480 strayin = dn->get_linkage()->get_inode();
7481 hadrealm = strayin->snaprealm ? true : false;
7482 strayin->early_pop_projected_snaprealm();
7483 }
7484
7485 // unlink main dentry
7486 dn->get_dir()->unlink_inode(dn);
7487 dn->pop_projected_linkage();
7488 dn->mark_dirty(dnpv, mdr->ls);
7489
7490 // relink as stray? (i.e. was primary link?)
7491 if (straydn) {
7492 dout(20) << " straydn is " << *straydn << dendl;
7493 straydn->pop_projected_linkage();
7494 mdcache->touch_dentry_bottom(straydn);
7495 }
7496
7497 mdr->apply();
7498
7499 mdcache->send_dentry_unlink(dn, straydn, mdr);
7500
7501 if (straydn) {
7502 // update subtree map?
7503 if (strayin->is_dir())
7504 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
7505
7506 if (strayin->snaprealm && !hadrealm)
7507 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
7508 }
7509
7510 // bump pop
7511 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7512
7513 // reply
7514 respond_to_request(mdr, 0);
7515
7516 // removing a new dn?
7517 dn->get_dir()->try_remove_unlinked_dn(dn);
7518
7519 // clean up ?
7520 // respond_to_request() drops locks. So stray reintegration can race with us.
7521 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7522 // Tip off the MDCache that this dentry is a stray that
7523 // might be eligible for purge.
7524 mdcache->notify_stray(straydn);
7525 }
7526 }
7527
7528 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
7529 {
7530 if (mds->is_cluster_degraded() &&
7531 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7532 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
7533 if (mdr->more()->waiting_on_peer.empty())
7534 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7535 return false;
7536 }
7537
7538 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
7539 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREP);
7540 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
7541 for (auto dn : trace)
7542 req->srcdnpath.push_dentry(dn->get_name());
7543 mdcache->encode_replica_stray(straydn, who, req->straybl);
7544 if (mdr->more()->desti_srnode)
7545 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
7546
7547 req->op_stamp = mdr->get_op_stamp();
7548 mds->send_message_mds(req, who);
7549
7550 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
7551 mdr->more()->waiting_on_peer.insert(who);
7552 return true;
7553 }
7554
7555 struct C_MDS_PeerRmdirPrep : public ServerLogContext {
7556 CDentry *dn, *straydn;
7557 C_MDS_PeerRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
7558 : ServerLogContext(s, r), dn(d), straydn(st) {}
7559 void finish(int r) override {
7560 server->_logged_peer_rmdir(mdr, dn, straydn);
7561 }
7562 };
7563
7564 struct C_MDS_PeerRmdirCommit : public ServerContext {
7565 MDRequestRef mdr;
7566 CDentry *straydn;
7567 C_MDS_PeerRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
7568 : ServerContext(s), mdr(r), straydn(sd) { }
7569 void finish(int r) override {
7570 server->_commit_peer_rmdir(mdr, r, straydn);
7571 }
7572 };
7573
7574 void Server::handle_peer_rmdir_prep(MDRequestRef& mdr)
7575 {
7576 dout(10) << "handle_peer_rmdir_prep " << *mdr
7577 << " " << mdr->peer_request->srcdnpath
7578 << " to " << mdr->peer_request->destdnpath
7579 << dendl;
7580
7581 vector<CDentry*> trace;
7582 filepath srcpath(mdr->peer_request->srcdnpath);
7583 dout(10) << " src " << srcpath << dendl;
7584 CInode *in;
7585 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
7586 int r = mdcache->path_traverse(mdr, cf, srcpath,
7587 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
7588 &trace, &in);
7589 if (r > 0) return;
7590 if (r == -CEPHFS_ESTALE) {
7591 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7592 mdr->peer_to_mds, true);
7593 return;
7594 }
7595 ceph_assert(r == 0);
7596 CDentry *dn = trace.back();
7597 dout(10) << " dn " << *dn << dendl;
7598 mdr->pin(dn);
7599
7600 ceph_assert(mdr->straydn);
7601 CDentry *straydn = mdr->straydn;
7602 dout(10) << " straydn " << *straydn << dendl;
7603
7604 mdr->set_op_stamp(mdr->peer_request->op_stamp);
7605
7606 rmdir_rollback rollback;
7607 rollback.reqid = mdr->reqid;
7608 rollback.src_dir = dn->get_dir()->dirfrag();
7609 rollback.src_dname = dn->get_name();
7610 rollback.dest_dir = straydn->get_dir()->dirfrag();
7611 rollback.dest_dname = straydn->get_name();
7612 if (mdr->peer_request->desti_snapbl.length()) {
7613 if (in->snaprealm) {
7614 encode(true, rollback.snapbl);
7615 in->encode_snap_blob(rollback.snapbl);
7616 } else {
7617 encode(false, rollback.snapbl);
7618 }
7619 }
7620 encode(rollback, mdr->more()->rollback_bl);
7621 // FIXME: rollback snaprealm
7622 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7623
7624 // set up commit waiter
7625 mdr->more()->peer_commit = new C_MDS_PeerRmdirCommit(this, mdr, straydn);
7626
7627 straydn->push_projected_linkage(in);
7628 dn->push_projected_linkage();
7629
7630 ceph_assert(straydn->first >= in->first);
7631 in->first = straydn->first;
7632
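// if none of this dir's subtrees are auth here, there is nothing of ours
// to journal; ack the prepare right away.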
7633 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
7634 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
7635 _logged_peer_rmdir(mdr, dn, straydn);
7636 return;
7637 }
7638
7639 mdr->ls = mdlog->get_current_segment();
7640 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir", mdr->reqid, mdr->peer_to_mds,
7641 EPeerUpdate::OP_PREPARE, EPeerUpdate::RMDIR);
7642 mdlog->start_entry(le);
7643 le->rollback = mdr->more()->rollback_bl;
7644
7645 le->commit.add_dir_context(straydn->get_dir());
7646 le->commit.add_primary_dentry(straydn, in, true);
7647 // peer: no need to journal original dentry
7648
7649 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7650 le->commit.renamed_dirino = in->ino();
7651
7652 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7653 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
7654
7655 mdr->more()->peer_update_journaled = true;
7656 submit_mdlog_entry(le, new C_MDS_PeerRmdirPrep(this, mdr, dn, straydn),
7657 mdr, __func__);
7658 mdlog->flush();
7659 }
7660
7661 void Server::_logged_peer_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7662 {
7663 dout(10) << "_logged_peer_rmdir " << *mdr << " on " << *dn << dendl;
7664 CInode *in = dn->get_linkage()->get_inode();
7665
7666 bool new_realm;
7667 if (mdr->peer_request->desti_snapbl.length()) {
7668 new_realm = !in->snaprealm;
7669 in->decode_snap_blob(mdr->peer_request->desti_snapbl);
7670 ceph_assert(in->snaprealm);
7671 } else {
7672 new_realm = false;
7673 }
7674
7675 // update our cache now, so we are consistent with what is in the journal
7676 // when we journal a subtree map
7677 dn->get_dir()->unlink_inode(dn);
7678 straydn->pop_projected_linkage();
7679 dn->pop_projected_linkage();
7680
7681 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->peer_update_journaled);
7682
7683 if (new_realm)
7684 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
7685
7686 // done.
7687 mdr->reset_peer_request();
7688 mdr->straydn = 0;
7689
7690 if (!mdr->aborted) {
7691 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RMDIRPREPACK);
7692 if (!mdr->more()->peer_update_journaled)
7693 reply->mark_not_journaled();
7694 mds->send_message_mds(reply, mdr->peer_to_mds);
7695 } else {
7696 dout(10) << " abort flag set, finishing" << dendl;
7697 mdcache->request_finish(mdr);
7698 }
7699 }
7700
7701 void Server::handle_peer_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
7702 {
7703 dout(10) << "handle_peer_rmdir_prep_ack " << *mdr
7704 << " " << *ack << dendl;
7705
7706 mds_rank_t from = mds_rank_t(ack->get_source().num());
7707
7708 mdr->more()->peers.insert(from);
7709 mdr->more()->witnessed.insert(from);
7710 if (!ack->is_not_journaled())
7711 mdr->more()->has_journaled_peers = true;
7712
7713 // remove from waiting list
7714 ceph_assert(mdr->more()->waiting_on_peer.count(from));
7715 mdr->more()->waiting_on_peer.erase(from);
7716
7717 if (mdr->more()->waiting_on_peer.empty())
7718 dispatch_client_request(mdr); // go again!
7719 else
7720 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
7721 }
7722
7723 void Server::_commit_peer_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
7724 {
7725 dout(10) << "_commit_peer_rmdir " << *mdr << " r=" << r << dendl;
7726
7727 if (r == 0) {
7728 if (mdr->more()->peer_update_journaled) {
7729 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7730 if (strayin && !strayin->snaprealm)
7731 mdcache->clear_dirty_bits_for_stray(strayin);
7732 }
7733
7734 mdr->cleanup();
7735
7736 if (mdr->more()->peer_update_journaled) {
7737 // write a commit to the journal
7738 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_commit", mdr->reqid,
7739 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
7740 EPeerUpdate::RMDIR);
7741 mdlog->start_entry(le);
7742 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
7743 mdlog->flush();
7744 } else {
7745 _committed_peer(mdr);
7746 }
7747 } else {
7748 // abort
7749 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr);
7750 }
7751 }
7752
7753 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
7754 metareqid_t reqid;
7755 CDentry *dn;
7756 CDentry *straydn;
7757 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
7758 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
7759 void finish(int r) override {
7760 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
7761 }
7762 };
7763
7764 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr)
7765 {
7766 // unlike the other rollback methods, the rmdir rollback is only
7767 // needed to record the subtree changes in the journal for inode
7768 // replicas who are auth for empty dirfrags. no actual changes to
7769 // the file system are taking place here, so there is no Mutation.
7770
7771 rmdir_rollback rollback;
7772 auto p = rbl.cbegin();
7773 decode(rollback, p);
7774
7775 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
7776 mdcache->add_rollback(rollback.reqid, leader); // need to finish this update before resolve finishes
7777 ceph_assert(mdr || mds->is_resolve());
7778
7779 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
7780 if (!dir)
7781 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
7782 ceph_assert(dir);
7783 CDentry *dn = dir->lookup(rollback.src_dname);
7784 ceph_assert(dn);
7785 dout(10) << " dn " << *dn << dendl;
7786 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
7787 ceph_assert(straydir);
7788 CDentry *straydn = straydir->lookup(rollback.dest_dname);
7789 ceph_assert(straydn);
7790 dout(10) << " straydn " << *straydn << dendl;
7791 CInode *in = straydn->get_linkage()->get_inode();
7792
7793 dn->push_projected_linkage(in);
7794 straydn->push_projected_linkage();
7795
7796 if (rollback.snapbl.length() && in->snaprealm) {
7797 bool hadrealm;
7798 auto p = rollback.snapbl.cbegin();
7799 decode(hadrealm, p);
7800 if (hadrealm) {
7801 decode(in->snaprealm->srnode, p);
7802 } else {
7803 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
7804 }
7805 }
7806
7807 if (mdr && !mdr->more()->peer_update_journaled) {
7808 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
7809
7810 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
7811 return;
7812 }
7813
7814
7815 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rmdir_rollback", rollback.reqid, leader,
7816 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RMDIR);
7817 mdlog->start_entry(le);
7818
7819 le->commit.add_dir_context(dn->get_dir());
7820 le->commit.add_primary_dentry(dn, in, true);
7821 // peer: no need to journal straydn
7822
7823 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7824 le->commit.renamed_dirino = in->ino();
7825
7826 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
7827
7828 submit_mdlog_entry(le,
7829 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
7830 dn, straydn),
7831 mdr, __func__);
7832 mdlog->flush();
7833 }
7834
7835 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
7836 {
7837 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
7838
7839 straydn->get_dir()->unlink_inode(straydn);
7840 dn->pop_projected_linkage();
7841 straydn->pop_projected_linkage();
7842
7843 CInode *in = dn->get_linkage()->get_inode();
7844 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
7845 !mdr || mdr->more()->peer_update_journaled);
7846
7847 if (mds->is_resolve()) {
7848 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
7849 mdcache->try_trim_non_auth_subtree(root);
7850 }
7851
7852 if (mdr)
7853 mdcache->request_finish(mdr);
7854
7855 mdcache->finish_rollback(reqid, mdr);
7856 }
7857
7858
7859 /** _dir_is_nonempty[_unlocked]
7860 *
7861 * check whether a directory is non-empty (if it is, we cannot rmdir it).
7862 *
7863 * the unlocked variant is a fastpath check. we can't really be
7864 * sure until we rdlock the filelock.
7865 */
7866 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
7867 {
7868 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
7869 ceph_assert(in->is_auth());
7870
7871 if (in->filelock.is_cached())
7872 return false; // there can be pending async create/unlink. don't know.
7873 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
7874 return true; // in a snapshot!
7875
7876 auto&& ls = in->get_dirfrags();
7877 for (const auto& dir : ls) {
7878 // is the frag obviously non-empty?
7879 if (dir->is_auth()) {
7880 if (dir->get_projected_fnode()->fragstat.size()) {
7881 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7882 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
7883 return true;
7884 }
7885 }
7886 }
7887
7888 return false;
7889 }
7890
7891 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
7892 {
7893 dout(10) << "dir_is_nonempty " << *in << dendl;
7894 ceph_assert(in->is_auth());
7895 ceph_assert(in->filelock.can_read(mdr->get_client()));
7896
7897 frag_info_t dirstat;
7898 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
7899
7900 auto&& ls = in->get_dirfrags();
7901 for (const auto& dir : ls) {
7902 const auto& pf = dir->get_projected_fnode();
7903 if (pf->fragstat.size()) {
7904 dout(10) << "dir_is_nonempty dirstat has "
7905 << pf->fragstat.size() << " items " << *dir << dendl;
7906 return true;
7907 }
7908
7909 if (pf->accounted_fragstat.version == dirstat_version)
7910 dirstat.add(pf->accounted_fragstat);
7911 else
7912 dirstat.add(pf->fragstat);
7913 }
7914
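// if the summed frag stats do not match the inode's dirstat, updates are
// still in flight; treat the directory as non-empty to be safe.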
7915 return dirstat.size() != in->get_projected_inode()->dirstat.size();
7916 }
7917
7918
7919 // ======================================================
7920
7921
7922 class C_MDS_rename_finish : public ServerLogContext {
7923 CDentry *srcdn;
7924 CDentry *destdn;
7925 CDentry *straydn;
7926 public:
7927 C_MDS_rename_finish(Server *s, MDRequestRef& r,
7928 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
7929 ServerLogContext(s, r),
7930 srcdn(sdn), destdn(ddn), straydn(stdn) { }
7931 void finish(int r) override {
7932 ceph_assert(r == 0);
7933 server->_rename_finish(mdr, srcdn, destdn, straydn);
7934 }
7935 };
7936
7937
7938 /** handle_client_rename
7939 *
7940 * rename leader is the destdn auth. this is because cached inodes
7941 * must remain connected. thus, any replica of srci must also
7942 * replicate destdn, and possibly straydn, so that srci (and
7943 * destdn->inode) remain connected during the rename.
7944 *
7945 * to do this, we freeze srci, then leader (destdn auth) verifies that
7946 * all other nodes have also replicated destdn and straydn. note that
7947 * destdn replicas need not also replicate srci. this only works when
7948 * destdn is leader.
7949 *
7950 * This function takes responsibility for the passed mdr.
7951 */
7952 void Server::handle_client_rename(MDRequestRef& mdr)
7953 {
7954 const auto& req = mdr->client_request;
7955 dout(7) << "handle_client_rename " << *req << dendl;
7956
7957 filepath destpath = req->get_filepath();
7958 filepath srcpath = req->get_filepath2();
7959 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
7960 respond_to_request(mdr, -CEPHFS_EBUSY);
7961 return;
7962 }
7963
7964 if (req->get_alternate_name().size() > alternate_name_max) {
7965 dout(10) << " alternate_name longer than " << alternate_name_max << dendl;
7966 respond_to_request(mdr, -CEPHFS_ENAMETOOLONG);
7967 return;
7968 }
7969
7970 auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
7971 if (!destdn)
7972 return;
7973
7974 dout(10) << " destdn " << *destdn << dendl;
7975 CDir *destdir = destdn->get_dir();
7976 ceph_assert(destdir->is_auth());
7977 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7978
7979 dout(10) << " srcdn " << *srcdn << dendl;
7980 CDir *srcdir = srcdn->get_dir();
7981 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7982 CInode *srci = srcdnl->get_inode();
7983 dout(10) << " srci " << *srci << dendl;
7984
7985 // -- some sanity checks --
7986 if (destdn == srcdn) {
7987 dout(7) << "rename src=dest, noop" << dendl;
7988 respond_to_request(mdr, 0);
7989 return;
7990 }
7991
7992 // dest a child of src?
7993 // e.g. mv /usr /usr/foo
7994 if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
7995 dout(7) << "cannot rename item to be a child of itself" << dendl;
7996 respond_to_request(mdr, -CEPHFS_EINVAL);
7997 return;
7998 }
7999
8000 // is this a stray migration, reintegration or merge? (sanity checks!)
8001 if (mdr->reqid.name.is_mds() &&
8002 !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
8003 MDS_INO_IS_STRAY(destpath.get_ino())) &&
8004 !(destdnl->is_remote() &&
8005 destdnl->get_remote_ino() == srci->ino())) {
8006 respond_to_request(mdr, -CEPHFS_EINVAL); // actually, this won't reply, but whatev.
8007 return;
8008 }
8009
8010 CInode *oldin = 0;
8011 if (!destdnl->is_null()) {
8012 //dout(10) << "dest dn exists " << *destdn << dendl;
8013 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
8014 if (!oldin) return;
8015 dout(10) << " oldin " << *oldin << dendl;
8016
8017 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
8018 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
8019 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8020 return;
8021 }
8022
8023 // mv /some/thing /to/some/existing_other_thing
8024 if (oldin->is_dir() && !srci->is_dir()) {
8025 respond_to_request(mdr, -CEPHFS_EISDIR);
8026 return;
8027 }
8028 if (!oldin->is_dir() && srci->is_dir()) {
8029 respond_to_request(mdr, -CEPHFS_ENOTDIR);
8030 return;
8031 }
8032 if (srci == oldin && !srcdir->inode->is_stray()) {
8033 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
8034 return;
8035 }
8036 if (destdn->get_alternate_name() != req->get_alternate_name()) {
8037 /* the dentry exists but the alternate_names do not match, fail... */
8038 respond_to_request(mdr, -CEPHFS_EINVAL);
8039 return;
8040 }
8041 }
8042
8043 vector<CDentry*>& srctrace = mdr->dn[1];
8044 vector<CDentry*>& desttrace = mdr->dn[0];
8045
8046 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
8047 if (destpath.get_ino() != srcpath.get_ino() &&
8048 !(req->get_source().is_mds() &&
8049 MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
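// e.g. for rename("/a/b/x", "/c/d/y"): srctrace is extended upward until
// its base is an ancestor of the dest base, then desttrace is extended
// until both traces start at the same inode.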
8050 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
8051 CInode *destbase = desttrace[0]->get_dir()->get_inode();
8052 // ok, extend srctrace toward root until it is an ancestor of desttrace.
8053 while (srcbase != destbase &&
8054 !srcbase->is_projected_ancestor_of(destbase)) {
8055 CDentry *pdn = srcbase->get_projected_parent_dn();
8056 srctrace.insert(srctrace.begin(), pdn);
8057 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
8058 srcbase = pdn->get_dir()->get_inode();
8059 }
8060
8061 // then, extend destpath until it shares the same parent inode as srcpath.
8062 while (destbase != srcbase) {
8063 CDentry *pdn = destbase->get_projected_parent_dn();
8064 desttrace.insert(desttrace.begin(), pdn);
8065 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
8066 destbase = pdn->get_dir()->get_inode();
8067 }
8068 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
8069 }
8070
8071
8072 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8073 if (linkmerge)
8074 dout(10) << " this is a link merge" << dendl;
8075
8076 // -- create stray dentry? --
8077 CDentry *straydn = NULL;
8078 if (destdnl->is_primary() && !linkmerge) {
8079 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
8080 if (!straydn)
8081 return;
8082 dout(10) << " straydn is " << *straydn << dendl;
8083 } else if (mdr->straydn) {
8084 mdr->unpin(mdr->straydn);
8085 mdr->straydn = NULL;
8086 }
8087
8088
8089 // -- locks --
8090 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
8091 MutationImpl::LockOpVec lov;
8092
8093 // we need to update srci's ctime. xlock its least contended lock to do that...
8094 lov.add_xlock(&srci->linklock);
8095 lov.add_xlock(&srci->snaplock);
8096
8097 if (oldin) {
8098 // xlock oldin (for nlink--)
8099 lov.add_xlock(&oldin->linklock);
8100 lov.add_xlock(&oldin->snaplock);
8101 if (oldin->is_dir()) {
8102 ceph_assert(srci->is_dir());
8103 lov.add_rdlock(&oldin->filelock); // to verify it's empty
8104
8105 // adjust locking order?
8106 int cmp = mdr->compare_paths();
8107 if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
8108 std::reverse(lov.begin(), lov.end());
8109 } else {
8110 ceph_assert(!srci->is_dir());
8111 // adjust locking order?
8112 if (srci->ino() > oldin->ino())
8113 std::reverse(lov.begin(), lov.end());
8114 }
8115 }
8116
8117 // straydn?
8118 if (straydn) {
8119 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
8120 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
8121 lov.add_xlock(&straydn->lock);
8122 }
8123
8124 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
8125 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
8126 return;
8127
8128 mdr->locking_state |= MutationImpl::ALL_LOCKED;
8129 }
8130
8131 if (linkmerge)
8132 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
8133
8134 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
8135 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
8136 return;
8137
8138 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
8139 return;
8140
8141 if (!check_fragment_space(mdr, destdn->get_dir()))
8142 return;
8143
8144 if (!check_access(mdr, srci, MAY_WRITE))
8145 return;
8146 }
8147
8148 // with read lock, really verify oldin is empty
8149 if (oldin &&
8150 oldin->is_dir() &&
8151 _dir_is_nonempty(mdr, oldin)) {
8152 respond_to_request(mdr, -CEPHFS_ENOTEMPTY);
8153 return;
8154 }
8155
8156 /* project_snaprealm_past_parent() will do this job
8157 *
8158 // moving between snaprealms?
8159 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
8160 SnapRealm *srcrealm = srci->find_snaprealm();
8161 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
8162 if (srcrealm != destrealm &&
8163 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
8164 destrealm->get_newest_seq() + 1 > srcdn->first)) {
8165 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
8166 mdcache->snaprealm_create(mdr, srci);
8167 return;
8168 }
8169 }
8170 */
8171
8172 SnapRealm *dest_realm = nullptr;
8173 SnapRealm *src_realm = nullptr;
8174 if (!linkmerge) {
8175 dest_realm = destdir->inode->find_snaprealm();
8176 if (srcdir->inode == destdir->inode)
8177 src_realm = dest_realm;
8178 else
8179 src_realm = srcdir->inode->find_snaprealm();
8180 if (src_realm != dest_realm &&
8181 src_realm->get_subvolume_ino() != dest_realm->get_subvolume_ino()) {
8182 respond_to_request(mdr, -CEPHFS_EXDEV);
8183 return;
8184 }
8185 }
8186
8187 ceph_assert(g_conf()->mds_kill_rename_at != 1);
8188
8189 // -- open all srcdn inode frags, if any --
8190 // we need these open so that auth can properly delegate from inode to dirfrags
8191 // after the inode is _ours_.
8192 if (srcdnl->is_primary() &&
8193 !srcdn->is_auth() &&
8194 srci->is_dir()) {
8195 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
8196 mdr->set_stickydirs(srci);
8197
8198 frag_vec_t leaves;
8199 srci->dirfragtree.get_leaves(leaves);
8200 for (const auto& leaf : leaves) {
8201 CDir *dir = srci->get_dirfrag(leaf);
8202 if (!dir) {
8203 dout(10) << " opening " << leaf << " under " << *srci << dendl;
8204 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
8205 return;
8206 }
8207 }
8208 }
8209
8210 // -- prepare snaprealm ---
8211
8212 if (linkmerge) {
8213 if (!mdr->more()->srci_srnode &&
8214 srci->get_projected_inode()->nlink == 1 &&
8215 srci->is_projected_snaprealm_global()) {
8216 sr_t *new_srnode = srci->prepare_new_srnode(0);
8217 srci->record_snaprealm_parent_dentry(new_srnode, nullptr, destdn, false);
8218
8219 srci->clear_snaprealm_global(new_srnode);
8220 mdr->more()->srci_srnode = new_srnode;
8221 }
8222 } else {
8223 if (oldin && !mdr->more()->desti_srnode) {
8224 if (oldin->is_projected_snaprealm_global()) {
8225 sr_t *new_srnode = oldin->prepare_new_srnode(0);
8226 oldin->record_snaprealm_parent_dentry(new_srnode, dest_realm, destdn, destdnl->is_primary());
8227 // dropping the last linkage or dropping the last remote linkage,
8228 // detach the inode from the global snaprealm
8229 auto nlink = oldin->get_projected_inode()->nlink;
8230 if (nlink == 1 ||
8231 (nlink == 2 && !destdnl->is_primary() &&
8232 !oldin->get_projected_parent_dir()->inode->is_stray()))
8233 oldin->clear_snaprealm_global(new_srnode);
8234 mdr->more()->desti_srnode = new_srnode;
8235 } else if (destdnl->is_primary()) {
8236 snapid_t follows = dest_realm->get_newest_seq();
8237 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
8238 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
8239 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
8240 mdr->more()->desti_srnode = new_srnode;
8241 }
8242 }
8243 }
8244 if (!mdr->more()->srci_srnode) {
8245 if (srci->is_projected_snaprealm_global()) {
8246 sr_t *new_srnode = srci->prepare_new_srnode(0);
8247 srci->record_snaprealm_parent_dentry(new_srnode, src_realm, srcdn, srcdnl->is_primary());
8248 mdr->more()->srci_srnode = new_srnode;
8249 } else if (srcdnl->is_primary()) {
8250 snapid_t follows = src_realm->get_newest_seq();
8251 if (src_realm != dest_realm &&
8252 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
8253 sr_t *new_srnode = srci->prepare_new_srnode(follows);
8254 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
8255 mdr->more()->srci_srnode = new_srnode;
8256 }
8257 }
8258 }
8259 }
8260
8261 // -- prepare witnesses --
8262
8263 /*
8264 * NOTE: we use _all_ replicas as witnesses.
8265 * this probably isn't totally necessary (esp for file renames),
8266 * but if/when we change that, we have to make sure rejoin is
8267 * sufficiently robust to handle strong rejoins from survivors
8268 * with totally wrong dentry->inode linkage.
8269 * (currently, it can ignore rename effects, because the resolve
8270 * stage will sort them out.)
8271 */
8272 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
8273 if (srcdn->is_auth())
8274 srcdn->list_replicas(witnesses);
8275 else
8276 witnesses.insert(srcdn->authority().first);
8277 if (srcdnl->is_remote() && !srci->is_auth())
8278 witnesses.insert(srci->authority().first);
8279 destdn->list_replicas(witnesses);
8280 if (destdnl->is_remote() && !oldin->is_auth())
8281 witnesses.insert(oldin->authority().first);
8282 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
8283
8284 if (!witnesses.empty()) {
8285 // Replicas can't see projected dentry linkages and will get confused.
8286 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
8287 // can't project these inodes' linkages.
8288 bool need_flush = false;
8289 for (auto& dn : srctrace) {
8290 if (dn->is_projected()) {
8291 need_flush = true;
8292 break;
8293 }
8294 }
8295 if (!need_flush) {
8296 CDentry *dn = destdn;
8297 do {
8298 if (dn->is_projected()) {
8299 need_flush = true;
8300 break;
8301 }
8302 CInode *diri = dn->get_dir()->get_inode();
8303 dn = diri->get_projected_parent_dn();
8304 } while (dn);
8305 }
8306 if (need_flush) {
8307 mdlog->wait_for_safe(
8308 new MDSInternalContextWrapper(mds,
8309 new C_MDS_RetryRequest(mdcache, mdr)));
8310 mdlog->flush();
8311 return;
8312 }
8313 }
8314
8315 // do srcdn auth last
8316 mds_rank_t last = MDS_RANK_NONE;
8317 if (!srcdn->is_auth()) {
8318 last = srcdn->authority().first;
8319 mdr->more()->srcdn_auth_mds = last;
8320 // ask auth of srci to mark srci as ambiguous auth if more than two MDS
8321 // are involved in the rename operation.
8322 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
8323 dout(10) << " preparing ambiguous auth for srci" << dendl;
8324 ceph_assert(mdr->more()->is_remote_frozen_authpin);
8325 ceph_assert(mdr->more()->rename_inode == srci);
8326 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8327 return;
8328 }
8329 }
8330
8331 for (set<mds_rank_t>::iterator p = witnesses.begin();
8332 p != witnesses.end();
8333 ++p) {
8334 if (*p == last) continue; // do it last!
8335 if (mdr->more()->witnessed.count(*p)) {
8336 dout(10) << " already witnessed by mds." << *p << dendl;
8337 } else if (mdr->more()->waiting_on_peer.count(*p)) {
8338 dout(10) << " already waiting on witness mds." << *p << dendl;
8339 } else {
8340 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
8341 return;
8342 }
8343 }
8344 if (!mdr->more()->waiting_on_peer.empty())
8345 return; // we're waiting for a witness.
8346
8347 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
8348 dout(10) << " preparing last witness (srcdn auth)" << dendl;
8349 ceph_assert(mdr->more()->waiting_on_peer.count(last) == 0);
8350 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8351 return;
8352 }
8353
8354 // test hack: bail after peer does prepare, so we can verify it's _live_ rollback.
8355 if (!mdr->more()->peers.empty() && !srci->is_dir())
8356 ceph_assert(g_conf()->mds_kill_rename_at != 3);
8357 if (!mdr->more()->peers.empty() && srci->is_dir())
8358 ceph_assert(g_conf()->mds_kill_rename_at != 4);
8359
8360 // -- declare now --
8361 mdr->set_mds_stamp(ceph_clock_now());
8362
8363 // -- prepare journal entry --
8364 mdr->ls = mdlog->get_current_segment();
8365 EUpdate *le = new EUpdate(mdlog, "rename");
8366 mdlog->start_entry(le);
8367 le->metablob.add_client_req(mdr->reqid, req->get_oldest_client_tid());
8368 if (!mdr->more()->witnessed.empty()) {
8369 dout(20) << " noting uncommitted_peers " << mdr->more()->witnessed << dendl;
8370
8371 le->reqid = mdr->reqid;
8372 le->had_peers = true;
8373
8374 mdcache->add_uncommitted_leader(mdr->reqid, mdr->ls, mdr->more()->witnessed);
8375 // no need to send frozen auth pin to recovering auth MDS of srci
8376 mdr->more()->is_remote_frozen_authpin = false;
8377 }
8378
8379 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, req->get_alternate_name(), straydn);
8380 if (le->client_map.length())
8381 le->cmapv = mds->sessionmap.get_projected();
8382
8383 // -- commit locally --
8384 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
8385
8386 journal_and_reply(mdr, srci, destdn, le, fin);
8387 mds->balancer->maybe_fragment(destdn->get_dir(), false);
8388 }
8389
8390
8391 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8392 {
8393 dout(10) << "_rename_finish " << *mdr << dendl;
8394
8395 if (!mdr->more()->witnessed.empty())
8396 mdcache->logged_leader_update(mdr->reqid);
8397
8398 // apply
8399 _rename_apply(mdr, srcdn, destdn, straydn);
8400
8401 mdcache->send_dentry_link(destdn, mdr);
8402
8403 CDentry::linkage_t *destdnl = destdn->get_linkage();
8404 CInode *in = destdnl->get_inode();
8405 bool need_eval = mdr->more()->cap_imports.count(in);
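// need_eval: caps were imported for this inode; re-evaluate its lock/cap state once the
// reply has been sent (see the eval() call below)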
8406
8407 // test hack: test peer commit
8408 if (!mdr->more()->peers.empty() && !in->is_dir())
8409 ceph_assert(g_conf()->mds_kill_rename_at != 5);
8410 if (!mdr->more()->peers.empty() && in->is_dir())
8411 ceph_assert(g_conf()->mds_kill_rename_at != 6);
8412
8413 // bump popularity
8414 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
8415 if (destdnl->is_remote() && in->is_auth())
8416 mds->balancer->hit_inode(in, META_POP_IWR);
8417
8418 // did we import srci? if so, explicitly ack that import before we unlock and reply.
8419
8420 ceph_assert(g_conf()->mds_kill_rename_at != 7);
8421
8422 // reply
8423 respond_to_request(mdr, 0);
8424
8425 if (need_eval)
8426 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
8427
8428 // clean up?
8429 // respond_to_request() drops locks. So stray reintegration can race with us.
8430 if (straydn && !straydn->get_projected_linkage()->is_null()) {
8431 mdcache->notify_stray(straydn);
8432 }
8433 }
8434
8435
8436
8437 // helpers
8438
8439 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesses,
8440 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
8441 {
8442 const auto& client_req = mdr->client_request;
8443 ceph_assert(client_req);
8444
8445 if (mds->is_cluster_degraded() &&
8446 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
8447 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
8448 if (mdr->more()->waiting_on_peer.empty())
8449 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
8450 return false;
8451 }
8452
8453 dout(10) << "_rename_prepare_witness mds." << who << dendl;
8454 auto req = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREP);
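// the prep message carries full src/dest paths (so the witness can discover both dentries),
// the alternate_name, a replica of the stray dentry, and any projected snaprealm blobs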
8455
8456 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
8457 for (auto dn : srctrace)
8458 req->srcdnpath.push_dentry(dn->get_name());
8459 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
8460 for (auto dn : dsttrace)
8461 req->destdnpath.push_dentry(dn->get_name());
8462 req->alternate_name = client_req->alternate_name;
8463 if (straydn)
8464 mdcache->encode_replica_stray(straydn, who, req->straybl);
8465
8466 if (mdr->more()->srci_srnode)
8467 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
8468 if (mdr->more()->desti_srnode)
8469 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
8470
8471 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
8472
8473 // srcdn auth will verify our current witness list is sufficient
8474 req->witnesses = witnesses;
8475
8476 req->op_stamp = mdr->get_op_stamp();
8477 mds->send_message_mds(req, who);
8478
8479 ceph_assert(mdr->more()->waiting_on_peer.count(who) == 0);
8480 mdr->more()->waiting_on_peer.insert(who);
8481 return true;
8482 }
8483
8484 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
8485 {
8486 version_t oldpv = mdr->more()->inode_import_v;
8487
8488 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8489
8490 /* import node */
8491 auto blp = mdr->more()->inode_import.cbegin();
8492
8493 // imported caps
8494 map<client_t,entity_inst_t> client_map;
8495 map<client_t, client_metadata_t> client_metadata_map;
8496 decode(client_map, blp);
8497 decode(client_metadata_map, blp);
8498 prepare_force_open_sessions(client_map, client_metadata_map,
8499 mdr->more()->imported_session_map);
8500 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
8501 encode(client_metadata_map, *client_map_bl);
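// re-encode the client maps into client_map_bl; the caller stores this in the journal
// entry so that replay can restore the sessions opened for the imported caps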
8502
8503 list<ScatterLock*> updated_scatterlocks;
8504 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
8505 mdr->more()->cap_imports, updated_scatterlocks);
8506
8507 // hack: force back to !auth and clean, temporarily
8508 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
8509 srcdnl->get_inode()->mark_clean();
8510
8511 return oldpv;
8512 }
8513
8514 bool Server::_need_force_journal(CInode *diri, bool empty)
8515 {
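// Decide whether a rename involving a non-auth dentry still needs to be journaled locally:
// force journaling when diri's dirfrags (empty==true) or subtrees nested beneath them
// (empty==false) are auth subtrees on this rank, so replay can rebuild the subtree bounds.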
8516 auto&& dirs = diri->get_dirfrags();
8517
8518 bool force_journal = false;
8519 if (empty) {
8520 for (const auto& dir : dirs) {
8521 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
8522 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
8523 force_journal = true;
8524 break;
8525 } else
8526 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
8527 }
8528 } else {
8529 // see if any children of our frags are auth subtrees.
8530 std::vector<CDir*> subtrees;
8531 mdcache->get_subtrees(subtrees);
8532 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
8533 for (const auto& dir : dirs) {
8534 for (const auto& subtree : subtrees) {
8535 if (dir->contains(subtree)) {
8536 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
8537 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
8538 << *subtree << dendl;
8539 force_journal = true;
8540 break;
8541 } else
8542 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
8543 } else
8544 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
8545 }
8546 if (force_journal)
8547 break;
8548 }
8549 }
8550 return force_journal;
8551 }
8552
8553 void Server::_rename_prepare(MDRequestRef& mdr,
8554 EMetaBlob *metablob, bufferlist *client_map_bl,
8555 CDentry *srcdn, CDentry *destdn, std::string_view alternate_name,
8556 CDentry *straydn)
8557 {
8558 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8559 if (straydn)
8560 dout(10) << " straydn " << *straydn << dendl;
8561
8562 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8563 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8564 CInode *srci = srcdnl->get_inode();
8565 CInode *oldin = destdnl->get_inode();
8566
8567 // primary+remote link merge?
8568 bool linkmerge = (srci == oldin);
8569 if (linkmerge)
8570 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8571 bool silent = srcdn->get_dir()->inode->is_stray();
8572
8573 bool force_journal_dest = false;
8574 if (srci->is_dir() && !destdn->is_auth()) {
8575 if (srci->is_auth()) {
8576 // if we are auth for srci and exporting it, force journal because journal replay needs
8577 // the source inode to create auth subtrees.
8578 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
8579 force_journal_dest = true;
8580 } else
8581 force_journal_dest = _need_force_journal(srci, false);
8582 }
8583
8584 bool force_journal_stray = false;
8585 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
8586 force_journal_stray = _need_force_journal(oldin, true);
8587
8588 if (linkmerge)
8589 dout(10) << " merging remote and primary links to the same inode" << dendl;
8590 if (silent)
8591 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
8592 if (force_journal_dest)
8593 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
8594 if (force_journal_stray)
8595 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
8596
8597 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
8598 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
8599 metablob->renamed_dirino = srci->ino();
8600 } else if (oldin && oldin->is_dir() && force_journal_stray) {
8601 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
8602 metablob->renamed_dirino = oldin->ino();
8603 }
8604
8605 // prepare
8606 CInode::mempool_inode *spi = 0; // renamed inode
8607 CInode::mempool_inode *tpi = 0; // target/overwritten inode
8608
8609 // target inode
8610 if (!linkmerge) {
8611 if (destdnl->is_primary()) {
8612 ceph_assert(straydn); // moving to straydn.
8613 // link--, and move.
8614 if (destdn->is_auth()) {
8615 auto pi = oldin->project_inode(mdr); // project_snaprealm
8616 pi.inode->version = straydn->pre_dirty(pi.inode->version);
8617 pi.inode->update_backtrace();
8618 tpi = pi.inode.get();
8619 }
8620 straydn->push_projected_linkage(oldin);
8621 } else if (destdnl->is_remote()) {
8622 // nlink-- targeti
8623 if (oldin->is_auth()) {
8624 auto pi = oldin->project_inode(mdr);
8625 pi.inode->version = oldin->pre_dirty();
8626 tpi = pi.inode.get();
8627 }
8628 }
8629 }
8630
8631 // dest
8632 if (destdnl->is_null()) {
8633 /* handle_client_rename checks that alternate_name matches for existing destdn */
8634 destdn->set_alternate_name(alternate_name);
8635 }
8636 if (srcdnl->is_remote()) {
8637 if (!linkmerge) {
8638 // destdn
8639 if (destdn->is_auth())
8640 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
8641 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8642 // srci
8643 if (srci->is_auth()) {
8644 auto pi = srci->project_inode(mdr);
8645 pi.inode->version = srci->pre_dirty();
8646 spi = pi.inode.get();
8647 }
8648 } else {
8649 dout(10) << " will merge remote onto primary link" << dendl;
8650 if (destdn->is_auth()) {
8651 auto pi = oldin->project_inode(mdr);
8652 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->get_version());
8653 spi = pi.inode.get();
8654 }
8655 }
8656 } else { // primary
8657 if (destdn->is_auth()) {
8658 version_t oldpv;
8659 if (srcdn->is_auth())
8660 oldpv = srci->get_projected_version();
8661 else {
8662 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
8663
8664 // note which dirfrags have child subtrees in the journal
8665 // event, so that we can open those (as bounds) during replay.
8666 if (srci->is_dir()) {
8667 auto&& ls = srci->get_dirfrags();
8668 for (const auto& dir : ls) {
8669 if (!dir->is_auth())
8670 metablob->renamed_dir_frags.push_back(dir->get_frag());
8671 }
8672 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
8673 }
8674 }
8675 auto pi = srci->project_inode(mdr); // project snaprealm if srcdnl->is_primary
8676 // & srcdnl->snaprealm
8677 pi.inode->version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
8678 pi.inode->update_backtrace();
8679 spi = pi.inode.get();
8680 }
8681 destdn->push_projected_linkage(srci);
8682 }
8683
8684 // src
8685 if (srcdn->is_auth())
8686 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
8687 srcdn->push_projected_linkage(); // push null linkage
8688
8689 if (!silent) {
8690 if (spi) {
8691 spi->ctime = mdr->get_op_stamp();
8692 if (mdr->get_op_stamp() > spi->rstat.rctime)
8693 spi->rstat.rctime = mdr->get_op_stamp();
8694 spi->change_attr++;
8695 if (linkmerge)
8696 spi->nlink--;
8697 }
8698 if (tpi) {
8699 tpi->ctime = mdr->get_op_stamp();
8700 if (mdr->get_op_stamp() > tpi->rstat.rctime)
8701 tpi->rstat.rctime = mdr->get_op_stamp();
8702 tpi->change_attr++;
8703 {
8704 std::string t;
8705 destdn->make_path_string(t, true);
8706 tpi->stray_prior_path = std::move(t);
8707 }
8708 tpi->nlink--;
8709 if (tpi->nlink == 0)
8710 oldin->state_set(CInode::STATE_ORPHAN);
8711 }
8712 }
8713
8714 // prepare nesting, mtime updates
8715 int predirty_dir = silent ? 0:PREDIRTY_DIR;
8716
8717 // guarantee stray dir is processed first during journal replay. unlink the old inode,
8718 // then link the source inode to destdn
8719 if (destdnl->is_primary()) {
8720 ceph_assert(straydn);
8721 if (straydn->is_auth()) {
8722 metablob->add_dir_context(straydn->get_dir());
8723 metablob->add_dir(straydn->get_dir(), true);
8724 }
8725 }
8726
8727 if (!linkmerge && destdnl->is_remote() && oldin->is_auth()) {
8728 CDir *oldin_dir = oldin->get_projected_parent_dir();
8729 if (oldin_dir != srcdn->get_dir() && oldin_dir != destdn->get_dir())
8730 mdcache->predirty_journal_parents(mdr, metablob, oldin, oldin_dir, PREDIRTY_PRIMARY);
8731 }
8732
8733 // sub off target
8734 if (destdn->is_auth() && !destdnl->is_null()) {
8735 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
8736 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
8737 if (destdnl->is_primary()) {
8738 ceph_assert(straydn);
8739 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
8740 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
8741 }
8742 }
8743
8744 if (srcdnl->is_remote() && srci->is_auth()) {
8745 CDir *srci_dir = srci->get_projected_parent_dir();
8746 if (srci_dir != srcdn->get_dir() && srci_dir != destdn->get_dir())
8747 mdcache->predirty_journal_parents(mdr, metablob, srci, srci_dir, PREDIRTY_PRIMARY);
8748 }
8749
8750 // move srcdn
8751 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
8752 int flags = predirty_dir | predirty_primary;
8753 if (srcdn->is_auth())
8754 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
8755 if (destdn->is_auth())
8756 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
8757
8758 // add it all to the metablob
8759 // target inode
8760 if (!linkmerge) {
8761 if (destdnl->is_primary()) {
8762 ceph_assert(straydn);
8763 if (destdn->is_auth()) {
8764 // project snaprealm, too
8765 if (auto& desti_srnode = mdr->more()->desti_srnode) {
8766 oldin->project_snaprealm(desti_srnode);
8767 if (tpi->nlink == 0)
8768 ceph_assert(!desti_srnode->is_parent_global());
8769 desti_srnode = NULL;
8770 }
8771 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8772 metablob->add_primary_dentry(straydn, oldin, true, true);
8773 } else if (force_journal_stray) {
8774 dout(10) << " forced journaling straydn " << *straydn << dendl;
8775 metablob->add_dir_context(straydn->get_dir());
8776 metablob->add_primary_dentry(straydn, oldin, true);
8777 }
8778 } else if (destdnl->is_remote()) {
8779 if (oldin->is_auth()) {
8780 sr_t *new_srnode = NULL;
8781 if (mdr->peer_request) {
8782 if (mdr->peer_request->desti_snapbl.length() > 0) {
8783 new_srnode = new sr_t();
8784 auto p = mdr->peer_request->desti_snapbl.cbegin();
8785 decode(*new_srnode, p);
8786 }
8787 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8788 new_srnode = desti_srnode;
8789 desti_srnode = NULL;
8790 }
8791 if (new_srnode) {
8792 oldin->project_snaprealm(new_srnode);
8793 if (tpi->nlink == 0)
8794 ceph_assert(!new_srnode->is_parent_global());
8795 }
8796 // auth for targeti
8797 CDentry *oldin_pdn = oldin->get_projected_parent_dn();
8798 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin_pdn);
8799 metablob->add_primary_dentry(oldin_pdn, oldin, true);
8800 }
8801 }
8802 }
8803
8804 // dest
8805 if (srcdnl->is_remote()) {
8806 ceph_assert(!linkmerge);
8807 if (destdn->is_auth() && !destdnl->is_null())
8808 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8809 else
8810 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8811
8812 if (destdn->is_auth())
8813 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8814
8815 if (srci->is_auth() ) { // it's remote
8816 if (mdr->peer_request) {
8817 if (mdr->peer_request->srci_snapbl.length() > 0) {
8818 sr_t *new_srnode = new sr_t();
8819 auto p = mdr->peer_request->srci_snapbl.cbegin();
8820 decode(*new_srnode, p);
8821 srci->project_snaprealm(new_srnode);
8822 }
8823 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8824 srci->project_snaprealm(srci_srnode);
8825 srci_srnode = NULL;
8826 }
8827
8828 CDentry *srci_pdn = srci->get_projected_parent_dn();
8829 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn);
8830 metablob->add_primary_dentry(srci_pdn, srci, true);
8831 }
8832 } else if (srcdnl->is_primary()) {
8833 // project snap parent update?
8834 if (destdn->is_auth()) {
8835 if (auto& srci_srnode = mdr->more()->srci_srnode) {
8836 srci->project_snaprealm(srci_srnode);
8837 srci_srnode = NULL;
8838 }
8839 }
8840
8841 if (destdn->is_auth() && !destdnl->is_null())
8842 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8843
8844 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8845
8846 if (destdn->is_auth())
8847 metablob->add_primary_dentry(destdn, srci, true, true);
8848 else if (force_journal_dest) {
8849 dout(10) << " forced journaling destdn " << *destdn << dendl;
8850 metablob->add_dir_context(destdn->get_dir());
8851 metablob->add_primary_dentry(destdn, srci, true);
8852 if (srcdn->is_auth() && srci->is_dir()) {
8853 // journal new subtrees root dirfrags
8854 auto&& ls = srci->get_dirfrags();
8855 for (const auto& dir : ls) {
8856 if (dir->is_auth())
8857 metablob->add_dir(dir, true);
8858 }
8859 }
8860 }
8861 }
8862
8863 // src
8864 if (srcdn->is_auth()) {
8865 dout(10) << " journaling srcdn " << *srcdn << dendl;
8866 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
8867 // also journal the inode in case we need to do peer rename rollback. It is OK to add
8868 // both primary and null dentries, because during journal replay the null dentry is
8869 // processed after the primary dentry.
8870 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
8871 metablob->add_primary_dentry(srcdn, srci, true);
8872 metablob->add_null_dentry(srcdn, true);
8873 } else
8874 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
8875
8876 // make renamed inode first track the dn
8877 if (srcdnl->is_primary() && destdn->is_auth()) {
8878 ceph_assert(srci->first <= destdn->first);
8879 srci->first = destdn->first;
8880 }
8881 // make stray inode first track the straydn
8882 if (straydn && straydn->is_auth()) {
8883 ceph_assert(oldin->first <= straydn->first);
8884 oldin->first = straydn->first;
8885 }
8886
8887 if (oldin && oldin->is_dir()) {
8888 ceph_assert(straydn);
8889 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
8890 }
8891 if (srci->is_dir())
8892 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
8893
8894 }
8895
8896
8897 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8898 {
8899 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8900 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
8901
8902 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8903 CDentry::linkage_t *destdnl = destdn->get_linkage();
8904
8905 CInode *oldin = destdnl->get_inode();
8906
8907 // primary+remote link merge?
8908 bool linkmerge = (srcdnl->get_inode() == oldin);
8909 if (linkmerge)
8910 ceph_assert(srcdnl->is_primary() || destdnl->is_remote());
8911
8912 bool new_in_snaprealm = false;
8913 bool new_oldin_snaprealm = false;
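// these flags track whether popping/decoding a projected snaprealm created a new realm;
// if so, CEPH_SNAP_OP_SPLIT notifications are sent once the new linkages are in place
// (see the end of this function)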
8914
8915 // target inode
8916 if (!linkmerge) {
8917 if (destdnl->is_primary()) {
8918 ceph_assert(straydn);
8919 dout(10) << "straydn is " << *straydn << dendl;
8920
8921 // if there is a newly created snaprealm, we need to split the old snaprealm's
8922 // inodes_with_caps. So pop the snaprealm before the linkage changes.
8923 if (destdn->is_auth()) {
8924 bool hadrealm = (oldin->snaprealm ? true : false);
8925 oldin->early_pop_projected_snaprealm();
8926 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
8927 } else {
8928 ceph_assert(mdr->peer_request);
8929 if (mdr->peer_request->desti_snapbl.length()) {
8930 new_oldin_snaprealm = !oldin->snaprealm;
8931 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
8932 ceph_assert(oldin->snaprealm);
8933 }
8934 }
8935
8936 destdn->get_dir()->unlink_inode(destdn, false);
8937
8938 straydn->pop_projected_linkage();
8939 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
8940 ceph_assert(!straydn->is_projected()); // no other projected
8941
8942 // nlink-- targeti
8943 if (destdn->is_auth())
8944 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
8945
8946 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
8947 } else if (destdnl->is_remote()) {
8948 destdn->get_dir()->unlink_inode(destdn, false);
8949 if (oldin->is_auth()) {
8950 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
8951 } else if (mdr->peer_request) {
8952 if (mdr->peer_request->desti_snapbl.length() > 0) {
8953 ceph_assert(oldin->snaprealm);
8954 oldin->decode_snap_blob(mdr->peer_request->desti_snapbl);
8955 }
8956 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8957 delete desti_srnode;
8958 desti_srnode = NULL;
8959 }
8960 }
8961 }
8962
8963 // unlink src before we relink it at dest
8964 CInode *in = srcdnl->get_inode();
8965 ceph_assert(in);
8966
8967 bool srcdn_was_remote = srcdnl->is_remote();
8968 if (!srcdn_was_remote) {
8969 // if there is a newly created snaprealm, we need to split the old snaprealm's
8970 // inodes_with_caps. So pop the snaprealm before the linkage changes.
8971 if (destdn->is_auth()) {
8972 bool hadrealm = (in->snaprealm ? true : false);
8973 in->early_pop_projected_snaprealm();
8974 new_in_snaprealm = (in->snaprealm && !hadrealm);
8975 } else {
8976 ceph_assert(mdr->peer_request);
8977 if (mdr->peer_request->srci_snapbl.length()) {
8978 new_in_snaprealm = !in->snaprealm;
8979 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
8980 ceph_assert(in->snaprealm);
8981 }
8982 }
8983 }
8984
8985 srcdn->get_dir()->unlink_inode(srcdn);
8986
8987 // dest
8988 if (srcdn_was_remote) {
8989 if (!linkmerge) {
8990 // destdn
8991 destdnl = destdn->pop_projected_linkage();
8992 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
8993 ceph_assert(!destdn->is_projected()); // no other projected
8994
8995 destdn->link_remote(destdnl, in);
8996 if (destdn->is_auth())
8997 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
8998 // in
8999 if (in->is_auth()) {
9000 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9001 } else if (mdr->peer_request) {
9002 if (mdr->peer_request->srci_snapbl.length() > 0) {
9003 ceph_assert(in->snaprealm);
9004 in->decode_snap_blob(mdr->peer_request->srci_snapbl);
9005 }
9006 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
9007 delete srci_srnode;
9008 srci_srnode = NULL;
9009 }
9010 } else {
9011 dout(10) << "merging remote onto primary link" << dendl;
9012 oldin->pop_and_dirty_projected_inode(mdr->ls, mdr);
9013 }
9014 } else { // primary
9015 if (linkmerge) {
9016 dout(10) << "merging primary onto remote link" << dendl;
9017 destdn->get_dir()->unlink_inode(destdn, false);
9018 }
9019 destdnl = destdn->pop_projected_linkage();
9020 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9021 ceph_assert(!destdn->is_projected()); // no other projected
9022
9023 // srcdn inode import?
9024 if (!srcdn->is_auth() && destdn->is_auth()) {
9025 ceph_assert(mdr->more()->inode_import.length() > 0);
9026
9027 map<client_t,Capability::Import> imported_caps;
9028
9029 // finish cap imports
9030 finish_force_open_sessions(mdr->more()->imported_session_map);
9031 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
9032 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
9033 mdr->more()->srcdn_auth_mds, true,
9034 mdr->more()->imported_session_map,
9035 mdr->more()->cap_imports[destdnl->get_inode()],
9036 imported_caps);
9037 }
9038
9039 mdr->more()->inode_import.clear();
9040 encode(imported_caps, mdr->more()->inode_import);
9041
9042 /* hack: add an auth pin for each xlock we hold. These were
9043 * remote xlocks previously but now they're local and
9044 * we're going to try and unpin when we xlock_finish. */
9045
9046 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
9047 i != mdr->locks.end();
9048 ++i) {
9049 SimpleLock *lock = i->lock;
9050 if (lock->get_parent() != destdnl->get_inode())
9051 break;
9052 if (i->is_xlock() && !lock->is_locallock())
9053 mds->locker->xlock_import(lock);
9054 }
9055
9056 // hack: fix auth bit
9057 in->state_set(CInode::STATE_AUTH);
9058
9059 mdr->clear_ambiguous_auth();
9060 }
9061
9062 if (destdn->is_auth())
9063 in->pop_and_dirty_projected_inode(mdr->ls, mdr);
9064 }
9065
9066 // src
9067 if (srcdn->is_auth())
9068 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
9069 srcdn->pop_projected_linkage();
9070 if (mdr->is_peer() && !mdr->more()->peer_update_journaled)
9071 ceph_assert(!srcdn->is_projected()); // no other projected
9072
9073 // apply remaining projected inodes (nested)
9074 mdr->apply();
9075
9076 // update subtree map?
9077 if (destdnl->is_primary() && in->is_dir())
9078 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
9079
9080 if (straydn && oldin->is_dir())
9081 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
9082
9083 if (new_oldin_snaprealm)
9084 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
9085 if (new_in_snaprealm)
9086 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
9087
9088 // removing a new dn?
9089 if (srcdn->is_auth())
9090 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
9091 }
9092
9093
9094
9095 // ------------
9096 // PEER
9097
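// Contexts for the peer side of a cross-MDS rename: C_MDS_PeerRenamePrep fires once the
// prep is journaled, C_MDS_PeerRenameCommit handles the leader's commit/abort, and
// C_MDS_PeerRenameSessionsFlushed resumes the prep once affected client sessions are flushed.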
9098 class C_MDS_PeerRenamePrep : public ServerLogContext {
9099 CDentry *srcdn, *destdn, *straydn;
9100 public:
9101 C_MDS_PeerRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9102 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
9103 void finish(int r) override {
9104 server->_logged_peer_rename(mdr, srcdn, destdn, straydn);
9105 }
9106 };
9107
9108 class C_MDS_PeerRenameCommit : public ServerContext {
9109 MDRequestRef mdr;
9110 CDentry *srcdn, *destdn, *straydn;
9111 public:
9112 C_MDS_PeerRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
9113 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
9114 void finish(int r) override {
9115 server->_commit_peer_rename(mdr, r, srcdn, destdn, straydn);
9116 }
9117 };
9118
9119 class C_MDS_PeerRenameSessionsFlushed : public ServerContext {
9120 MDRequestRef mdr;
9121 public:
9122 C_MDS_PeerRenameSessionsFlushed(Server *s, MDRequestRef& r) :
9123 ServerContext(s), mdr(r) {}
9124 void finish(int r) override {
9125 server->_peer_rename_sessions_flushed(mdr);
9126 }
9127 };
9128
9129 void Server::handle_peer_rename_prep(MDRequestRef& mdr)
9130 {
9131 dout(10) << "handle_peer_rename_prep " << *mdr
9132 << " " << mdr->peer_request->srcdnpath
9133 << " to " << mdr->peer_request->destdnpath
9134 << dendl;
9135
9136 if (mdr->peer_request->is_interrupted()) {
9137 dout(10) << " peer request interrupted, sending noop reply" << dendl;
9138 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9139 reply->mark_interrupted();
9140 mds->send_message_mds(reply, mdr->peer_to_mds);
9141 mdr->reset_peer_request();
9142 return;
9143 }
9144
9145 // discover destdn
9146 filepath destpath(mdr->peer_request->destdnpath);
9147 dout(10) << " dest " << destpath << dendl;
9148 vector<CDentry*> trace;
9149 CF_MDS_RetryRequestFactory cf(mdcache, mdr, false);
9150 int r = mdcache->path_traverse(mdr, cf, destpath,
9151 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
9152 &trace);
9153 if (r > 0) return;
9154 if (r == -CEPHFS_ESTALE) {
9155 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
9156 mdr->peer_to_mds, true);
9157 return;
9158 }
9159 ceph_assert(r == 0); // we shouldn't get an error here!
9160
9161 CDentry *destdn = trace.back();
9162 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
9163 dout(10) << " destdn " << *destdn << dendl;
9164 mdr->pin(destdn);
9165
9166 // discover srcdn
9167 filepath srcpath(mdr->peer_request->srcdnpath);
9168 dout(10) << " src " << srcpath << dendl;
9169 CInode *srci = nullptr;
9170 r = mdcache->path_traverse(mdr, cf, srcpath,
9171 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
9172 &trace, &srci);
9173 if (r > 0) return;
9174 ceph_assert(r == 0);
9175
9176 CDentry *srcdn = trace.back();
9177 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
9178 dout(10) << " srcdn " << *srcdn << dendl;
9179 mdr->pin(srcdn);
9180 mdr->pin(srci);
9181
9182 // stray?
9183 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
9184 if (linkmerge)
9185 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
9186 CDentry *straydn = mdr->straydn;
9187 if (destdnl->is_primary() && !linkmerge)
9188 ceph_assert(straydn);
9189
9190 mdr->set_op_stamp(mdr->peer_request->op_stamp);
9191 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
9192
9193 // set up commit waiter (early, to clean up any freezing etc we do)
9194 if (!mdr->more()->peer_commit)
9195 mdr->more()->peer_commit = new C_MDS_PeerRenameCommit(this, mdr, srcdn, destdn, straydn);
9196
9197 // am i srcdn auth?
9198 if (srcdn->is_auth()) {
9199 set<mds_rank_t> srcdnrep;
9200 srcdn->list_replicas(srcdnrep);
9201
9202 bool reply_witness = false;
9203 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
9204 // freeze?
9205 // we need this to
9206 // - avoid conflicting lock state changes
9207 // - avoid concurrent updates to the inode
9208 // (this could also be accomplished with the versionlock)
9209 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
9210 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
9211 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
9212
9213 // unfreeze auth pin after freezing the inode to avoid queueing waiters
9214 if (srcdnl->get_inode()->is_frozen_auth_pin())
9215 mdr->unfreeze_auth_pin();
9216
9217 if (!frozen_inode) {
9218 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
9219 return;
9220 }
9221
9222 /*
9223 * set ambiguous auth for srci
9224 * NOTE: we don't worry about ambiguous cache expire as we do
9225 * with subtree migrations because all peers will pin
9226 * srcdn->get_inode() for duration of this rename.
9227 */
9228 mdr->set_ambiguous_auth(srcdnl->get_inode());
9229
9230 // just mark the source inode as ambiguous auth if more than two MDS are involved.
9231 // the leader will send another OP_RENAMEPREP peer request later.
9232 if (mdr->peer_request->witnesses.size() > 1) {
9233 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
9234 reply_witness = true;
9235 }
9236
9237 // make sure bystanders have received all lock related messages
9238 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9239 if (*p == mdr->peer_to_mds ||
9240 (mds->is_cluster_degraded() &&
9241 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
9242 continue;
9243 auto notify = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMENOTIFY);
9244 mds->send_message_mds(notify, *p);
9245 mdr->more()->waiting_on_peer.insert(*p);
9246 }
9247
9248 // make sure clients have received all cap related messages
9249 set<client_t> export_client_set;
9250 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
9251
9252 MDSGatherBuilder gather(g_ceph_context);
9253 flush_client_sessions(export_client_set, gather);
9254 if (gather.has_subs()) {
9255 mdr->more()->waiting_on_peer.insert(MDS_RANK_NONE);
9256 gather.set_finisher(new C_MDS_PeerRenameSessionsFlushed(this, mdr));
9257 gather.activate();
9258 }
9259 }
9260
9261 // is witness list sufficient?
9262 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
9263 if (*p == mdr->peer_to_mds ||
9264 mdr->peer_request->witnesses.count(*p)) continue;
9265 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
9266 reply_witness = true;
9267 break;
9268 }
9269
9270 if (reply_witness) {
9271 ceph_assert(!srcdnrep.empty());
9272 auto reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9273 reply->witnesses.swap(srcdnrep);
9274 mds->send_message_mds(reply, mdr->peer_to_mds);
9275 mdr->reset_peer_request();
9276 return;
9277 }
9278 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
9279 if (!mdr->more()->waiting_on_peer.empty()) {
9280 dout(10) << " still waiting for rename notify acks from "
9281 << mdr->more()->waiting_on_peer << dendl;
9282 return;
9283 }
9284 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
9285 // set ambiguous auth for srci on witnesses
9286 mdr->set_ambiguous_auth(srcdnl->get_inode());
9287 }
9288
9289 // encode everything we'd need to roll this back... basically, just the original state.
9290 rename_rollback rollback;
9291
9292 rollback.reqid = mdr->reqid;
9293
9294 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
9295 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9296 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
9297 rollback.orig_src.dname = srcdn->get_name();
9298 if (srcdnl->is_primary())
9299 rollback.orig_src.ino = srcdnl->get_inode()->ino();
9300 else {
9301 ceph_assert(srcdnl->is_remote());
9302 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
9303 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
9304 }
9305
9306 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
9307 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
9308 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
9309 rollback.orig_dest.dname = destdn->get_name();
9310 if (destdnl->is_primary())
9311 rollback.orig_dest.ino = destdnl->get_inode()->ino();
9312 else if (destdnl->is_remote()) {
9313 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
9314 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
9315 }
9316
9317 if (straydn) {
9318 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
9319 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
9320 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
9321 rollback.stray.dname = straydn->get_name();
9322 }
9323 if (mdr->peer_request->desti_snapbl.length()) {
9324 CInode *oldin = destdnl->get_inode();
9325 if (oldin->snaprealm) {
9326 encode(true, rollback.desti_snapbl);
9327 oldin->encode_snap_blob(rollback.desti_snapbl);
9328 } else {
9329 encode(false, rollback.desti_snapbl);
9330 }
9331 }
9332 if (mdr->peer_request->srci_snapbl.length()) {
9333 if (srci->snaprealm) {
9334 encode(true, rollback.srci_snapbl);
9335 srci->encode_snap_blob(rollback.srci_snapbl);
9336 } else {
9337 encode(false, rollback.srci_snapbl);
9338 }
9339 }
9340 encode(rollback, mdr->more()->rollback_bl);
9341 // FIXME: rollback snaprealm
9342 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
9343
9344 // journal.
9345 mdr->ls = mdlog->get_current_segment();
9346 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_prep", mdr->reqid, mdr->peer_to_mds,
9347 EPeerUpdate::OP_PREPARE, EPeerUpdate::RENAME);
9348 mdlog->start_entry(le);
9349 le->rollback = mdr->more()->rollback_bl;
9350
9351 bufferlist blah; // inode import data... obviously not used if we're the peer
9352 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, mdr->peer_request->alternate_name, straydn);
9353
9354 if (le->commit.empty()) {
9355 dout(10) << " empty metablob, skipping journal" << dendl;
9356 mdlog->cancel_entry(le);
9357 mdr->ls = NULL;
9358 _logged_peer_rename(mdr, srcdn, destdn, straydn);
9359 } else {
9360 mdcache->add_uncommitted_peer(mdr->reqid, mdr->ls, mdr->peer_to_mds);
9361 mdr->more()->peer_update_journaled = true;
9362 submit_mdlog_entry(le, new C_MDS_PeerRenamePrep(this, mdr, srcdn, destdn, straydn),
9363 mdr, __func__);
9364 mdlog->flush();
9365 }
9366 }
9367
9368 void Server::_logged_peer_rename(MDRequestRef& mdr,
9369 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9370 {
9371 dout(10) << "_logged_peer_rename " << *mdr << dendl;
9372
9373 // prepare ack
9374 ref_t<MMDSPeerRequest> reply;
9375 if (!mdr->aborted) {
9376 reply = make_message<MMDSPeerRequest>(mdr->reqid, mdr->attempt, MMDSPeerRequest::OP_RENAMEPREPACK);
9377 if (!mdr->more()->peer_update_journaled)
9378 reply->mark_not_journaled();
9379 }
9380
9381 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9382 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
9383
9384 // export srci?
9385 if (srcdn->is_auth() && srcdnl->is_primary()) {
9386 // set export bounds for CInode::encode_export()
9387 if (reply) {
9388 std::vector<CDir*> bounds;
9389 if (srcdnl->get_inode()->is_dir()) {
9390 srcdnl->get_inode()->get_dirfrags(bounds);
9391 for (const auto& bound : bounds) {
9392 bound->state_set(CDir::STATE_EXPORTBOUND);
9393 }
9394 }
9395
9396 map<client_t,entity_inst_t> exported_client_map;
9397 map<client_t, client_metadata_t> exported_client_metadata_map;
9398 bufferlist inodebl;
9399 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
9400 exported_client_map,
9401 exported_client_metadata_map);
9402
9403 for (const auto& bound : bounds) {
9404 bound->state_clear(CDir::STATE_EXPORTBOUND);
9405 }
9406
9407 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
9408 encode(exported_client_metadata_map, reply->inode_export);
9409 reply->inode_export.claim_append(inodebl);
9410 reply->inode_export_v = srcdnl->get_inode()->get_version();
9411 }
9412
9413 // remove mdr auth pin
9414 mdr->auth_unpin(srcdnl->get_inode());
9415 mdr->more()->is_inode_exporter = true;
9416
9417 if (srcdnl->get_inode()->is_dirty())
9418 srcdnl->get_inode()->mark_clean();
9419
9420 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
9421 }
9422
9423 // apply
9424 _rename_apply(mdr, srcdn, destdn, straydn);
9425
9426 CDentry::linkage_t *destdnl = destdn->get_linkage();
9427
9428 // bump popularity
9429 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
9430 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
9431 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
9432
9433 // done.
9434 mdr->reset_peer_request();
9435 mdr->straydn = 0;
9436
9437 if (reply) {
9438 mds->send_message_mds(reply, mdr->peer_to_mds);
9439 } else {
9440 ceph_assert(mdr->aborted);
9441 dout(10) << " abort flag set, finishing" << dendl;
9442 mdcache->request_finish(mdr);
9443 }
9444 }
9445
9446 void Server::_commit_peer_rename(MDRequestRef& mdr, int r,
9447 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9448 {
9449 dout(10) << "_commit_peer_rename " << *mdr << " r=" << r << dendl;
9450
9451 CInode *in = destdn->get_linkage()->get_inode();
9452
9453 inodeno_t migrated_stray;
9454 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
9455 migrated_stray = in->ino();
9456
9457 MDSContext::vec finished;
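// r == 0: the leader committed -- finish the inode export, unfreeze, clear ambiguous auth
// and journal our commit. r < 0: the leader aborted -- roll back using the saved
// rollback_bl, if any.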
9458 if (r == 0) {
9459 // unfreeze+singleauth inode
9460 // hmm, do i really need to delay this?
9461 if (mdr->more()->is_inode_exporter) {
9462 // drop our pins
9463 // we exported, clear out any xlocks that we moved to another MDS
9464
9465 for (auto i = mdr->locks.lower_bound(&in->versionlock);
9466 i != mdr->locks.end(); ) {
9467 SimpleLock *lock = i->lock;
9468 if (lock->get_parent() != in)
9469 break;
9470 // we only care about xlocks on the exported inode
9471 if (i->is_xlock() && !lock->is_locallock())
9472 mds->locker->xlock_export(i++, mdr.get());
9473 else
9474 ++i;
9475 }
9476
9477 map<client_t,Capability::Import> peer_imported;
9478 auto bp = mdr->more()->inode_import.cbegin();
9479 decode(peer_imported, bp);
9480
9481 dout(10) << " finishing inode export on " << *in << dendl;
9482 mdcache->migrator->finish_export_inode(in, mdr->peer_to_mds, peer_imported, finished);
9483 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
9484
9485 // unfreeze
9486 ceph_assert(in->is_frozen_inode());
9487 in->unfreeze_inode(finished);
9488 }
9489
9490 // singleauth
9491 if (mdr->more()->is_ambiguous_auth) {
9492 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9493 mdr->more()->is_ambiguous_auth = false;
9494 }
9495
9496 if (straydn && mdr->more()->peer_update_journaled) {
9497 CInode *strayin = straydn->get_projected_linkage()->get_inode();
9498 if (strayin && !strayin->snaprealm)
9499 mdcache->clear_dirty_bits_for_stray(strayin);
9500 }
9501
9502 mds->queue_waiters(finished);
9503 mdr->cleanup();
9504
9505 if (mdr->more()->peer_update_journaled) {
9506 // write a commit to the journal
9507 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_commit", mdr->reqid,
9508 mdr->peer_to_mds, EPeerUpdate::OP_COMMIT,
9509 EPeerUpdate::RENAME);
9510 mdlog->start_entry(le);
9511 submit_mdlog_entry(le, new C_MDS_CommittedPeer(this, mdr), mdr, __func__);
9512 mdlog->flush();
9513 } else {
9514 _committed_peer(mdr);
9515 }
9516 } else {
9517
9518 // abort
9519 // rollback_bl may be empty if we froze the inode but had to provide an expanded
9520 // witness list from the leader, and they failed before we tried prep again.
9521 if (mdr->more()->rollback_bl.length()) {
9522 if (mdr->more()->is_inode_exporter) {
9523 dout(10) << " reversing inode export of " << *in << dendl;
9524 in->abort_export();
9525 }
9526 if (mdcache->is_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds)) {
9527 mdcache->remove_ambiguous_peer_update(mdr->reqid, mdr->peer_to_mds);
9528 // rollback but preserve the peer request
9529 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, false);
9530 mdr->more()->rollback_bl.clear();
9531 } else
9532 do_rename_rollback(mdr->more()->rollback_bl, mdr->peer_to_mds, mdr, true);
9533 } else {
9534 dout(10) << " rollback_bl empty, not rollback back rename (leader failed after getting extra witnesses?)" << dendl;
9535 // singleauth
9536 if (mdr->more()->is_ambiguous_auth) {
9537 if (srcdn->is_auth())
9538 mdr->more()->rename_inode->unfreeze_inode(finished);
9539
9540 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9541 mdr->more()->is_ambiguous_auth = false;
9542 }
9543 mds->queue_waiters(finished);
9544 mdcache->request_finish(mdr);
9545 }
9546 }
9547
9548 if (migrated_stray && mds->is_stopping())
9549 mdcache->shutdown_export_stray_finish(migrated_stray);
9550 }
9551
9552 static void _rollback_repair_dir(MutationRef& mut, CDir *dir,
9553 rename_rollback::drec &r, utime_t ctime,
9554 bool isdir, const nest_info_t &rstat)
9555 {
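// add the restored dentry's contribution back into the dir's fragstat/rstat and, if our
// rename was the last thing to touch the dir, restore the old mtime/rctime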
9556 auto pf = dir->project_fnode(mut);
9557 pf->version = dir->pre_dirty();
9558
9559 if (isdir) {
9560 pf->fragstat.nsubdirs += 1;
9561 } else {
9562 pf->fragstat.nfiles += 1;
9563 }
9564 if (r.ino) {
9565 pf->rstat.rbytes += rstat.rbytes;
9566 pf->rstat.rfiles += rstat.rfiles;
9567 pf->rstat.rsubdirs += rstat.rsubdirs;
9568 pf->rstat.rsnaps += rstat.rsnaps;
9569 }
9570 if (pf->fragstat.mtime == ctime) {
9571 pf->fragstat.mtime = r.dirfrag_old_mtime;
9572 if (pf->rstat.rctime == ctime)
9573 pf->rstat.rctime = r.dirfrag_old_rctime;
9574 }
9575 mut->add_updated_lock(&dir->get_inode()->filelock);
9576 mut->add_updated_lock(&dir->get_inode()->nestlock);
9577 }
9578
9579 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
9580 MutationRef mut;
9581 CDentry *srcdn;
9582 version_t srcdnpv;
9583 CDentry *destdn;
9584 CDentry *straydn;
9585 map<client_t,ref_t<MClientSnap>> splits[2];
9586 bool finish_mdr;
9587 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
9588 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
9589 map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
9590 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
9591 straydn(st), finish_mdr(f) {
9592 splits[0].swap(_splits[0]);
9593 splits[1].swap(_splits[1]);
9594 }
9595 void finish(int r) override {
9596 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
9597 destdn, straydn, splits, finish_mdr);
9598 }
9599 };
9600
9601 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t leader, MDRequestRef& mdr,
9602 bool finish_mdr)
9603 {
9604 rename_rollback rollback;
9605 auto p = rbl.cbegin();
9606 decode(rollback, p);
9607
9608 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
9609 // need to finish this update before sending resolve to claim the subtree
9610 mdcache->add_rollback(rollback.reqid, leader);
9611
9612 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
9613 mut->ls = mds->mdlog->get_current_segment();
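// Rebuild the pre-rename state from the rollback record: locate the original src/dest/stray
// dentries and inodes (any of which may no longer be cached), restore the old linkages and
// inode/dirfrag attributes, then journal the rollback as an EPeerUpdate::OP_ROLLBACK event.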
9614
9615 CDentry *srcdn = NULL;
9616 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
9617 if (!srcdir)
9618 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
9619 if (srcdir) {
9620 dout(10) << " srcdir " << *srcdir << dendl;
9621 srcdn = srcdir->lookup(rollback.orig_src.dname);
9622 if (srcdn) {
9623 dout(10) << " srcdn " << *srcdn << dendl;
9624 ceph_assert(srcdn->get_linkage()->is_null());
9625 } else
9626 dout(10) << " srcdn not found" << dendl;
9627 } else
9628 dout(10) << " srcdir not found" << dendl;
9629
9630 CDentry *destdn = NULL;
9631 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
9632 if (!destdir)
9633 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
9634 if (destdir) {
9635 dout(10) << " destdir " << *destdir << dendl;
9636 destdn = destdir->lookup(rollback.orig_dest.dname);
9637 if (destdn)
9638 dout(10) << " destdn " << *destdn << dendl;
9639 else
9640 dout(10) << " destdn not found" << dendl;
9641 } else
9642 dout(10) << " destdir not found" << dendl;
9643
9644 CInode *in = NULL;
9645 if (rollback.orig_src.ino) {
9646 in = mdcache->get_inode(rollback.orig_src.ino);
9647 if (in && in->is_dir())
9648 ceph_assert(srcdn && destdn);
9649 } else
9650 in = mdcache->get_inode(rollback.orig_src.remote_ino);
9651
9652 CDir *straydir = NULL;
9653 CDentry *straydn = NULL;
9654 if (rollback.stray.dirfrag.ino) {
9655 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
9656 if (straydir) {
9657 dout(10) << "straydir " << *straydir << dendl;
9658 straydn = straydir->lookup(rollback.stray.dname);
9659 if (straydn) {
9660 dout(10) << " straydn " << *straydn << dendl;
9661 ceph_assert(straydn->get_linkage()->is_primary());
9662 } else
9663 dout(10) << " straydn not found" << dendl;
9664 } else
9665 dout(10) << "straydir not found" << dendl;
9666 }
9667
9668 CInode *target = NULL;
9669 if (rollback.orig_dest.ino) {
9670 target = mdcache->get_inode(rollback.orig_dest.ino);
9671 if (target)
9672 ceph_assert(destdn && straydn);
9673 } else if (rollback.orig_dest.remote_ino)
9674 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
9675
9676 // can't use is_auth() in the resolve stage
9677 mds_rank_t whoami = mds->get_nodeid();
9678 // peer
9679 ceph_assert(!destdn || destdn->authority().first != whoami);
9680 ceph_assert(!straydn || straydn->authority().first != whoami);
9681
9682 bool force_journal_src = false;
9683 bool force_journal_dest = false;
9684 if (in && in->is_dir() && srcdn->authority().first != whoami)
9685 force_journal_src = _need_force_journal(in, false);
9686 if (in && target && target->is_dir())
9687 force_journal_dest = _need_force_journal(in, true);
9688
9689 version_t srcdnpv = 0;
9690 // repair src
9691 if (srcdn) {
9692 if (srcdn->authority().first == whoami)
9693 srcdnpv = srcdn->pre_dirty();
9694 if (rollback.orig_src.ino) {
9695 ceph_assert(in);
9696 srcdn->push_projected_linkage(in);
9697 } else
9698 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
9699 rollback.orig_src.remote_d_type);
9700 }
9701
9702 map<client_t,ref_t<MClientSnap>> splits[2];
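// splits[0]/splits[1] collect MClientSnap notifications produced when the renamed inode's
// or the target's snaprealm is merged back into its parent realm; they are handed to
// _rename_rollback_finish() via the logged context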
9703
9704 const CInode::mempool_inode *pip = nullptr;
9705 if (in) {
9706 bool projected;
9707 CDir *pdir = in->get_projected_parent_dir();
9708 if (pdir->authority().first == whoami) {
9709 auto pi = in->project_inode(mut);
9710 pi.inode->version = in->pre_dirty();
9711 if (pdir != srcdir) {
9712 auto pf = pdir->project_fnode(mut);
9713 pf->version = pdir->pre_dirty();
9714 }
9715 if (pi.inode->ctime == rollback.ctime)
9716 pi.inode->ctime = rollback.orig_src.old_ctime;
9717 projected = true;
9718 } else {
9719 if (in->get_inode()->ctime == rollback.ctime) {
9720 auto _inode = CInode::allocate_inode(*in->get_inode());
9721 _inode->ctime = rollback.orig_src.old_ctime;
9722 in->reset_inode(_inode);
9723 }
9724 projected = false;
9725 }
9726 pip = in->get_projected_inode().get();
9727
9728 if (rollback.srci_snapbl.length() && in->snaprealm) {
9729 bool hadrealm;
9730 auto p = rollback.srci_snapbl.cbegin();
9731 decode(hadrealm, p);
9732 if (hadrealm) {
9733 if (projected && !mds->is_resolve()) {
9734 sr_t *new_srnode = new sr_t();
9735 decode(*new_srnode, p);
9736 in->project_snaprealm(new_srnode);
9737 } else
9738 decode(in->snaprealm->srnode, p);
9739 } else {
9740 SnapRealm *realm;
9741 if (rollback.orig_src.ino) {
9742 ceph_assert(srcdir);
9743 realm = srcdir->get_inode()->find_snaprealm();
9744 } else {
9745 realm = in->snaprealm->parent;
9746 }
9747 if (!mds->is_resolve())
9748 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
9749 if (projected)
9750 in->project_snaprealm(NULL);
9751 else
9752 in->snaprealm->merge_to(realm);
9753 }
9754 }
9755 }
9756
9757 // repair dest
9758 if (destdn) {
9759 if (rollback.orig_dest.ino && target) {
9760 destdn->push_projected_linkage(target);
9761 } else if (rollback.orig_dest.remote_ino) {
9762 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
9763 rollback.orig_dest.remote_d_type);
9764 } else {
9765 // the dentry will be trimmed soon, it's ok to have wrong linkage
9766 if (rollback.orig_dest.ino)
9767 ceph_assert(mds->is_resolve());
9768 destdn->push_projected_linkage();
9769 }
9770 }
9771
9772 if (straydn)
9773 straydn->push_projected_linkage();
9774
9775 if (target) {
9776 bool projected;
9777 CInode::inode_ptr ti;
9778 CDir *pdir = target->get_projected_parent_dir();
9779 if (pdir->authority().first == whoami) {
9780 auto pi = target->project_inode(mut);
9781 pi.inode->version = target->pre_dirty();
9782 if (pdir != srcdir) {
9783 auto pf = pdir->project_fnode(mut);
9784 pf->version = pdir->pre_dirty();
9785 }
9786 ti = pi.inode;
9787 projected = true;
9788 } else {
9789 ti = CInode::allocate_inode(*target->get_inode());
9790 projected = false;
9791 }
9792
9793 if (ti->ctime == rollback.ctime)
9794 ti->ctime = rollback.orig_dest.old_ctime;
9795 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
9796 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
9797 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
9798 else
9799 ceph_assert(rollback.orig_dest.remote_ino &&
9800 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
9801 } else
9802 ti->nlink++;
9803
9804 if (!projected)
9805 target->reset_inode(ti);
9806
9807 if (rollback.desti_snapbl.length() && target->snaprealm) {
9808 bool hadrealm;
9809 auto p = rollback.desti_snapbl.cbegin();
9810 decode(hadrealm, p);
9811 if (hadrealm) {
9812 if (projected && !mds->is_resolve()) {
9813 sr_t *new_srnode = new sr_t();
9814 decode(*new_srnode, p);
9815 target->project_snaprealm(new_srnode);
9816 } else
9817 decode(target->snaprealm->srnode, p);
9818 } else {
9819 SnapRealm *realm;
9820 if (rollback.orig_dest.ino) {
9821 ceph_assert(destdir);
9822 realm = destdir->get_inode()->find_snaprealm();
9823 } else {
9824 realm = target->snaprealm->parent;
9825 }
9826 if (!mds->is_resolve())
9827 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
9828 if (projected)
9829 target->project_snaprealm(NULL);
9830 else
9831 target->snaprealm->merge_to(realm);
9832 }
9833 }
9834 }
9835
9836 if (srcdn && srcdn->authority().first == whoami) {
9837 nest_info_t blah; // placeholder rstat, used only when there is no source inode (pip == nullptr)
9838 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
9839 in && in->is_dir(), pip ? pip->accounted_rstat : blah);
9840 }
9841
9842 if (srcdn)
9843 dout(0) << " srcdn back to " << *srcdn << dendl;
9844 if (in)
9845 dout(0) << " srci back to " << *in << dendl;
9846 if (destdn)
9847 dout(0) << " destdn back to " << *destdn << dendl;
9848 if (target)
9849 dout(0) << " desti back to " << *target << dendl;
9850
9851 // journal it
9852 EPeerUpdate *le = new EPeerUpdate(mdlog, "peer_rename_rollback", rollback.reqid, leader,
9853 EPeerUpdate::OP_ROLLBACK, EPeerUpdate::RENAME);
9854 mdlog->start_entry(le);
9855
9856 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
9857 le->commit.add_dir_context(srcdir);
9858 if (rollback.orig_src.ino)
9859 le->commit.add_primary_dentry(srcdn, 0, true);
9860 else
9861 le->commit.add_remote_dentry(srcdn, true);
9862 }
9863
9864 if (!rollback.orig_src.ino && // remote linkage
9865 in && in->authority().first == whoami) {
9866 le->commit.add_dir_context(in->get_projected_parent_dir());
9867 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9868 }
9869
9870 if (force_journal_dest) {
9871 ceph_assert(rollback.orig_dest.ino);
9872 le->commit.add_dir_context(destdir);
9873 le->commit.add_primary_dentry(destdn, 0, true);
9874 }
9875
9876 // peer: no need to journal straydn
9877
9878 if (target && target != in && target->authority().first == whoami) {
9879 ceph_assert(rollback.orig_dest.remote_ino);
9880 le->commit.add_dir_context(target->get_projected_parent_dir());
9881 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
9882 }
9883
9884 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
9885 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
9886 le->commit.renamed_dirino = in->ino();
9887 if (srcdn->authority().first == whoami) {
9888 auto&& ls = in->get_dirfrags();
9889 for (const auto& dir : ls) {
9890 if (!dir->is_auth())
9891 le->commit.renamed_dir_frags.push_back(dir->get_frag());
9892 }
9893 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
9894 }
9895 } else if (force_journal_dest) {
9896 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
9897 le->commit.renamed_dirino = target->ino();
9898 }
9899
9900 if (target && target->is_dir()) {
9901 ceph_assert(destdn);
9902 mdcache->project_subtree_rename(target, straydir, destdir);
9903 }
9904
9905 if (in && in->is_dir()) {
9906 ceph_assert(srcdn);
9907 mdcache->project_subtree_rename(in, destdir, srcdir);
9908 }
9909
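  // If this is a live peer request whose prepare was never journaled,
  // there is nothing to roll back on disk: cancel the log entry and
  // finish immediately. Otherwise journal the rollback and finish from
  // the log-commit callback.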
9910 if (mdr && !mdr->more()->peer_update_journaled) {
9911 ceph_assert(le->commit.empty());
9912 mdlog->cancel_entry(le);
9913 mut->ls = NULL;
9914 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
9915 } else {
9916 ceph_assert(!le->commit.empty());
9917 if (mdr)
9918 mdr->more()->peer_update_journaled = false;
9919 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
9920 srcdn, srcdnpv, destdn, straydn,
9921 splits, finish_mdr);
9922 submit_mdlog_entry(le, fin, mdr, __func__);
9923 mdlog->flush();
9924 }
9925 }
9926
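// Second half of the rollback: pop the projected linkages, mark the source
// dentry dirty if we are auth, apply the mutation, re-adjust the subtree map
// for any renamed directories, and either trim non-auth subtrees (resolve
// stage) or send the pending snaprealm-split notifications to clients.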
9927 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
9928 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
9929 map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
9930 {
9931 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
9932
9933 if (straydn) {
9934 straydn->get_dir()->unlink_inode(straydn);
9935 straydn->pop_projected_linkage();
9936 }
9937 if (destdn) {
9938 destdn->get_dir()->unlink_inode(destdn);
9939 destdn->pop_projected_linkage();
9940 }
9941 if (srcdn) {
9942 srcdn->pop_projected_linkage();
9943 if (srcdn->authority().first == mds->get_nodeid()) {
9944 srcdn->mark_dirty(srcdnpv, mut->ls);
9945 if (srcdn->get_linkage()->is_primary())
9946 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
9947 }
9948 }
9949
9950 mut->apply();
9951
9952 if (srcdn && srcdn->get_linkage()->is_primary()) {
9953 CInode *in = srcdn->get_linkage()->get_inode();
9954 if (in && in->is_dir()) {
9955 ceph_assert(destdn);
9956 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
9957 }
9958 }
9959
9960 if (destdn) {
9961 CInode *oldin = destdn->get_linkage()->get_inode();
9962 // update subtree map?
9963 if (oldin && oldin->is_dir()) {
9964 ceph_assert(straydn);
9965 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
9966 }
9967 }
9968
9969 if (mds->is_resolve()) {
9970 CDir *root = NULL;
9971 if (straydn)
9972 root = mdcache->get_subtree_root(straydn->get_dir());
9973 else if (destdn)
9974 root = mdcache->get_subtree_root(destdn->get_dir());
9975 if (root)
9976 mdcache->try_trim_non_auth_subtree(root);
9977 } else {
9978 mdcache->send_snaps(splits[1]);
9979 mdcache->send_snaps(splits[0]);
9980 }
9981
9982 if (mdr) {
9983 MDSContext::vec finished;
9984 if (mdr->more()->is_ambiguous_auth) {
9985 if (srcdn->is_auth())
9986 mdr->more()->rename_inode->unfreeze_inode(finished);
9987
9988 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9989 mdr->more()->is_ambiguous_auth = false;
9990 }
9991 mds->queue_waiters(finished);
9992 if (finish_mdr || mdr->aborted)
9993 mdcache->request_finish(mdr);
9994 else
9995 mdr->more()->peer_rolling_back = false;
9996 }
9997
9998 mdcache->finish_rollback(mut->reqid, mdr);
9999
10000 mut->cleanup();
10001 }
10002
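// Leader-side handler for a peer's rename-prepare ack. Records the peer as a
// witness (or notes the extra witnesses it asks us to contact), imports the
// exported srci inode state if any, and re-dispatches the client request once
// no more peers are being waited on.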
10003 void Server::handle_peer_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10004 {
10005 dout(10) << "handle_peer_rename_prep_ack " << *mdr
10006 << " witnessed by " << ack->get_source()
10007 << " " << *ack << dendl;
10008 mds_rank_t from = mds_rank_t(ack->get_source().num());
10009
10010 // note peer
10011 mdr->more()->peers.insert(from);
10012 if (mdr->more()->srcdn_auth_mds == from &&
10013 mdr->more()->is_remote_frozen_authpin &&
10014 !mdr->more()->is_ambiguous_auth) {
10015 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
10016 }
10017
10018 // witnessed? or add extra witnesses?
10019 ceph_assert(mdr->more()->witnessed.count(from) == 0);
10020 if (ack->is_interrupted()) {
10021 dout(10) << " peer request interrupted, noop" << dendl;
10022 } else if (ack->witnesses.empty()) {
10023 mdr->more()->witnessed.insert(from);
10024 if (!ack->is_not_journaled())
10025 mdr->more()->has_journaled_peers = true;
10026 } else {
10027 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
10028 mdr->more()->extra_witnesses = ack->witnesses;
10029 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
10030 }
10031
10032 // srci import?
10033 if (ack->inode_export.length()) {
10034 dout(10) << " got srci import" << dendl;
10035 mdr->more()->inode_import.share(ack->inode_export);
10036 mdr->more()->inode_import_v = ack->inode_export_v;
10037 }
10038
10039 // remove from waiting list
10040 ceph_assert(mdr->more()->waiting_on_peer.count(from));
10041 mdr->more()->waiting_on_peer.erase(from);
10042
10043 if (mdr->more()->waiting_on_peer.empty())
10044 dispatch_client_request(mdr); // go again!
10045 else
10046 dout(10) << "still waiting on peers " << mdr->more()->waiting_on_peer << dendl;
10047 }
10048
10049 void Server::handle_peer_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSPeerRequest> &ack)
10050 {
10051 dout(10) << "handle_peer_rename_notify_ack " << *mdr << " from mds."
10052 << ack->get_source() << dendl;
10053 ceph_assert(mdr->is_peer());
10054 mds_rank_t from = mds_rank_t(ack->get_source().num());
10055
10056 if (mdr->more()->waiting_on_peer.count(from)) {
10057 mdr->more()->waiting_on_peer.erase(from);
10058
10059 if (mdr->more()->waiting_on_peer.empty()) {
10060 if (mdr->peer_request)
10061 dispatch_peer_request(mdr);
10062 } else
10063 dout(10) << " still waiting for rename notify acks from "
10064 << mdr->more()->waiting_on_peer << dendl;
10065 }
10066 }
10067
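// Called once the client sessions touched by the rename have been flushed;
// the MDS_RANK_NONE entry in waiting_on_peer is used as a placeholder for
// that dependency.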
10068 void Server::_peer_rename_sessions_flushed(MDRequestRef& mdr)
10069 {
10070 dout(10) << "_peer_rename_sessions_flushed " << *mdr << dendl;
10071
10072 if (mdr->more()->waiting_on_peer.count(MDS_RANK_NONE)) {
10073 mdr->more()->waiting_on_peer.erase(MDS_RANK_NONE);
10074
10075 if (mdr->more()->waiting_on_peer.empty()) {
10076 if (mdr->peer_request)
10077 dispatch_peer_request(mdr);
10078 } else
10079 dout(10) << " still waiting for rename notify acks from "
10080 << mdr->more()->waiting_on_peer << dendl;
10081 }
10082 }
10083
10084 // snaps
10085 /* This function takes responsibility for the passed mdr*/
10086 void Server::handle_client_lssnap(MDRequestRef& mdr)
10087 {
10088 const cref_t<MClientRequest> &req = mdr->client_request;
10089
10090 // traverse to path
10091 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10092 if (!diri)
10093 return;
10094
10095 if (!diri->is_dir()) {
10096 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10097 return;
10098 }
10099 dout(10) << "lssnap on " << *diri << dendl;
10100
10101 // lock snap
10102 if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
10103 return;
10104
10105 if (!check_access(mdr, diri, MAY_READ))
10106 return;
10107
10108 SnapRealm *realm = diri->find_snaprealm();
10109 map<snapid_t,const SnapInfo*> infomap;
10110 realm->get_snap_info(infomap, diri->get_oldest_snap());
10111
10112 unsigned max_entries = req->head.args.readdir.max_entries;
10113 if (!max_entries)
10114 max_entries = infomap.size();
10115 int max_bytes = req->head.args.readdir.max_bytes;
10116 if (!max_bytes)
10117 // make sure at least one item can be encoded
10118 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
10119
10120 __u64 last_snapid = 0;
10121 string offset_str = req->get_path2();
10122 if (!offset_str.empty())
10123 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
10124
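  // Build a readdir-style reply: an (empty) DirStat header, then for each
  // snapshot a dentry name, an infinite lease and an InodeStat of the
  // directory at that snapid, followed by the entry count and the
  // end/complete flags.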
10125 //Empty DirStat
10126 bufferlist dirbl;
10127 static DirStat empty;
10128 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
10129
10130 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
10131
10132 __u32 num = 0;
10133 bufferlist dnbl;
10134 auto p = infomap.upper_bound(last_snapid);
10135 for (; p != infomap.end() && num < max_entries; ++p) {
10136 dout(10) << p->first << " -> " << *p->second << dendl;
10137
10138 // actual
10139 string snap_name;
10140 if (p->second->ino == diri->ino())
10141 snap_name = p->second->name;
10142 else
10143 snap_name = p->second->get_long_name();
10144
10145 unsigned start_len = dnbl.length();
10146 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
10147 break;
10148
10149 encode(snap_name, dnbl);
10150 //infinite lease
10151 LeaseStat e(CEPH_LEASE_VALID, -1, 0);
10152 mds->locker->encode_lease(dnbl, mdr->session->info, e);
10153 dout(20) << "encode_infinite_lease" << dendl;
10154
10155 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
10156 if (r < 0) {
10157 bufferlist keep;
10158 keep.substr_of(dnbl, 0, start_len);
10159 dnbl.swap(keep);
10160 break;
10161 }
10162 ++num;
10163 }
10164
10165 encode(num, dirbl);
10166 __u16 flags = 0;
10167 if (p == infomap.end()) {
10168 flags = CEPH_READDIR_FRAG_END;
10169 if (last_snapid == 0)
10170 flags |= CEPH_READDIR_FRAG_COMPLETE;
10171 }
10172 encode(flags, dirbl);
10173 dirbl.claim_append(dnbl);
10174
10175 mdr->reply_extra_bl = dirbl;
10176 mdr->tracei = diri;
10177 respond_to_request(mdr, 0);
10178 }
10179
10180
10181 // MKSNAP
10182
10183 struct C_MDS_mksnap_finish : public ServerLogContext {
10184 CInode *diri;
10185 SnapInfo info;
10186 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
10187 ServerLogContext(s, r), diri(di), info(i) {}
10188 void finish(int r) override {
10189 server->_mksnap_finish(mdr, diri, info);
10190 }
10191 };
10192
10193 /* This function takes responsibility for the passed mdr*/
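// Creates a snapshot of the given directory. On the client side this is
// typically driven by an ordinary mkdir inside the snapshot directory
// (".snap" by default), e.g. `mkdir dir/.snap/mysnap`. The operation is
// two-phase: a snapid/stid is first reserved via the snap table client (the
// request is retried once prepare_create completes), then the inode and
// snaprealm changes are journaled and committed in _mksnap_finish().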
10194 void Server::handle_client_mksnap(MDRequestRef& mdr)
10195 {
10196 const cref_t<MClientRequest> &req = mdr->client_request;
10197 // make sure we have as new a map as the client
10198 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
10199 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
10200 return;
10201 }
10202 if (!mds->mdsmap->allows_snaps()) {
10203 // snapshots are disabled until the file system's 'allow_new_snaps' flag is set
10204 respond_to_request(mdr, -CEPHFS_EPERM);
10205 return;
10206 }
10207
10208 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10209 if (!diri)
10210 return;
10211
10212 // dir only
10213 if (!diri->is_dir()) {
10214 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10215 return;
10216 }
10217 if (diri->is_system() && !diri->is_root()) {
10218 // no snaps in system dirs (root is ok)
10219 respond_to_request(mdr, -CEPHFS_EPERM);
10220 return;
10221 }
10222
10223 std::string_view snapname = req->get_filepath().last_dentry();
10224
10225 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10226 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
10227 respond_to_request(mdr, -CEPHFS_EPERM);
10228 return;
10229 }
10230
10231 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
10232
10233 // lock snap
10234 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10235 MutationImpl::LockOpVec lov;
10236 lov.add_xlock(&diri->snaplock);
10237 if (!mds->locker->acquire_locks(mdr, lov))
10238 return;
10239
10240 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10241 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10242 return;
10243 }
10244 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10245 }
10246
10247 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10248 return;
10249
10250 if (inodeno_t subvol_ino = diri->find_snaprealm()->get_subvolume_ino();
10251 (subvol_ino && subvol_ino != diri->ino())) {
10252 respond_to_request(mdr, -CEPHFS_EPERM);
10253 return;
10254 }
10255
10256 // check if we can create any more snapshots
10257 // we don't allow any more if we are already at or beyond the limit
10258 if (diri->snaprealm &&
10259 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
10260 respond_to_request(mdr, -CEPHFS_EMLINK);
10261 return;
10262 }
10263
10264 // make sure name is unique
10265 if (diri->snaprealm &&
10266 diri->snaprealm->exists(snapname)) {
10267 respond_to_request(mdr, -CEPHFS_EEXIST);
10268 return;
10269 }
10270 if (snapname.length() == 0 ||
10271 snapname[0] == '_') {
10272 respond_to_request(mdr, -CEPHFS_EINVAL);
10273 return;
10274 }
10275
10276 // allocate a snapid
10277 if (!mdr->more()->stid) {
10278 // prepare an stid
10279 mds->snapclient->prepare_create(diri->ino(), snapname,
10280 mdr->get_mds_stamp(),
10281 &mdr->more()->stid, &mdr->more()->snapidbl,
10282 new C_MDS_RetryRequest(mdcache, mdr));
10283 return;
10284 }
10285
10286 version_t stid = mdr->more()->stid;
10287 snapid_t snapid;
10288 auto p = mdr->more()->snapidbl.cbegin();
10289 decode(snapid, p);
10290 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
10291
10292 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10293
10294 SnapPayload payload;
10295 if (req->get_data().length()) {
10296 try {
10297 auto iter = req->get_data().cbegin();
10298 decode(payload, iter);
10299 } catch (const ceph::buffer::error &e) {
10300 // backward compat -- client sends xattr bufferlist. however,
10301 // that is not used anywhere -- so (log and) ignore.
10302 dout(20) << ": no metadata in payload (old client?)" << dendl;
10303 }
10304 }
10305
10306 // journal
10307 SnapInfo info;
10308 info.ino = diri->ino();
10309 info.snapid = snapid;
10310 info.name = snapname;
10311 info.stamp = mdr->get_op_stamp();
10312 info.metadata = payload.metadata;
10313
10314 auto pi = diri->project_inode(mdr, false, true);
10315 pi.inode->ctime = info.stamp;
10316 if (info.stamp > pi.inode->rstat.rctime)
10317 pi.inode->rstat.rctime = info.stamp;
10318 pi.inode->rstat.rsnaps++;
10319 pi.inode->version = diri->pre_dirty();
10320
10321 // project the snaprealm
10322 auto &newsnap = *pi.snapnode;
10323 newsnap.created = snapid;
10324 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
10325 if (!em.second)
10326 em.first->second = info;
10327 newsnap.seq = snapid;
10328 newsnap.last_created = snapid;
10329
10330 // journal the inode changes
10331 mdr->ls = mdlog->get_current_segment();
10332 EUpdate *le = new EUpdate(mdlog, "mksnap");
10333 mdlog->start_entry(le);
10334
10335 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10336 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10337 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10338 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10339
10340 // journal the snaprealm changes
10341 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
10342 mdr, __func__);
10343 mdlog->flush();
10344 }
10345
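// Journal callback for mksnap: apply the projected changes, commit the
// snap-table transaction, then notify other MDS ranks and clients so they
// update their snaprealm caches.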
10346 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
10347 {
10348 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
10349
10350 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
10351
10352 mdr->apply();
10353
10354 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
10355
10356 // create snap
10357 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10358
10359 // notify other mds
10360 mdcache->send_snap_update(diri, mdr->more()->stid, op);
10361
10362 mdcache->do_realm_invalidate_and_update_notify(diri, op);
10363
10364 // yay
10365 mdr->in[0] = diri;
10366 mdr->snapid = info.snapid;
10367 mdr->tracei = diri;
10368 respond_to_request(mdr, 0);
10369 }
10370
10371
10372 // RMSNAP
10373
10374 struct C_MDS_rmsnap_finish : public ServerLogContext {
10375 CInode *diri;
10376 snapid_t snapid;
10377 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10378 ServerLogContext(s, r), diri(di), snapid(sn) {}
10379 void finish(int r) override {
10380 server->_rmsnap_finish(mdr, diri, snapid);
10381 }
10382 };
10383
10384 /* This function takes responsibility for the passed mdr*/
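// Removes a snapshot of the given directory (on the client side this is
// typically an rmdir of the entry under ".snap"). Mirrors mksnap: a
// snap-table prepare_destroy reserves the transaction, then the inode and
// snaprealm updates are journaled and committed in _rmsnap_finish().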
10385 void Server::handle_client_rmsnap(MDRequestRef& mdr)
10386 {
10387 const cref_t<MClientRequest> &req = mdr->client_request;
10388
10389 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10390 if (!diri)
10391 return;
10392
10393 if (!diri->is_dir()) {
10394 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10395 return;
10396 }
10397
10398 std::string_view snapname = req->get_filepath().last_dentry();
10399
10400 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10401 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
10402 respond_to_request(mdr, -CEPHFS_EPERM);
10403 return;
10404 }
10405
10406 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
10407
10408 // does snap exist?
10409 if (snapname.length() == 0 || snapname[0] == '_') {
10410 respond_to_request(mdr, -CEPHFS_EINVAL); // can't prune a parent snap, currently.
10411 return;
10412 }
10413 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
10414 respond_to_request(mdr, -CEPHFS_ENOENT);
10415 return;
10416 }
10417 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
10418 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
10419
10420 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10421 MutationImpl::LockOpVec lov;
10422 lov.add_xlock(&diri->snaplock);
10423 if (!mds->locker->acquire_locks(mdr, lov))
10424 return;
10425 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10426 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10427 return;
10428 }
10429 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10430 }
10431
10432 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10433 return;
10434
10435 // prepare
10436 if (!mdr->more()->stid) {
10437 mds->snapclient->prepare_destroy(diri->ino(), snapid,
10438 &mdr->more()->stid, &mdr->more()->snapidbl,
10439 new C_MDS_RetryRequest(mdcache, mdr));
10440 return;
10441 }
10442 version_t stid = mdr->more()->stid;
10443 auto p = mdr->more()->snapidbl.cbegin();
10444 snapid_t seq;
10445 decode(seq, p);
10446 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
10447
10448 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10449
10450 // journal
10451 auto pi = diri->project_inode(mdr, false, true);
10452 pi.inode->version = diri->pre_dirty();
10453 pi.inode->ctime = mdr->get_op_stamp();
10454 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
10455 pi.inode->rstat.rctime = mdr->get_op_stamp();
10456 pi.inode->rstat.rsnaps--;
10457
10458 mdr->ls = mdlog->get_current_segment();
10459 EUpdate *le = new EUpdate(mdlog, "rmsnap");
10460 mdlog->start_entry(le);
10461
10462 // project the snaprealm
10463 auto &newnode = *pi.snapnode;
10464 newnode.snaps.erase(snapid);
10465 newnode.seq = seq;
10466 newnode.last_destroyed = seq;
10467
10468 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10469 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10470 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10471 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10472
10473 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
10474 mdr, __func__);
10475 mdlog->flush();
10476 }
10477
10478 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
10479 {
10480 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
10481 snapid_t stid = mdr->more()->stid;
10482 auto p = mdr->more()->snapidbl.cbegin();
10483 snapid_t seq;
10484 decode(seq, p);
10485
10486 mdr->apply();
10487
10488 mds->snapclient->commit(stid, mdr->ls);
10489
10490 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10491
10492 // notify other mds
10493 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
10494
10495 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
10496
10497 // yay
10498 mdr->in[0] = diri;
10499 respond_to_request(mdr, 0);
10500
10501 // purge snapshot data
10502 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
10503 }
10504
10505 struct C_MDS_renamesnap_finish : public ServerLogContext {
10506 CInode *diri;
10507 snapid_t snapid;
10508 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10509 ServerLogContext(s, r), diri(di), snapid(sn) {}
10510 void finish(int r) override {
10511 server->_renamesnap_finish(mdr, diri, snapid);
10512 }
10513 };
10514
10515 /* This function takes responsibility for the passed mdr*/
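// Renames a snapshot in place: filepath carries the new name and filepath2
// the existing one, and both must refer to the same directory inode. The
// snap-table update is prepared first, then the change is journaled and
// committed in _renamesnap_finish().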
10516 void Server::handle_client_renamesnap(MDRequestRef& mdr)
10517 {
10518 const cref_t<MClientRequest> &req = mdr->client_request;
10519 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
10520 respond_to_request(mdr, -CEPHFS_EINVAL);
10521 return;
10522 }
10523
10524 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10525 if (!diri)
10526 return;
10527
10528 if (!diri->is_dir()) { // dir only
10529 respond_to_request(mdr, -CEPHFS_ENOTDIR);
10530 return;
10531 }
10532
10533 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
10534 mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10535 respond_to_request(mdr, -CEPHFS_EPERM);
10536 return;
10537 }
10538
10539 std::string_view dstname = req->get_filepath().last_dentry();
10540 std::string_view srcname = req->get_filepath2().last_dentry();
10541 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
10542
10543 if (srcname.length() == 0 || srcname[0] == '_') {
10544 respond_to_request(mdr, -CEPHFS_EINVAL); // can't rename a parent snap.
10545 return;
10546 }
10547 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
10548 respond_to_request(mdr, -CEPHFS_ENOENT);
10549 return;
10550 }
10551 if (dstname.length() == 0 || dstname[0] == '_') {
10552 respond_to_request(mdr, -CEPHFS_EINVAL);
10553 return;
10554 }
10555 if (diri->snaprealm->exists(dstname)) {
10556 respond_to_request(mdr, -CEPHFS_EEXIST);
10557 return;
10558 }
10559
10560 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
10561 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
10562
10563 // lock snap
10564 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10565 MutationImpl::LockOpVec lov;
10566 lov.add_xlock(&diri->snaplock);
10567 if (!mds->locker->acquire_locks(mdr, lov))
10568 return;
10569 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10570 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10571 return;
10572 }
10573 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10574 }
10575
10576 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10577 return;
10578
10579 // prepare
10580 if (!mdr->more()->stid) {
10581 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
10582 &mdr->more()->stid,
10583 new C_MDS_RetryRequest(mdcache, mdr));
10584 return;
10585 }
10586
10587 version_t stid = mdr->more()->stid;
10588 dout(10) << " stid is " << stid << dendl;
10589
10590 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10591
10592 // journal
10593 auto pi = diri->project_inode(mdr, false, true);
10594 pi.inode->ctime = mdr->get_op_stamp();
10595 if (mdr->get_op_stamp() > pi.inode->rstat.rctime)
10596 pi.inode->rstat.rctime = mdr->get_op_stamp();
10597 pi.inode->version = diri->pre_dirty();
10598
10599 // project the snaprealm
10600 auto &newsnap = *pi.snapnode;
10601 auto it = newsnap.snaps.find(snapid);
10602 ceph_assert(it != newsnap.snaps.end());
10603 it->second.name = dstname;
10604
10605 // journal the inode changes
10606 mdr->ls = mdlog->get_current_segment();
10607 EUpdate *le = new EUpdate(mdlog, "renamesnap");
10608 mdlog->start_entry(le);
10609
10610 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10611 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10612 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10613 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10614
10615 // journal the snaprealm changes
10616 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
10617 mdr, __func__);
10618 mdlog->flush();
10619 }
10620
10621 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
10622 {
10623 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
10624
10625 mdr->apply();
10626
10627 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
10628
10629 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10630
10631 // notify other mds
10632 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
10633
10634 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
10635
10636 // yay
10637 mdr->in[0] = diri;
10638 mdr->tracei = diri;
10639 mdr->snapid = snapid;
10640 respond_to_request(mdr, 0);
10641 }
10642
10643 /**
10644 * Return true if server is in state RECONNECT and this
10645 * client has not yet reconnected.
10646 */
10647 bool Server::waiting_for_reconnect(client_t c) const
10648 {
10649 return client_reconnect_gather.count(c) > 0;
10650 }
10651
10652 void Server::dump_reconnect_status(Formatter *f) const
10653 {
10654 f->open_object_section("reconnect_status");
10655 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
10656 f->close_section();
10657 }