ceph/src/mds/Server.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <boost/lexical_cast.hpp>
16 #include "include/ceph_assert.h" // lexical_cast includes system assert.h
17
18 #include <boost/config/warning_disable.hpp>
19 #include <boost/fusion/include/std_pair.hpp>
20 #include <boost/range/adaptor/reversed.hpp>
21
22 #include "MDSRank.h"
23 #include "Server.h"
24 #include "Locker.h"
25 #include "MDCache.h"
26 #include "MDLog.h"
27 #include "Migrator.h"
28 #include "MDBalancer.h"
29 #include "InoTable.h"
30 #include "SnapClient.h"
31 #include "Mutation.h"
32 #include "cephfs_features.h"
33
34 #include "msg/Messenger.h"
35
36 #include "osdc/Objecter.h"
37
38 #include "events/EUpdate.h"
39 #include "events/ESlaveUpdate.h"
40 #include "events/ESession.h"
41 #include "events/EOpen.h"
42 #include "events/ECommitted.h"
43 #include "events/EPurged.h"
44
45 #include "include/stringify.h"
46 #include "include/filepath.h"
47 #include "common/errno.h"
48 #include "common/Timer.h"
49 #include "common/perf_counters.h"
50 #include "include/compat.h"
51 #include "osd/OSDMap.h"
52
53 #include <errno.h>
54 #include <math.h>
55
56 #include <list>
57 #include <iostream>
58 #include <string_view>
59
60 #include "common/config.h"
61
62 #define dout_context g_ceph_context
63 #define dout_subsys ceph_subsys_mds
64 #undef dout_prefix
65 #define dout_prefix *_dout << "mds." << mds->get_nodeid() << ".server "
66
67 class ServerContext : public MDSContext {
68 protected:
69 Server *server;
70 MDSRank *get_mds() override
71 {
72 return server->mds;
73 }
74
75 public:
76 explicit ServerContext(Server *s) : server(s) {
77 ceph_assert(server != NULL);
78 }
79 };
80
81 class Batch_Getattr_Lookup : public BatchOp {
82 protected:
83 Server* server;
84 ceph::ref_t<MDRequestImpl> mdr;
85 MDCache* mdcache;
86 int res = 0;
87 public:
88 Batch_Getattr_Lookup(Server* s, ceph::ref_t<MDRequestImpl> r, MDCache* mdc) : server(s), mdr(std::move(r)), mdcache(mdc) {}
89 void add_request(const ceph::ref_t<MDRequestImpl>& m) override {
90 mdr->batch_reqs.push_back(m);
91 }
92 void set_request(const ceph::ref_t<MDRequestImpl>& m) override {
93 mdr = m;
94 }
95 void _forward(mds_rank_t t) override {
96 mdcache->mds->forward_message_mds(mdr->release_client_request(), t);
97 mdr->set_mds_stamp(ceph_clock_now());
98 for (auto& m : mdr->batch_reqs) {
99 if (!m->killed)
100 mdcache->request_forward(m, t);
101 }
102 mdr->batch_reqs.clear();
103 }
104 void _respond(int r) override {
105 mdr->set_mds_stamp(ceph_clock_now());
106 for (auto& m : mdr->batch_reqs) {
107 if (!m->killed) {
108 m->tracei = mdr->tracei;
109 m->tracedn = mdr->tracedn;
110 server->respond_to_request(m, r);
111 }
112 }
113 mdr->batch_reqs.clear();
114 server->reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
115 }
116 void print(std::ostream& o) {
117 o << "[batch front=" << *mdr << "]";
118 }
119 };
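
// How the batching above plays out (a descriptive sketch of the code in this
// class, not additional behaviour): the first getattr/lookup on an inode
// becomes the "front" mdr; identical requests that arrive while it is in
// flight are queued via add_request(). On completion, _respond() stamps each
// queued request with the front's trace (tracei/tracedn) and replies to all
// of them, so N clients stat'ing the same hot inode cost one MDS operation.
// _forward() similarly redirects the whole batch to the new rank.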
120
121 class ServerLogContext : public MDSLogContextBase {
122 protected:
123 Server *server;
124 MDSRank *get_mds() override
125 {
126 return server->mds;
127 }
128
129 MDRequestRef mdr;
130 void pre_finish(int r) override {
131 if (mdr)
132 mdr->mark_event("journal_committed: ");
133 }
134 public:
135 explicit ServerLogContext(Server *s) : server(s) {
136 ceph_assert(server != NULL);
137 }
138 explicit ServerLogContext(Server *s, MDRequestRef& r) : server(s), mdr(r) {
139 ceph_assert(server != NULL);
140 }
141 };
142
143 void Server::create_logger()
144 {
145 PerfCountersBuilder plb(g_ceph_context, "mds_server", l_mdss_first, l_mdss_last);
146
147 plb.add_u64_counter(l_mdss_handle_client_request, "handle_client_request",
148 "Client requests", "hcr", PerfCountersBuilder::PRIO_INTERESTING);
149 plb.add_u64_counter(l_mdss_handle_slave_request, "handle_slave_request",
150 "Slave requests", "hsr", PerfCountersBuilder::PRIO_INTERESTING);
151 plb.add_u64_counter(l_mdss_handle_client_session,
152 "handle_client_session", "Client session messages", "hcs",
153 PerfCountersBuilder::PRIO_INTERESTING);
154 plb.add_u64_counter(l_mdss_cap_revoke_eviction, "cap_revoke_eviction",
155 "Cap Revoke Client Eviction", "cre", PerfCountersBuilder::PRIO_INTERESTING);
156
157 // fop latencies are useful
158 plb.set_prio_default(PerfCountersBuilder::PRIO_USEFUL);
159 plb.add_time_avg(l_mdss_req_lookuphash_latency, "req_lookuphash_latency",
160 "Request type lookup hash of inode latency");
161 plb.add_time_avg(l_mdss_req_lookupino_latency, "req_lookupino_latency",
162 "Request type lookup inode latency");
163 plb.add_time_avg(l_mdss_req_lookupparent_latency, "req_lookupparent_latency",
164 "Request type lookup parent latency");
165 plb.add_time_avg(l_mdss_req_lookupname_latency, "req_lookupname_latency",
166 "Request type lookup name latency");
167 plb.add_time_avg(l_mdss_req_lookup_latency, "req_lookup_latency",
168 "Request type lookup latency");
169 plb.add_time_avg(l_mdss_req_lookupsnap_latency, "req_lookupsnap_latency",
170 "Request type lookup snapshot latency");
171 plb.add_time_avg(l_mdss_req_getattr_latency, "req_getattr_latency",
172 "Request type get attribute latency");
173 plb.add_time_avg(l_mdss_req_setattr_latency, "req_setattr_latency",
174 "Request type set attribute latency");
175 plb.add_time_avg(l_mdss_req_setlayout_latency, "req_setlayout_latency",
176 "Request type set file layout latency");
177 plb.add_time_avg(l_mdss_req_setdirlayout_latency, "req_setdirlayout_latency",
178 "Request type set directory layout latency");
179 plb.add_time_avg(l_mdss_req_setxattr_latency, "req_setxattr_latency",
180 "Request type set extended attribute latency");
181 plb.add_time_avg(l_mdss_req_rmxattr_latency, "req_rmxattr_latency",
182 "Request type remove extended attribute latency");
183 plb.add_time_avg(l_mdss_req_readdir_latency, "req_readdir_latency",
184 "Request type read directory latency");
185 plb.add_time_avg(l_mdss_req_setfilelock_latency, "req_setfilelock_latency",
186 "Request type set file lock latency");
187 plb.add_time_avg(l_mdss_req_getfilelock_latency, "req_getfilelock_latency",
188 "Request type get file lock latency");
189 plb.add_time_avg(l_mdss_req_create_latency, "req_create_latency",
190 "Request type create latency");
191 plb.add_time_avg(l_mdss_req_open_latency, "req_open_latency",
192 "Request type open latency");
193 plb.add_time_avg(l_mdss_req_mknod_latency, "req_mknod_latency",
194 "Request type make node latency");
195 plb.add_time_avg(l_mdss_req_link_latency, "req_link_latency",
196 "Request type link latency");
197 plb.add_time_avg(l_mdss_req_unlink_latency, "req_unlink_latency",
198 "Request type unlink latency");
199 plb.add_time_avg(l_mdss_req_rmdir_latency, "req_rmdir_latency",
200 "Request type remove directory latency");
201 plb.add_time_avg(l_mdss_req_rename_latency, "req_rename_latency",
202 "Request type rename latency");
203 plb.add_time_avg(l_mdss_req_mkdir_latency, "req_mkdir_latency",
204 "Request type make directory latency");
205 plb.add_time_avg(l_mdss_req_symlink_latency, "req_symlink_latency",
206 "Request type symbolic link latency");
207 plb.add_time_avg(l_mdss_req_lssnap_latency, "req_lssnap_latency",
208 "Request type list snapshot latency");
209 plb.add_time_avg(l_mdss_req_mksnap_latency, "req_mksnap_latency",
210 "Request type make snapshot latency");
211 plb.add_time_avg(l_mdss_req_rmsnap_latency, "req_rmsnap_latency",
212 "Request type remove snapshot latency");
213 plb.add_time_avg(l_mdss_req_renamesnap_latency, "req_renamesnap_latency",
214 "Request type rename snapshot latency");
215
216 plb.set_prio_default(PerfCountersBuilder::PRIO_DEBUGONLY);
217 plb.add_u64_counter(l_mdss_dispatch_client_request, "dispatch_client_request",
218 "Client requests dispatched");
219 plb.add_u64_counter(l_mdss_dispatch_slave_request, "dispatch_server_request",
220 "Server requests dispatched");
221
222 logger = plb.create_perf_counters();
223 g_ceph_context->get_perfcounters_collection()->add(logger);
224 }
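
// Once registered, these counters appear under the "mds_server" section of
// the daemon's admin socket output. A quick way to inspect them (a usage
// sketch; substitute your daemon id for "a"):
//
//   ceph daemon mds.a perf dump mds_server
//
// e.g. "req_getattr_latency" is the time-averaged counter added above via
// plb.add_time_avg().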
225
226 Server::Server(MDSRank *m) :
227 mds(m),
228 mdcache(mds->mdcache), mdlog(mds->mdlog),
229 recall_throttle(g_conf().get_val<double>("mds_recall_max_decay_rate"))
230 {
231 replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
232 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
233 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
234 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
235 supported_features = feature_bitset_t(CEPHFS_FEATURES_MDS_SUPPORTED);
236 }
237
238 void Server::dispatch(const cref_t<Message> &m)
239 {
240 switch (m->get_type()) {
241 case CEPH_MSG_CLIENT_RECONNECT:
242 handle_client_reconnect(ref_cast<MClientReconnect>(m));
243 return;
244 }
245
246 /*
247  * In the reconnect phase, clients may send unsafe requests to the mds before the
248  * reconnect msg. Setting sessionclosed_isok handles scenarios like this:
249  *
250  * 1. In the reconnect phase, a client sends unsafe requests to the mds.
251  * 2. The reconnect timeout is reached. Sessions that did not send a reconnect msg in
252  *    time, some of which may have sent unsafe requests, are marked closed. (Another
253  *    situation is #31668, which denies all client reconnect msgs to speed up reboot.)
254  * 3. These unsafe requests can then be handled in the clientreplay phase. */
255 bool sessionclosed_isok = replay_unsafe_with_closed_session;
256 // active?
257 // handle_slave_request()/handle_client_session() will wait if necessary
258 if (m->get_type() == CEPH_MSG_CLIENT_REQUEST && !mds->is_active()) {
259 const auto &req = ref_cast<MClientRequest>(m);
260 if (mds->is_reconnect() || mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
261 Session *session = mds->get_session(req);
262 if (!session || (!session->is_open() && !sessionclosed_isok)) {
263 dout(5) << "session is closed, dropping " << req->get_reqid() << dendl;
264 return;
265 }
266 bool queue_replay = false;
267 if (req->is_replay() || req->is_async()) {
268 dout(3) << "queuing replayed op" << dendl;
269 queue_replay = true;
270 if (req->head.ino &&
271 !session->have_completed_request(req->get_reqid().tid, nullptr)) {
272 mdcache->add_replay_ino_alloc(inodeno_t(req->head.ino));
273 }
274 } else if (req->get_retry_attempt()) {
275 // process completed requests in the clientreplay stage. A completed request
276 // might have created a new file/directory. This guarantees the MDS sends a
277 // reply to the client before another request modifies the new file/directory.
278 if (session->have_completed_request(req->get_reqid().tid, NULL)) {
279 dout(3) << "queuing completed op" << dendl;
280 queue_replay = true;
281 }
282 // this request was created before the cap reconnect message, drop any embedded
283 // cap releases.
284 req->releases.clear();
285 }
286 if (queue_replay) {
287 req->mark_queued_for_replay();
288 mds->enqueue_replay(new C_MDS_RetryMessage(mds, m));
289 return;
290 }
291 }
292
293 bool wait_for_active = true;
294 if (mds->is_stopping()) {
295 wait_for_active = false;
296 } else if (mds->is_clientreplay()) {
297 if (req->is_queued_for_replay()) {
298 wait_for_active = false;
299 }
300 }
301 if (wait_for_active) {
302 dout(3) << "not active yet, waiting" << dendl;
303 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
304 return;
305 }
306 }
307
308 switch (m->get_type()) {
309 case CEPH_MSG_CLIENT_SESSION:
310 handle_client_session(ref_cast<MClientSession>(m));
311 return;
312 case CEPH_MSG_CLIENT_REQUEST:
313 handle_client_request(ref_cast<MClientRequest>(m));
314 return;
315 case CEPH_MSG_CLIENT_RECLAIM:
316 handle_client_reclaim(ref_cast<MClientReclaim>(m));
317 return;
318 case MSG_MDS_SLAVE_REQUEST:
319 handle_slave_request(ref_cast<MMDSSlaveRequest>(m));
320 return;
321 default:
322 derr << "server unknown message " << m->get_type() << dendl;
323 ceph_abort_msg("server unknown message");
324 }
325 }
326
327
328
329 // ----------------------------------------------------------
330 // SESSION management
331
332 class C_MDS_session_finish : public ServerLogContext {
333 Session *session;
334 uint64_t state_seq;
335 bool open;
336 version_t cmapv;
337 interval_set<inodeno_t> inos;
338 version_t inotablev;
339 interval_set<inodeno_t> purge_inos;
340 LogSegment *ls = nullptr;
341 Context *fin;
342 public:
343 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, Context *fin_ = NULL) :
344 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inotablev(0), fin(fin_) { }
345 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t> i, version_t iv, Context *fin_ = NULL) :
346 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(std::move(i)), inotablev(iv), fin(fin_) { }
347 C_MDS_session_finish(Server *srv, Session *se, uint64_t sseq, bool s, version_t mv, interval_set<inodeno_t> i, version_t iv,
348 interval_set<inodeno_t> _purge_inos, LogSegment *_ls, Context *fin_ = NULL) :
349 ServerLogContext(srv), session(se), state_seq(sseq), open(s), cmapv(mv), inos(std::move(i)), inotablev(iv), purge_inos(std::move(_purge_inos)), ls(_ls), fin(fin_){}
350 void finish(int r) override {
351 ceph_assert(r == 0);
352 server->_session_logged(session, state_seq, open, cmapv, inos, inotablev, purge_inos, ls);
353 if (fin) {
354 fin->complete(r);
355 }
356 }
357 };
358
359 Session* Server::find_session_by_uuid(std::string_view uuid)
360 {
361 Session* session = nullptr;
362 for (auto& it : mds->sessionmap.get_sessions()) {
363 auto& metadata = it.second->info.client_metadata;
364
365 auto p = metadata.find("uuid");
366 if (p == metadata.end() || p->second != uuid)
367 continue;
368
369 if (!session) {
370 session = it.second;
371 } else if (!session->reclaiming_from) {
372 assert(it.second->reclaiming_from == session);
373 session = it.second;
374 } else {
375 assert(session->reclaiming_from == it.second);
376 }
377 }
378 return session;
379 }
380
381 void Server::reclaim_session(Session *session, const cref_t<MClientReclaim> &m)
382 {
383 if (!session->is_open() && !session->is_stale()) {
384 dout(10) << "session not open, dropping this req" << dendl;
385 return;
386 }
387
388 auto reply = make_message<MClientReclaimReply>(0);
389 if (m->get_uuid().empty()) {
390 dout(10) << __func__ << " invalid message (no uuid)" << dendl;
391 reply->set_result(-EINVAL);
392 mds->send_message_client(reply, session);
393 return;
394 }
395
396 unsigned flags = m->get_flags();
397 if (flags != CEPH_RECLAIM_RESET) { // currently only support reset
398 dout(10) << __func__ << " unsupported flags" << dendl;
399 reply->set_result(-EOPNOTSUPP);
400 mds->send_message_client(reply, session);
401 return;
402 }
403
404 Session* target = find_session_by_uuid(m->get_uuid());
405 if (target) {
406 if (session->info.auth_name != target->info.auth_name) {
407 dout(10) << __func__ << " session auth_name " << session->info.auth_name
408 << " != target auth_name " << target->info.auth_name << dendl;
409 reply->set_result(-EPERM);
410 mds->send_message_client(reply, session);
411 }
412
413 assert(!target->reclaiming_from);
414 assert(!session->reclaiming_from);
415 session->reclaiming_from = target;
416 reply->set_addrs(entity_addrvec_t(target->info.inst.addr));
417 }
418
419 if (flags & CEPH_RECLAIM_RESET) {
420 finish_reclaim_session(session, reply);
421 return;
422 }
423
424 ceph_abort();
425 }
426
427 void Server::finish_reclaim_session(Session *session, const ref_t<MClientReclaimReply> &reply)
428 {
429 Session *target = session->reclaiming_from;
430 if (target) {
431 session->reclaiming_from = nullptr;
432
433 Context *send_reply;
434 if (reply) {
435 int64_t session_id = session->get_client().v;
436 send_reply = new LambdaContext([this, session_id, reply](int r) {
437 assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
438 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(session_id));
439 if (!session) {
440 return;
441 }
442 auto epoch = mds->objecter->with_osdmap([](const OSDMap &map){ return map.get_epoch(); });
443 reply->set_epoch(epoch);
444 mds->send_message_client(reply, session);
445 });
446 } else {
447 send_reply = nullptr;
448 }
449
450 bool blacklisted = mds->objecter->with_osdmap([target](const OSDMap &map) {
451 return map.is_blacklisted(target->info.inst.addr);
452 });
453
454 if (blacklisted || !g_conf()->mds_session_blacklist_on_evict) {
455 kill_session(target, send_reply);
456 } else {
457 std::stringstream ss;
458 mds->evict_client(target->get_client().v, false, true, ss, send_reply);
459 }
460 } else if (reply) {
461 mds->send_message_client(reply, session);
462 }
463 }
464
465 void Server::handle_client_reclaim(const cref_t<MClientReclaim> &m)
466 {
467 Session *session = mds->get_session(m);
468 dout(3) << __func__ << " " << *m << " from " << m->get_source() << dendl;
469 assert(m->get_source().is_client()); // should _not_ come from an mds!
470
471 if (!session) {
472 dout(0) << " ignoring sessionless msg " << *m << dendl;
473 return;
474 }
475
476 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
477 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
478 return;
479 }
480
481 if (m->get_flags() & MClientReclaim::FLAG_FINISH) {
482 finish_reclaim_session(session);
483 } else {
484 reclaim_session(session, m);
485 }
486 }
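
// The reclaim handshake implemented above, end to end (a summary of the
// message flow in this file, not extra protocol):
//
//   1. A new client opens a session whose metadata carries a "uuid".
//   2. It sends MClientReclaim(uuid, CEPH_RECLAIM_RESET); reclaim_session()
//      matches the uuid against an existing session via find_session_by_uuid(),
//      checks that auth_name matches, and records reclaiming_from.
//   3. finish_reclaim_session() evicts or kills the stale instance, then
//      replies with the old session's address and the OSD epoch so the
//      client can wait out any blacklist propagation.
//   4. The client ends with MClientReclaim(FLAG_FINISH), which lands in
//      finish_reclaim_session() with no reply to send.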
487
488 void Server::handle_client_session(const cref_t<MClientSession> &m)
489 {
490 version_t pv;
491 Session *session = mds->get_session(m);
492
493 dout(3) << "handle_client_session " << *m << " from " << m->get_source() << dendl;
494 ceph_assert(m->get_source().is_client()); // should _not_ come from an mds!
495
496 if (!session) {
497 dout(0) << " ignoring sessionless msg " << *m << dendl;
498 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
499 reply->metadata["error_string"] = "sessionless";
500 mds->send_message(reply, m->get_connection());
501 return;
502 }
503
504 if (m->get_op() == CEPH_SESSION_REQUEST_RENEWCAPS) {
505 // always handle renewcaps (state >= MDSMap::STATE_RECONNECT)
506 } else if (m->get_op() == CEPH_SESSION_REQUEST_CLOSE) {
507 // close requests need to be handled when mds is active
508 if (mds->get_state() < MDSMap::STATE_ACTIVE) {
509 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
510 return;
511 }
512 } else {
513 if (mds->get_state() < MDSMap::STATE_CLIENTREPLAY) {
514 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
515 return;
516 }
517 }
518
519 if (logger)
520 logger->inc(l_mdss_handle_client_session);
521
522 uint64_t sseq = 0;
523 switch (m->get_op()) {
524 case CEPH_SESSION_REQUEST_OPEN:
525 if (session->is_opening() ||
526 session->is_open() ||
527 session->is_stale() ||
528 session->is_killing() ||
529 terminating_sessions) {
530 dout(10) << "currently open|opening|stale|killing, dropping this req" << dendl;
531 return;
532 }
533 ceph_assert(session->is_closed() || session->is_closing());
534
535 if (mds->is_stopping()) {
536 dout(10) << "mds is stopping, dropping open req" << dendl;
537 return;
538 }
539
540 {
541 auto& addr = session->info.inst.addr;
542 session->set_client_metadata(client_metadata_t(m->metadata, m->supported_features, m->metric_spec));
543 auto& client_metadata = session->info.client_metadata;
544
545 auto log_session_status = [this, m, session](std::string_view status, std::string_view err) {
546 auto now = ceph_clock_now();
547 auto throttle_elapsed = m->get_recv_complete_stamp() - m->get_throttle_stamp();
548 auto elapsed = now - m->get_recv_stamp();
549 CachedStackStringStream css;
550 *css << "New client session:"
551 << " addr=\"" << session->info.inst.addr << "\""
552 << ",elapsed=" << elapsed
553 << ",throttled=" << throttle_elapsed
554 << ",status=\"" << status << "\"";
555 if (!err.empty()) {
556 *css << ",error=\"" << err << "\"";
557 }
558 const auto& metadata = session->info.client_metadata;
559 if (auto it = metadata.find("root"); it != metadata.end()) {
560 *css << ",root=\"" << it->second << "\"";
561 }
562 dout(2) << css->strv() << dendl;
563 };
564
565 auto send_reject_message = [this, &session, &log_session_status](std::string_view err_str) {
566 auto m = make_message<MClientSession>(CEPH_SESSION_REJECT);
567 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
568 m->metadata["error_string"] = err_str;
569 mds->send_message_client(m, session);
570 log_session_status("REJECTED", err_str);
571 };
572
573 bool blacklisted = mds->objecter->with_osdmap(
574 [&addr](const OSDMap &osd_map) -> bool {
575 return osd_map.is_blacklisted(addr);
576 });
577
578 if (blacklisted) {
579 dout(10) << "rejecting blacklisted client " << addr << dendl;
580 send_reject_message("blacklisted");
581 session->clear();
582 break;
583 }
584
585 if (client_metadata.features.empty())
586 infer_supported_features(session, client_metadata);
587
588 dout(20) << __func__ << " CEPH_SESSION_REQUEST_OPEN metadata entries:" << dendl;
589 dout(20) << " features: '" << client_metadata.features << "'" << dendl;
590 dout(20) << " metric specification: [" << client_metadata.metric_spec << "]" << dendl;
591 for (const auto& p : client_metadata) {
592 dout(20) << " " << p.first << ": " << p.second << dendl;
593 }
594
595 feature_bitset_t missing_features = required_client_features;
596 missing_features -= client_metadata.features;
597 if (!missing_features.empty()) {
598 stringstream ss;
599 ss << "missing required features '" << missing_features << "'";
600 send_reject_message(ss.str());
601 mds->clog->warn() << "client session (" << session->info.inst
602 << ") lacks required features " << missing_features
603 << "; client supports " << client_metadata.features;
604 session->clear();
605 break;
606 }
607
608 // Special case for the 'root' metadata path; validate that the claimed
609 // root is actually within the caps of the session
610 if (auto it = client_metadata.find("root"); it != client_metadata.end()) {
611 auto claimed_root = it->second;
612 stringstream ss;
613 bool denied = false;
614 // claimed_root has a leading "/" which we strip before passing
615 // into caps check
616 if (claimed_root.empty() || claimed_root[0] != '/') {
617 denied = true;
618 ss << "invalue root '" << claimed_root << "'";
619 } else if (!session->auth_caps.path_capable(claimed_root.substr(1))) {
620 denied = true;
621 ss << "non-allowable root '" << claimed_root << "'";
622 }
623
624 if (denied) {
625 // Tell the client we're rejecting their open
626 send_reject_message(ss.str());
627 mds->clog->warn() << "client session with " << ss.str()
628 << " denied (" << session->info.inst << ")";
629 session->clear();
630 break;
631 }
632 }
633
634 if (auto it = client_metadata.find("uuid"); it != client_metadata.end()) {
635 if (find_session_by_uuid(it->second)) {
636 send_reject_message("duplicated session uuid");
637 mds->clog->warn() << "client session with duplicated session uuid '"
638 << it->second << "' denied (" << session->info.inst << ")";
639 session->clear();
640 break;
641 }
642 }
643
644 if (session->is_closed())
645 mds->sessionmap.add_session(session);
646
647 pv = mds->sessionmap.mark_projected(session);
648 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
649 mds->sessionmap.touch_session(session);
650 auto fin = new LambdaContext([log_session_status = std::move(log_session_status)](int r){
651 ceph_assert(r == 0);
652 log_session_status("ACCEPTED", "");
653 });
654 mdlog->start_submit_entry(new ESession(m->get_source_inst(), true, pv, client_metadata),
655 new C_MDS_session_finish(this, session, sseq, true, pv, fin));
656 mdlog->flush();
657 }
658 break;
659
660 case CEPH_SESSION_REQUEST_RENEWCAPS:
661 if (session->is_open() || session->is_stale()) {
662 mds->sessionmap.touch_session(session);
663 if (session->is_stale()) {
664 mds->sessionmap.set_state(session, Session::STATE_OPEN);
665 mds->locker->resume_stale_caps(session);
666 mds->sessionmap.touch_session(session);
667 }
668 auto reply = make_message<MClientSession>(CEPH_SESSION_RENEWCAPS, m->get_seq());
669 mds->send_message_client(reply, session);
670 } else {
671 dout(10) << "ignoring renewcaps on non open|stale session (" << session->get_state_name() << ")" << dendl;
672 }
673 break;
674
675 case CEPH_SESSION_REQUEST_CLOSE:
676 {
677 if (session->is_closed() ||
678 session->is_closing() ||
679 session->is_killing()) {
680 dout(10) << "already closed|closing|killing, dropping this req" << dendl;
681 return;
682 }
683 if (session->is_importing()) {
684 dout(10) << "ignoring close req on importing session" << dendl;
685 return;
686 }
687 ceph_assert(session->is_open() ||
688 session->is_stale() ||
689 session->is_opening());
690 if (m->get_seq() < session->get_push_seq()) {
691 dout(10) << "old push seq " << m->get_seq() << " < " << session->get_push_seq()
692 << ", dropping" << dendl;
693 return;
694 }
695 // We are getting a seq that is higher than expected.
696 // Handle it the same as any other sequence-number error.
697 //
698 if (m->get_seq() != session->get_push_seq()) {
699 dout(0) << "old push seq " << m->get_seq() << " != " << session->get_push_seq()
700 << ", BUGGY!" << dendl;
701 mds->clog->warn() << "incorrect push seq " << m->get_seq() << " != "
702 << session->get_push_seq() << ", dropping" << " from client : " << session->get_human_name();
703 return;
704 }
705 journal_close_session(session, Session::STATE_CLOSING, NULL);
706 }
707 break;
708
709 case CEPH_SESSION_FLUSHMSG_ACK:
710 finish_flush_session(session, m->get_seq());
711 break;
712
713 case CEPH_SESSION_REQUEST_FLUSH_MDLOG:
714 if (mds->is_active())
715 mdlog->flush();
716 break;
717
718 default:
719 ceph_abort();
720 }
721 }
722
723
724 void Server::flush_session(Session *session, MDSGatherBuilder *gather) {
725 if (!session->is_open() ||
726 !session->get_connection() ||
727 !session->get_connection()->has_feature(CEPH_FEATURE_EXPORT_PEER)) {
728 return;
729 }
730
731 version_t seq = session->wait_for_flush(gather->new_sub());
732 mds->send_message_client(
733 make_message<MClientSession>(CEPH_SESSION_FLUSHMSG, seq), session);
734 }
735
736 void Server::flush_client_sessions(set<client_t>& client_set, MDSGatherBuilder& gather)
737 {
738 for (set<client_t>::iterator p = client_set.begin(); p != client_set.end(); ++p) {
739 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(p->v));
740 ceph_assert(session);
741 flush_session(session, &gather);
742 }
743 }
744
745 void Server::finish_flush_session(Session *session, version_t seq)
746 {
747 MDSContext::vec finished;
748 session->finish_flush(seq, finished);
749 mds->queue_waiters(finished);
750 }
751
752 void Server::_session_logged(Session *session, uint64_t state_seq, bool open, version_t pv,
753 const interval_set<inodeno_t>& inos, version_t piv,
754 const interval_set<inodeno_t>& purge_inos, LogSegment *ls)
755 {
756 dout(10) << "_session_logged " << session->info.inst
757 << " state_seq " << state_seq
758 << " " << (open ? "open":"close")
759 << " " << pv
760 << " purge_inos : " << purge_inos << dendl;
761
762 if (NULL != ls) {
763 dout(10) << "_session_logged seq : " << ls->seq << dendl;
764 if (purge_inos.size()){
765 ls->purge_inodes.insert(purge_inos);
766 mdcache->purge_inodes(purge_inos, ls);
767 }
768 }
769
770 if (piv) {
771 ceph_assert(session->is_closing() || session->is_killing() ||
772 session->is_opening()); // re-open closing session
773 session->info.prealloc_inos.subtract(inos);
774 session->delegated_inos.clear();
775 mds->inotable->apply_release_ids(inos);
776 ceph_assert(mds->inotable->get_version() == piv);
777 }
778
779 mds->sessionmap.mark_dirty(session);
780
781 // apply
782 if (session->get_state_seq() != state_seq) {
783 dout(10) << " journaled state_seq " << state_seq << " != current " << session->get_state_seq()
784 << ", noop" << dendl;
785 // close must have been canceled (by an import?), or any number of other things..
786 } else if (open) {
787 ceph_assert(session->is_opening());
788 mds->sessionmap.set_state(session, Session::STATE_OPEN);
789 mds->sessionmap.touch_session(session);
790 ceph_assert(session->get_connection());
791 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
792 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
793 reply->supported_features = supported_features;
794 mds->send_message_client(reply, session);
795 if (mdcache->is_readonly()) {
796 auto m = make_message<MClientSession>(CEPH_SESSION_FORCE_RO);
797 mds->send_message_client(m, session);
798 }
799 } else if (session->is_closing() ||
800 session->is_killing()) {
801 // kill any lingering capabilities, leases, requests
802 while (!session->caps.empty()) {
803 Capability *cap = session->caps.front();
804 CInode *in = cap->get_inode();
805 dout(20) << " killing capability " << ccap_string(cap->issued()) << " on " << *in << dendl;
806 mds->locker->remove_client_cap(in, cap, true);
807 }
808 while (!session->leases.empty()) {
809 ClientLease *r = session->leases.front();
810 CDentry *dn = static_cast<CDentry*>(r->parent);
811 dout(20) << " killing client lease of " << *dn << dendl;
812 dn->remove_client_lease(r, mds->locker);
813 }
814 if (client_reconnect_gather.erase(session->info.get_client())) {
815 dout(20) << " removing client from reconnect set" << dendl;
816 if (client_reconnect_gather.empty()) {
817 dout(7) << " client " << session->info.inst << " was last reconnect, finishing" << dendl;
818 reconnect_gather_finish();
819 }
820 }
821 if (client_reclaim_gather.erase(session->info.get_client())) {
822 dout(20) << " removing client from reclaim set" << dendl;
823 if (client_reclaim_gather.empty()) {
824 dout(7) << " client " << session->info.inst << " was last reclaimed, finishing" << dendl;
825 mds->maybe_clientreplay_done();
826 }
827 }
828
829 if (session->is_closing()) {
830 // mark con disposable. if there is a fault, we will get a
831 // reset and clean it up. if the client hasn't received the
832 // CLOSE message yet, they will reconnect and get an
833 // ms_handle_remote_reset() and realize they had in fact closed.
834 // do this *before* sending the message to avoid a possible
835 // race.
836 if (session->get_connection()) {
837 // Conditional because terminate_sessions will indiscriminately
838 // put sessions in CLOSING whether they ever had a conn or not.
839 session->get_connection()->mark_disposable();
840 }
841
842 // reset session
843 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_CLOSE), session);
844 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
845 session->clear();
846 mds->sessionmap.remove_session(session);
847 } else if (session->is_killing()) {
848 // destroy session, close connection
849 if (session->get_connection()) {
850 session->get_connection()->mark_down();
851 mds->sessionmap.set_state(session, Session::STATE_CLOSED);
852 session->set_connection(nullptr);
853 }
854 mds->sessionmap.remove_session(session);
855 } else {
856 ceph_abort();
857 }
858 } else {
859 ceph_abort();
860 }
861 }
862
863 /**
864 * Inject sessions from some source other than actual connections.
865 *
866 * For example:
867 * - sessions inferred from journal replay
868 * - sessions learned from other MDSs during rejoin
869 * - sessions learned from other MDSs during dir/caps migration
870 * - sessions learned from other MDSs during a cross-MDS rename
871 */
872 version_t Server::prepare_force_open_sessions(map<client_t,entity_inst_t>& cm,
873 map<client_t,client_metadata_t>& cmm,
874 map<client_t, pair<Session*,uint64_t> >& smap)
875 {
876 version_t pv = mds->sessionmap.get_projected();
877
878 dout(10) << "prepare_force_open_sessions " << pv
879 << " on " << cm.size() << " clients"
880 << dendl;
881
882 mds->objecter->with_osdmap(
883 [this, &cm, &cmm](const OSDMap &osd_map) {
884 for (auto p = cm.begin(); p != cm.end(); ) {
885 if (osd_map.is_blacklisted(p->second.addr)) {
886 dout(10) << " ignoring blacklisted client." << p->first
887 << " (" << p->second.addr << ")" << dendl;
888 cmm.erase(p->first);
889 cm.erase(p++);
890 } else {
891 ++p;
892 }
893 }
894 });
895
896 for (map<client_t,entity_inst_t>::iterator p = cm.begin(); p != cm.end(); ++p) {
897 Session *session = mds->sessionmap.get_or_add_session(p->second);
898 pv = mds->sessionmap.mark_projected(session);
899 uint64_t sseq;
900 if (session->is_closed() ||
901 session->is_closing() ||
902 session->is_killing()) {
903 sseq = mds->sessionmap.set_state(session, Session::STATE_OPENING);
904 auto q = cmm.find(p->first);
905 if (q != cmm.end())
906 session->info.client_metadata.merge(q->second);
907 } else {
908 ceph_assert(session->is_open() ||
909 session->is_opening() ||
910 session->is_stale());
911 sseq = 0;
912 }
913 smap[p->first] = make_pair(session, sseq);
914 session->inc_importing();
915 }
916 return pv;
917 }
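
// Callers pair this with finish_force_open_sessions() around a journaled
// event. A sketch of the pattern (hypothetical local names; cm/cmm/smap as
// in the signature above):
//
//   map<client_t, pair<Session*, uint64_t>> smap;
//   version_t pv = server->prepare_force_open_sessions(cm, cmm, smap);
//   // ... journal an event that records pv ...
//   // once that event is safely logged:
//   server->finish_force_open_sessions(smap, true /* dec_import */);
//
// The sseq captured in smap lets finish_force_open_sessions() skip any
// session whose state changed (e.g. a racing close) between the two calls.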
918
919 void Server::finish_force_open_sessions(const map<client_t,pair<Session*,uint64_t> >& smap,
920 bool dec_import)
921 {
922 /*
923 * FIXME: need to carefully consider the race conditions between a
924 * client trying to close a session and an MDS doing an import
925 * trying to force open a session...
926 */
927 dout(10) << "finish_force_open_sessions on " << smap.size() << " clients,"
928 << " initial v " << mds->sessionmap.get_version() << dendl;
929
930 for (auto &it : smap) {
931 Session *session = it.second.first;
932 uint64_t sseq = it.second.second;
933 if (sseq > 0) {
934 if (session->get_state_seq() != sseq) {
935 dout(10) << "force_open_sessions skipping changed " << session->info.inst << dendl;
936 } else {
937 dout(10) << "force_open_sessions opened " << session->info.inst << dendl;
938 mds->sessionmap.set_state(session, Session::STATE_OPEN);
939 mds->sessionmap.touch_session(session);
940
941 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
942 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
943 reply->supported_features = supported_features;
944 mds->send_message_client(reply, session);
945
946 if (mdcache->is_readonly())
947 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
948 }
949 } else {
950 dout(10) << "force_open_sessions skipping already-open " << session->info.inst << dendl;
951 ceph_assert(session->is_open() || session->is_stale());
952 }
953
954 if (dec_import) {
955 session->dec_importing();
956 }
957
958 mds->sessionmap.mark_dirty(session);
959 }
960
961 dout(10) << __func__ << ": final v " << mds->sessionmap.get_version() << dendl;
962 }
963
964 class C_MDS_TerminatedSessions : public ServerContext {
965 void finish(int r) override {
966 server->terminating_sessions = false;
967 }
968 public:
969 explicit C_MDS_TerminatedSessions(Server *s) : ServerContext(s) {}
970 };
971
972 void Server::terminate_sessions()
973 {
974 dout(5) << "terminating all sessions..." << dendl;
975
976 terminating_sessions = true;
977
978 // kill them off. clients will retry etc.
979 set<Session*> sessions;
980 mds->sessionmap.get_client_session_set(sessions);
981 for (set<Session*>::const_iterator p = sessions.begin();
982 p != sessions.end();
983 ++p) {
984 Session *session = *p;
985 if (session->is_closing() ||
986 session->is_killing() ||
987 session->is_closed())
988 continue;
989 journal_close_session(session, Session::STATE_CLOSING, NULL);
990 }
991
992 mdlog->wait_for_safe(new C_MDS_TerminatedSessions(this));
993 }
994
995
996 void Server::find_idle_sessions()
997 {
998 auto now = clock::now();
999 auto last_cleared_laggy = mds->last_cleared_laggy();
1000
1001 dout(10) << "find_idle_sessions. last cleared laggy state " << last_cleared_laggy << "s ago" << dendl;
1002
1003 // timeout/stale
1004 // (caps go stale, lease die)
1005 double queue_max_age = mds->get_dispatch_queue_max_age(ceph_clock_now());
1006 double cutoff = queue_max_age + mds->mdsmap->get_session_timeout();
1007
1008 // don't kick clients if we've been laggy
1009 if (last_cleared_laggy < cutoff) {
1010 dout(10) << " last cleared laggy " << last_cleared_laggy << "s ago (< cutoff " << cutoff
1011 << "), not marking any client stale" << dendl;
1012 return;
1013 }
1014
1015 std::vector<Session*> to_evict;
1016
1017 bool defer_session_stale = g_conf().get_val<bool>("mds_defer_session_stale");
1018 const auto sessions_p1 = mds->sessionmap.by_state.find(Session::STATE_OPEN);
1019 if (sessions_p1 != mds->sessionmap.by_state.end() && !sessions_p1->second->empty()) {
1020 std::vector<Session*> new_stale;
1021
1022 for (auto session : *(sessions_p1->second)) {
1023 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1024 if (last_cap_renew_span < cutoff) {
1025 dout(20) << "laggiest active session is " << session->info.inst
1026 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1027 break;
1028 }
1029
1030 if (session->last_seen > session->last_cap_renew) {
1031 last_cap_renew_span = std::chrono::duration<double>(now - session->last_seen).count();
1032 if (last_cap_renew_span < cutoff) {
1033 dout(20) << "laggiest active session is " << session->info.inst
1034 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1035 continue;
1036 }
1037 }
1038
1039 if (last_cap_renew_span >= mds->mdsmap->get_session_autoclose()) {
1040 dout(20) << "evicting session " << session->info.inst << " since autoclose "
1041 "has arrived" << dendl;
1042 // evict session without marking it stale
1043 to_evict.push_back(session);
1044 continue;
1045 }
1046
1047 if (defer_session_stale &&
1048 !session->is_any_flush_waiter() &&
1049 !mds->locker->is_revoking_any_caps_from(session->get_client())) {
1050 dout(20) << "deferring marking session " << session->info.inst << " stale "
1051 "since it holds no caps" << dendl;
1052 continue;
1053 }
1054
1055 auto it = session->info.client_metadata.find("timeout");
1056 if (it != session->info.client_metadata.end()) {
1057 unsigned timeout = strtoul(it->second.c_str(), nullptr, 0);
1058 if (timeout == 0) {
1059 dout(10) << "skipping session " << session->info.inst
1060 << ", infinite timeout specified" << dendl;
1061 continue;
1062 }
1063 double cutoff = queue_max_age + timeout;
1064 if (last_cap_renew_span < cutoff) {
1065 dout(10) << "skipping session " << session->info.inst
1066 << ", timeout (" << timeout << ") specified"
1067 << " and renewed caps recently (" << last_cap_renew_span << "s ago)" << dendl;
1068 continue;
1069 }
1070
1071 // do not go through stale, evict it directly.
1072 to_evict.push_back(session);
1073 } else {
1074 dout(10) << "new stale session " << session->info.inst
1075 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1076 new_stale.push_back(session);
1077 }
1078 }
1079
1080 for (auto session : new_stale) {
1081 mds->sessionmap.set_state(session, Session::STATE_STALE);
1082 if (mds->locker->revoke_stale_caps(session)) {
1083 mds->locker->remove_stale_leases(session);
1084 finish_flush_session(session, session->get_push_seq());
1085 auto m = make_message<MClientSession>(CEPH_SESSION_STALE, session->get_push_seq());
1086 mds->send_message_client(m, session);
1087 } else {
1088 to_evict.push_back(session);
1089 }
1090 }
1091 }
1092
1093 // autoclose
1094 cutoff = queue_max_age + mds->mdsmap->get_session_autoclose();
1095
1096 // Collect a list of sessions exceeding the autoclose threshold
1097 const auto sessions_p2 = mds->sessionmap.by_state.find(Session::STATE_STALE);
1098 if (sessions_p2 != mds->sessionmap.by_state.end() && !sessions_p2->second->empty()) {
1099 for (auto session : *(sessions_p2->second)) {
1100 assert(session->is_stale());
1101 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1102 if (last_cap_renew_span < cutoff) {
1103 dout(20) << "oldest stale session is " << session->info.inst
1104 << " and recently renewed caps " << last_cap_renew_span << "s ago" << dendl;
1105 break;
1106 }
1107 to_evict.push_back(session);
1108 }
1109 }
1110
1111 for (auto session: to_evict) {
1112 if (session->is_importing()) {
1113 dout(10) << "skipping session " << session->info.inst << ", it's being imported" << dendl;
1114 continue;
1115 }
1116
1117 auto last_cap_renew_span = std::chrono::duration<double>(now - session->last_cap_renew).count();
1118 mds->clog->warn() << "evicting unresponsive client " << *session
1119 << ", after " << last_cap_renew_span << " seconds";
1120 dout(10) << "autoclosing stale session " << session->info.inst
1121 << " last renewed caps " << last_cap_renew_span << "s ago" << dendl;
1122
1123 if (g_conf()->mds_session_blacklist_on_timeout) {
1124 std::stringstream ss;
1125 mds->evict_client(session->get_client().v, false, true, ss, nullptr);
1126 } else {
1127 kill_session(session, NULL);
1128 }
1129 }
1130 }
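
// Both thresholds used above come from the MDSMap, so they are per-filesystem
// settings rather than per-daemon config. Assuming a filesystem named
// "cephfs", they can be tuned with (a usage sketch):
//
//   ceph fs set cephfs session_timeout 60      # seconds before caps go stale
//   ceph fs set cephfs session_autoclose 300   # seconds before eviction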
1131
1132 void Server::evict_cap_revoke_non_responders() {
1133 if (!cap_revoke_eviction_timeout) {
1134 return;
1135 }
1136
1137 auto&& to_evict = mds->locker->get_late_revoking_clients(cap_revoke_eviction_timeout);
1138
1139 for (auto const &client: to_evict) {
1140 mds->clog->warn() << "client id " << client << " has not responded to"
1141 << " cap revoke by MDS for over " << cap_revoke_eviction_timeout
1142 << " seconds, evicting";
1143 dout(1) << __func__ << ": evicting cap revoke non-responder client id "
1144 << client << dendl;
1145
1146 std::stringstream ss;
1147 bool evicted = mds->evict_client(client.v, false,
1148 g_conf()->mds_session_blacklist_on_evict,
1149 ss, nullptr);
1150 if (evicted && logger) {
1151 logger->inc(l_mdss_cap_revoke_eviction);
1152 }
1153 }
1154 }
1155
1156 void Server::handle_conf_change(const std::set<std::string>& changed) {
1157 if (changed.count("mds_replay_unsafe_with_closed_session")) {
1158 replay_unsafe_with_closed_session = g_conf().get_val<bool>("mds_replay_unsafe_with_closed_session");
1159 }
1160 if (changed.count("mds_cap_revoke_eviction_timeout")) {
1161 cap_revoke_eviction_timeout = g_conf().get_val<double>("mds_cap_revoke_eviction_timeout");
1162 dout(20) << __func__ << " cap revoke eviction timeout changed to "
1163 << cap_revoke_eviction_timeout << dendl;
1164 }
1165 if (changed.count("mds_recall_max_decay_rate")) {
1166 recall_throttle = DecayCounter(g_conf().get_val<double>("mds_recall_max_decay_rate"));
1167 }
1168 if (changed.count("mds_max_snaps_per_dir")) {
1169 max_snaps_per_dir = g_conf().get_val<uint64_t>("mds_max_snaps_per_dir");
1170 dout(20) << __func__ << " max snapshots per directory changed to "
1171 << max_snaps_per_dir << dendl;
1172 }
1173 if (changed.count("mds_client_delegate_inos_pct")) {
1174 delegate_inos_pct = g_conf().get_val<uint64_t>("mds_client_delegate_inos_pct");
1175 }
1176 }
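
// All of the keys handled above are runtime-changeable; the config observer
// delivers the changed set here without an MDS restart. For example (a usage
// sketch):
//
//   ceph config set mds mds_cap_revoke_eviction_timeout 300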
1177
1178 /*
1179 * XXX bump in the interface here, not using an MDSContext here
1180 * because all the callers right now happen to use a SaferCond
1181 */
1182 void Server::kill_session(Session *session, Context *on_safe, bool need_purge_inos)
1183 {
1184 ceph_assert(ceph_mutex_is_locked_by_me(mds->mds_lock));
1185
1186 if ((session->is_opening() ||
1187 session->is_open() ||
1188 session->is_stale()) &&
1189 !session->is_importing()) {
1190 dout(10) << "kill_session " << session << dendl;
1191 journal_close_session(session, Session::STATE_KILLING, on_safe, need_purge_inos);
1192 } else {
1193 dout(10) << "kill_session importing or already closing/killing " << session << dendl;
1194 if (session->is_closing() ||
1195 session->is_killing()) {
1196 if (on_safe)
1197 mdlog->wait_for_safe(new MDSInternalContextWrapper(mds, on_safe));
1198 } else {
1199 ceph_assert(session->is_closed() ||
1200 session->is_importing());
1201 if (on_safe)
1202 on_safe->complete(0);
1203 }
1204 }
1205 }
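
// Per the XXX above, callers pass a plain Context and typically block on it.
// A minimal sketch of that pattern (C_SaferCond is the stock waitable
// Context; note kill_session() asserts mds_lock is held, so drop the lock
// before waiting):
//
//   C_SaferCond on_safe;
//   {
//     std::lock_guard l(mds->mds_lock);
//     server->kill_session(session, &on_safe);
//   }
//   on_safe.wait();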
1206
1207 size_t Server::apply_blacklist(const std::set<entity_addr_t> &blacklist)
1208 {
1209 bool prenautilus = mds->objecter->with_osdmap(
1210 [&](const OSDMap& o) {
1211 return o.require_osd_release < ceph_release_t::nautilus;
1212 });
1213
1214 std::vector<Session*> victims;
1215 const auto& sessions = mds->sessionmap.get_sessions();
1216 for (const auto& p : sessions) {
1217 if (!p.first.is_client()) {
1218 // Do not apply OSDMap blacklist to MDS daemons, we find out
1219 // about their death via MDSMap.
1220 continue;
1221 }
1222
1223 Session *s = p.second;
1224 auto inst_addr = s->info.inst.addr;
1225 // blacklist entries are always TYPE_ANY for nautilus+
1226 inst_addr.set_type(entity_addr_t::TYPE_ANY);
1227 if (blacklist.count(inst_addr)) {
1228 victims.push_back(s);
1229 continue;
1230 }
1231 if (prenautilus) {
1232 // ...except pre-nautilus, they were TYPE_LEGACY
1233 inst_addr.set_type(entity_addr_t::TYPE_LEGACY);
1234 if (blacklist.count(inst_addr)) {
1235 victims.push_back(s);
1236 }
1237 }
1238 }
1239
1240 for (const auto& s : victims) {
1241 kill_session(s, nullptr);
1242 }
1243
1244 dout(10) << "apply_blacklist: killed " << victims.size() << dendl;
1245
1246 return victims.size();
1247 }
1248
1249 void Server::journal_close_session(Session *session, int state, Context *on_safe, bool need_purge_inos)
1250 {
1251 dout(10) << __func__ << " : "
1252 << "("<< need_purge_inos << ")"
1253 << session->info.inst
1254 << "(" << session->info.prealloc_inos.size() << "|" << session->pending_prealloc_inos.size() << ")" << dendl;
1255
1256 uint64_t sseq = mds->sessionmap.set_state(session, state);
1257 version_t pv = mds->sessionmap.mark_projected(session);
1258 version_t piv = 0;
1259
1260 // release alloc and pending-alloc inos for this session
1261 // and wipe out session state, in case the session close aborts for some reason
1262 interval_set<inodeno_t> both;
1263 both.insert(session->pending_prealloc_inos);
1264 if (!need_purge_inos)
1265 both.insert(session->info.prealloc_inos);
1266 if (both.size()) {
1267 mds->inotable->project_release_ids(both);
1268 piv = mds->inotable->get_projected_version();
1269 } else
1270 piv = 0;
1271
1272 if(need_purge_inos && session->info.prealloc_inos.size()) {
1273 dout(10) << "start purge indoes " << session->info.prealloc_inos << dendl;
1274 LogSegment* ls = mdlog->get_current_segment();
1275 LogEvent* e = new ESession(session->info.inst, false, pv, both, piv, session->info.prealloc_inos);
1276 MDSLogContextBase* c = new C_MDS_session_finish(this, session, sseq, false, pv, both, piv,
1277 session->info.prealloc_inos, ls, on_safe);
1278 mdlog->start_submit_entry(e, c);
1279 } else {
1280 interval_set<inodeno_t> empty;
1281 LogEvent* e = new ESession(session->info.inst, false, pv, both, piv, empty);
1282 MDSLogContextBase* c = new C_MDS_session_finish(this, session, sseq, false, pv, both, piv, on_safe);
1283 mdlog->start_submit_entry(e, c);
1284 }
1285 mdlog->flush();
1286
1287 // clean up requests, too
1288 for (auto p = session->requests.begin(); !p.end(); ) {
1289 MDRequestRef mdr(*p);
1290 ++p;
1291 mdcache->request_kill(mdr);
1292 }
1293
1294 finish_flush_session(session, session->get_push_seq());
1295 }
1296
1297 void Server::reconnect_clients(MDSContext *reconnect_done_)
1298 {
1299 reconnect_done = reconnect_done_;
1300
1301 auto now = clock::now();
1302 set<Session*> sessions;
1303 mds->sessionmap.get_client_session_set(sessions);
1304 for (auto session : sessions) {
1305 if (session->is_open()) {
1306 client_reconnect_gather.insert(session->get_client());
1307 session->set_reconnecting(true);
1308 session->last_cap_renew = now;
1309 }
1310 }
1311
1312 if (client_reconnect_gather.empty()) {
1313 dout(7) << "reconnect_clients -- no sessions, doing nothing." << dendl;
1314 reconnect_gather_finish();
1315 return;
1316 }
1317
1318 // clients will get the mdsmap and discover we're reconnecting via the monitor.
1319
1320 reconnect_start = now;
1321 dout(1) << "reconnect_clients -- " << client_reconnect_gather.size() << " sessions" << dendl;
1322 mds->sessionmap.dump();
1323 }
1324
1325 void Server::handle_client_reconnect(const cref_t<MClientReconnect> &m)
1326 {
1327 dout(7) << "handle_client_reconnect " << m->get_source()
1328 << (m->has_more() ? " (more)" : "") << dendl;
1329 client_t from = m->get_source().num();
1330 Session *session = mds->get_session(m);
1331 if (!session) {
1332 dout(0) << " ignoring sessionless msg " << *m << dendl;
1333 auto reply = make_message<MClientSession>(CEPH_SESSION_REJECT);
1334 reply->metadata["error_string"] = "sessionless";
1335 mds->send_message(reply, m->get_connection());
1336 return;
1337 }
1338
1339 if (!session->is_open()) {
1340 dout(0) << " ignoring msg from not-open session" << *m << dendl;
1341 auto reply = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1342 mds->send_message(reply, m->get_connection());
1343 return;
1344 }
1345
1346 if (!mds->is_reconnect() && mds->get_want_state() == CEPH_MDS_STATE_RECONNECT) {
1347 dout(10) << " we're almost in reconnect state (mdsmap delivery race?); waiting" << dendl;
1348 mds->wait_for_reconnect(new C_MDS_RetryMessage(mds, m));
1349 return;
1350 }
1351
1352 auto delay = std::chrono::duration<double>(clock::now() - reconnect_start).count();
1353 dout(10) << " reconnect_start " << reconnect_start << " delay " << delay << dendl;
1354
1355 bool deny = false;
1356 if (!mds->is_reconnect() || mds->get_want_state() != CEPH_MDS_STATE_RECONNECT || reconnect_evicting) {
1357 // XXX maybe in the future we can do better than this?
1358 dout(1) << " no longer in reconnect state, ignoring reconnect, sending close" << dendl;
1359 mds->clog->info() << "denied reconnect attempt (mds is "
1360 << ceph_mds_state_name(mds->get_state())
1361 << ") from " << m->get_source_inst()
1362 << " after " << delay << " (allowed interval " << g_conf()->mds_reconnect_timeout << ")";
1363 deny = true;
1364 } else {
1365 std::string error_str;
1366 if (!session->is_open()) {
1367 error_str = "session is closed";
1368 } else if (mdcache->is_readonly()) {
1369 error_str = "mds is readonly";
1370 } else {
1371 if (session->info.client_metadata.features.empty())
1372 infer_supported_features(session, session->info.client_metadata);
1373
1374 feature_bitset_t missing_features = required_client_features;
1375 missing_features -= session->info.client_metadata.features;
1376 if (!missing_features.empty()) {
1377 stringstream ss;
1378 ss << "missing required features '" << missing_features << "'";
1379 error_str = ss.str();
1380 }
1381 }
1382
1383 if (!error_str.empty()) {
1384 deny = true;
1385 dout(1) << " " << error_str << ", ignoring reconnect, sending close" << dendl;
1386 mds->clog->info() << "denied reconnect attempt from "
1387 << m->get_source_inst() << " (" << error_str << ")";
1388 }
1389 }
1390
1391 if (deny) {
1392 auto r = make_message<MClientSession>(CEPH_SESSION_CLOSE);
1393 mds->send_message_client(r, session);
1394 if (session->is_open())
1395 kill_session(session, nullptr);
1396 return;
1397 }
1398
1399 if (!m->has_more()) {
1400 // notify client of success with an OPEN
1401 auto reply = make_message<MClientSession>(CEPH_SESSION_OPEN);
1402 if (session->info.has_feature(CEPHFS_FEATURE_MIMIC))
1403 reply->supported_features = supported_features;
1404 mds->send_message_client(reply, session);
1405 mds->clog->debug() << "reconnect by " << session->info.inst << " after " << delay;
1406 }
1407
1408 session->last_cap_renew = clock::now();
1409
1410 // snaprealms
1411 for (const auto &r : m->realms) {
1412 CInode *in = mdcache->get_inode(inodeno_t(r.realm.ino));
1413 if (in && in->state_test(CInode::STATE_PURGING))
1414 continue;
1415 if (in) {
1416 if (in->snaprealm) {
1417 dout(15) << "open snaprealm (w inode) on " << *in << dendl;
1418 } else {
1419 // this can happen if we are non-auth or we rollback snaprealm
1420 dout(15) << "open snaprealm (null snaprealm) on " << *in << dendl;
1421 }
1422 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1423 } else {
1424 dout(15) << "open snaprealm (w/o inode) on " << inodeno_t(r.realm.ino)
1425 << " seq " << r.realm.seq << dendl;
1426 mdcache->add_reconnected_snaprealm(from, inodeno_t(r.realm.ino), snapid_t(r.realm.seq));
1427 }
1428 }
1429
1430 // caps
1431 for (const auto &p : m->caps) {
1432 // make sure our last_cap_id is MAX over all issued caps
1433 if (p.second.capinfo.cap_id > mdcache->last_cap_id)
1434 mdcache->last_cap_id = p.second.capinfo.cap_id;
1435
1436 CInode *in = mdcache->get_inode(p.first);
1437 if (in && in->state_test(CInode::STATE_PURGING))
1438 continue;
1439 if (in && in->is_auth()) {
1440 // we recovered it, and it's ours. take note.
1441 dout(15) << "open cap realm " << inodeno_t(p.second.capinfo.snaprealm)
1442 << " on " << *in << dendl;
1443 in->reconnect_cap(from, p.second, session);
1444 mdcache->add_reconnected_cap(from, p.first, p.second);
1445 recover_filelocks(in, p.second.flockbl, m->get_orig_source().num());
1446 continue;
1447 }
1448
1449 if (in && !in->is_auth()) {
1450 // not mine.
1451 dout(10) << "non-auth " << *in << ", will pass off to authority" << dendl;
1452 // add to cap export list.
1453 mdcache->rejoin_export_caps(p.first, from, p.second,
1454 in->authority().first, true);
1455 } else {
1456 // don't know if the inode is mine
1457 dout(10) << "missing ino " << p.first << ", will load later" << dendl;
1458 mdcache->rejoin_recovered_caps(p.first, from, p.second, MDS_RANK_NONE);
1459 }
1460 }
1461
1462 reconnect_last_seen = clock::now();
1463
1464 if (!m->has_more()) {
1465 mdcache->rejoin_recovered_client(session->get_client(), session->info.inst);
1466
1467 // remove from gather set
1468 client_reconnect_gather.erase(from);
1469 session->set_reconnecting(false);
1470 if (client_reconnect_gather.empty())
1471 reconnect_gather_finish();
1472 }
1473 }
1474
1475 void Server::infer_supported_features(Session *session, client_metadata_t& client_metadata)
1476 {
1477 int supported = -1;
1478 auto it = client_metadata.find("ceph_version");
1479 if (it != client_metadata.end()) {
1480 // user space client
1481 if (it->second.compare(0, 16, "ceph version 12.") == 0)
1482 supported = CEPHFS_FEATURE_LUMINOUS;
1483 else if (session->get_connection()->has_feature(CEPH_FEATURE_FS_CHANGE_ATTR))
1484 supported = CEPHFS_FEATURE_KRAKEN;
1485 } else {
1486 it = client_metadata.find("kernel_version");
1487 if (it != client_metadata.end()) {
1488 // kernel client
1489 if (session->get_connection()->has_feature(CEPH_FEATURE_NEW_OSDOP_ENCODING))
1490 supported = CEPHFS_FEATURE_LUMINOUS;
1491 }
1492 }
1493 if (supported == -1 &&
1494 session->get_connection()->has_feature(CEPH_FEATURE_FS_FILE_LAYOUT_V2))
1495 supported = CEPHFS_FEATURE_JEWEL;
1496
1497 if (supported >= 0) {
1498 unsigned long value = (1UL << (supported + 1)) - 1;
1499 client_metadata.features = feature_bitset_t(value);
1500 dout(10) << __func__ << " got '" << client_metadata.features << "'" << dendl;
1501 }
1502 }
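
// Worked example for the mask above: if supported resolves to, say, bit 7
// (the concrete values live in cephfs_features.h), then
// (1UL << 8) - 1 == 0xff, i.e. a feature_bitset_t with bits 0..7 set; the
// client is credited with every feature up to and including the inferred
// release.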
1503
1504 void Server::update_required_client_features()
1505 {
1506 vector<size_t> bits = CEPHFS_FEATURES_MDS_REQUIRED;
1507
1508 /* If this blows up on you, you added a release without adding a new release bit to cephfs_features.h */
1509 static_assert(CEPHFS_CURRENT_RELEASE == CEPH_RELEASE_MAX-1);
1510
1511 ceph_release_t min_compat = mds->mdsmap->get_min_compat_client();
1512 if (min_compat >= ceph_release_t::octopus)
1513 bits.push_back(CEPHFS_FEATURE_OCTOPUS);
1514 else if (min_compat >= ceph_release_t::nautilus)
1515 bits.push_back(CEPHFS_FEATURE_NAUTILUS);
1516 else if (min_compat >= ceph_release_t::mimic)
1517 bits.push_back(CEPHFS_FEATURE_MIMIC);
1518 else if (min_compat >= ceph_release_t::luminous)
1519 bits.push_back(CEPHFS_FEATURE_LUMINOUS);
1520 else if (min_compat >= ceph_release_t::kraken)
1521 bits.push_back(CEPHFS_FEATURE_KRAKEN);
1522 else if (min_compat >= ceph_release_t::jewel)
1523 bits.push_back(CEPHFS_FEATURE_JEWEL);
1524
1525 std::sort(bits.begin(), bits.end());
1526 required_client_features = feature_bitset_t(bits);
1527 dout(7) << "required_client_features: " << required_client_features << dendl;
1528
1529 if (mds->get_state() >= MDSMap::STATE_RECONNECT) {
1530 set<Session*> sessions;
1531 mds->sessionmap.get_client_session_set(sessions);
1532 for (auto session : sessions) {
1533 feature_bitset_t missing_features = required_client_features;
1534 missing_features -= session->info.client_metadata.features;
1535 if (!missing_features.empty()) {
1536 bool blacklisted = mds->objecter->with_osdmap(
1537 [session](const OSDMap &osd_map) -> bool {
1538 return osd_map.is_blacklisted(session->info.inst.addr);
1539 });
1540 if (blacklisted)
1541 continue;
1542
1543 mds->clog->warn() << "evicting session " << *session << ", missing required features '"
1544 << missing_features << "'";
1545 std::stringstream ss;
1546 mds->evict_client(session->get_client().v, false,
1547 g_conf()->mds_session_blacklist_on_evict, ss);
1548 }
1549 }
1550 }
1551 }
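// Eviction check sketch (the feature names are real, the example sets are
// illustrative): feature_bitset_t subtraction leaves the bits that are
// required but not advertised by the client, e.g.
//
//   required_client_features = {JEWEL, LUMINOUS, NAUTILUS}
//   session features         = {JEWEL, LUMINOUS}
//   missing_features         = required - session = {NAUTILUS}  -> evict
//
// Blacklisted sessions are skipped because they are already on their way out.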
1552
1553 void Server::reconnect_gather_finish()
1554 {
1555 dout(7) << "reconnect_gather_finish. failed on " << failed_reconnects << " clients" << dendl;
1556 ceph_assert(reconnect_done);
1557
1558 if (!mds->snapclient->is_synced()) {
1559 // make sure the snaptable cache is populated. snaprealms will be
1560 // used extensively in the rejoin stage.
1561 dout(7) << " snaptable cache isn't synced, delaying state transition" << dendl;
1562 mds->snapclient->wait_for_sync(reconnect_done);
1563 } else {
1564 reconnect_done->complete(0);
1565 }
1566 reconnect_done = NULL;
1567 }
1568
1569 void Server::reconnect_tick()
1570 {
1571 if (reconnect_evicting) {
1572 dout(7) << "reconnect_tick: waiting for evictions" << dendl;
1573 return;
1574 }
1575
1576 if (client_reconnect_gather.empty())
1577 return;
1578
1579 auto now = clock::now();
1580 auto elapse1 = std::chrono::duration<double>(now - reconnect_start).count();
1581 if (elapse1 < g_conf()->mds_reconnect_timeout)
1582 return;
1583
1584 vector<Session*> remaining_sessions;
1585 remaining_sessions.reserve(client_reconnect_gather.size());
1586 for (auto c : client_reconnect_gather) {
1587 Session *session = mds->sessionmap.get_session(entity_name_t::CLIENT(c.v));
1588 ceph_assert(session);
1589 remaining_sessions.push_back(session);
1590 // client re-sends cap flush messages before the reconnect message
1591 if (session->last_seen > reconnect_last_seen)
1592 reconnect_last_seen = session->last_seen;
1593 }
1594
1595 auto elapse2 = std::chrono::duration<double>(now - reconnect_last_seen).count();
1596 if (elapse2 < g_conf()->mds_reconnect_timeout / 2) {
1597 dout(7) << "reconnect_tick: last seen " << elapse2
1598 << " seconds ago, extending reconnect interval" << dendl;
1599 return;
1600 }
1601
1602 dout(7) << "reconnect timed out, " << remaining_sessions.size()
1603 << " clients have not reconnected in time" << dendl;
1604
1605 // If we're doing blacklist evictions, use this to wait for them before
1606 // proceeding to reconnect_gather_finish
1607 MDSGatherBuilder gather(g_ceph_context);
1608
1609 for (auto session : remaining_sessions) {
1610 // Keep sessions that have specified a timeout. These sessions prevent
1611 // the mds from going active; it goes active only after they have all
1612 // been killed or reclaimed.
1613 if (session->info.client_metadata.find("timeout") !=
1614 session->info.client_metadata.end()) {
1615 dout(1) << "reconnect keeps " << session->info.inst
1616 << ", needs to be reclaimed" << dendl;
1617 client_reclaim_gather.insert(session->get_client());
1618 continue;
1619 }
1620
1621 dout(1) << "reconnect gives up on " << session->info.inst << dendl;
1622
1623 mds->clog->warn() << "evicting unresponsive client " << *session
1624 << ", after waiting " << elapse1
1625 << " seconds during MDS startup";
1626
1627 if (g_conf()->mds_session_blacklist_on_timeout) {
1628 std::stringstream ss;
1629 mds->evict_client(session->get_client().v, false, true, ss,
1630 gather.new_sub());
1631 } else {
1632 kill_session(session, NULL, true);
1633 }
1634
1635 failed_reconnects++;
1636 }
1637 client_reconnect_gather.clear();
1638
1639 if (gather.has_subs()) {
1640 dout(1) << "reconnect will complete once clients are evicted" << dendl;
1641 gather.set_finisher(new MDSInternalContextWrapper(mds, new LambdaContext(
1642 [this](int r){reconnect_gather_finish();})));
1643 gather.activate();
1644 reconnect_evicting = true;
1645 } else {
1646 reconnect_gather_finish();
1647 }
1648 }
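// Timing sketch with hypothetical numbers (mds_reconnect_timeout = 45 is the
// usual default, but treat that as an assumption): reconnect_tick() does
// nothing until 45s have elapsed since reconnect_start (elapse1); even then,
// if any straggler was seen within the last 22.5s (elapse2 < timeout/2) the
// window is extended, on the theory that a client still sending cap flushes
// is alive and worth waiting for. Only when both checks fail are remaining
// sessions evicted, or parked in client_reclaim_gather if they declared a
// "timeout" in their session metadata.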
1649
1650 void Server::recover_filelocks(CInode *in, bufferlist locks, int64_t client)
1651 {
1652 if (!locks.length()) return;
1653 int numlocks;
1654 ceph_filelock lock;
1655 auto p = locks.cbegin();
1656 decode(numlocks, p);
1657 for (int i = 0; i < numlocks; ++i) {
1658 decode(lock, p);
1659 lock.client = client;
1660 in->get_fcntl_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock>(lock.start, lock));
1661 ++in->get_fcntl_lock_state()->client_held_lock_counts[client];
1662 }
1663 decode(numlocks, p);
1664 for (int i = 0; i < numlocks; ++i) {
1665 decode(lock, p);
1666 lock.client = client;
1667 in->get_flock_lock_state()->held_locks.insert(pair<uint64_t, ceph_filelock> (lock.start, lock));
1668 ++in->get_flock_lock_state()->client_held_lock_counts[client];
1669 }
1670 }
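// Wire format consumed above (inferred from the decode calls, not a normative
// spec): two back-to-back sections, each an int count followed by that many
// ceph_filelock records --
//
//   int32 numlocks; ceph_filelock[numlocks]   // fcntl (POSIX) locks
//   int32 numlocks; ceph_filelock[numlocks]   // flock (BSD) locks
//
// Note the client field of each decoded lock is overwritten with the
// reconnecting client's id rather than trusted from the wire.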
1671
1672 /**
1673 * Call this when the MDCache is oversized, to send requests to the clients
1674 * to trim some caps, and consequently unpin some inodes in the MDCache so
1675 * that it can trim too.
1676 */
1677 std::pair<bool, uint64_t> Server::recall_client_state(MDSGatherBuilder* gather, RecallFlags flags)
1678 {
1679 const auto now = clock::now();
1680 const bool steady = !!(flags&RecallFlags::STEADY);
1681 const bool enforce_max = !!(flags&RecallFlags::ENFORCE_MAX);
1682 const bool enforce_liveness = !!(flags&RecallFlags::ENFORCE_LIVENESS);
1683 const bool trim = !!(flags&RecallFlags::TRIM);
1684
1685 const auto max_caps_per_client = g_conf().get_val<uint64_t>("mds_max_caps_per_client");
1686 const auto min_caps_per_client = g_conf().get_val<uint64_t>("mds_min_caps_per_client");
1687 const auto recall_global_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_global_max_decay_threshold");
1688 const auto recall_max_caps = g_conf().get_val<Option::size_t>("mds_recall_max_caps");
1689 const auto recall_max_decay_threshold = g_conf().get_val<Option::size_t>("mds_recall_max_decay_threshold");
1690 const auto cache_liveness_magnitude = g_conf().get_val<Option::size_t>("mds_session_cache_liveness_magnitude");
1691
1692 dout(7) << __func__ << ":"
1693 << " min=" << min_caps_per_client
1694 << " max=" << max_caps_per_client
1695 << " total=" << Capability::count()
1696 << " flags=" << flags
1697 << dendl;
1698
1699 /* trim caps of sessions with the most caps first */
1700 std::multimap<uint64_t, Session*> caps_session;
1701 auto f = [&caps_session, enforce_max, enforce_liveness, trim, max_caps_per_client, cache_liveness_magnitude](auto& s) {
1702 auto num_caps = s->caps.size();
1703 auto cache_liveness = s->get_session_cache_liveness();
1704 if (trim || (enforce_max && num_caps > max_caps_per_client) || (enforce_liveness && cache_liveness < (num_caps>>cache_liveness_magnitude))) {
1705 caps_session.emplace(std::piecewise_construct, std::forward_as_tuple(num_caps), std::forward_as_tuple(s));
1706 }
1707 };
1708 mds->sessionmap.get_client_sessions(std::move(f));
1709
1710 std::pair<bool, uint64_t> result = {false, 0};
1711 auto& [throttled, caps_recalled] = result;
1712 last_recall_state = now;
1713 for (const auto& [num_caps, session] : boost::adaptors::reverse(caps_session)) {
1714 if (!session->is_open() ||
1715 !session->get_connection() ||
1716 !session->info.inst.name.is_client())
1717 continue;
1718
1719 dout(10) << __func__ << ":"
1720 << " session " << session->info.inst
1721 << " caps " << num_caps
1722 << ", leases " << session->leases.size()
1723 << dendl;
1724
1725 uint64_t newlim;
1726 if (num_caps < recall_max_caps || (num_caps-recall_max_caps) < min_caps_per_client) {
1727 newlim = min_caps_per_client;
1728 } else {
1729 newlim = num_caps-recall_max_caps;
1730 }
1731 if (num_caps > newlim) {
1732 /* now limit the number of caps we recall at a time to prevent overloading ourselves */
1733 uint64_t recall = std::min<uint64_t>(recall_max_caps, num_caps-newlim);
1734 newlim = num_caps-recall;
1735 const uint64_t session_recall_throttle = session->get_recall_caps_throttle();
1736 const uint64_t session_recall_throttle2o = session->get_recall_caps_throttle2o();
1737 const uint64_t global_recall_throttle = recall_throttle.get();
1738 if (session_recall_throttle+recall > recall_max_decay_threshold) {
1739 dout(15) << " session recall threshold (" << recall_max_decay_threshold << ") hit at " << session_recall_throttle << "; skipping!" << dendl;
1740 throttled = true;
1741 continue;
1742 } else if (session_recall_throttle2o+recall > recall_max_caps*2) {
1743 dout(15) << " session recall 2nd-order threshold (" << 2*recall_max_caps << ") hit at " << session_recall_throttle2o << "; skipping!" << dendl;
1744 throttled = true;
1745 continue;
1746 } else if (global_recall_throttle+recall > recall_global_max_decay_threshold) {
1747 dout(15) << " global recall threshold (" << recall_global_max_decay_threshold << ") hit at " << global_recall_throttle << "; skipping!" << dendl;
1748 throttled = true;
1749 break;
1750 }
1751
1752 // now check if we've recalled caps recently and the client is unlikely to satisfy a new recall
1753 if (steady) {
1754 const auto session_recall = session->get_recall_caps();
1755 const auto session_release = session->get_release_caps();
1756 if (2*session_release < session_recall && 2*session_recall > recall_max_decay_threshold) {
1757 /* The session has released fewer than half the caps we recalled;
1758 * additionally, to avoid flagging sessions we've only just begun to
1759 * recall from, we also require that the session_recall counter
1760 * (decayed count of caps recently recalled) exceed half the session's
1761 * recall threshold.
1762 */
1763 dout(15) << " 2*session_release < session_recall"
1764 " (2*" << session_release << " < " << session_recall << ") &&"
1765 " 2*session_recall > recall_max_decay_threshold"
1766 " (2*" << session_recall << " > " << recall_max_decay_threshold << ")"
1767 " Skipping because we are unlikely to get more released." << dendl;
1768 continue;
1769 } else if (recall < recall_max_caps && 2*recall < session_recall) {
1770 /* The number of caps we would recall here is less than the number we
1771 * *could* recall (so there isn't much left to recall?) and is also less
1772 * than half the current session_recall counter (decayed count of caps
1773 * recently recalled).
1774 */
1775 dout(15) << " 2*recall < session_recall "
1776 " (2*" << recall << " < " << session_recall << ") &&"
1777 " recall < recall_max_caps (" << recall << " < " << recall_max_caps << ");"
1778 " Skipping because we are unlikely to get more released." << dendl;
1779 continue;
1780 }
1781 }
1782
1783 dout(7) << " recalling " << recall << " caps; session_recall_throttle = " << session_recall_throttle << "; global_recall_throttle = " << global_recall_throttle << dendl;
1784
1785 auto m = make_message<MClientSession>(CEPH_SESSION_RECALL_STATE);
1786 m->head.max_caps = newlim;
1787 mds->send_message_client(m, session);
1788 if (gather) {
1789 flush_session(session, gather);
1790 }
1791 caps_recalled += session->notify_recall_sent(newlim);
1792 recall_throttle.hit(recall);
1793 }
1794 }
1795
1796 dout(7) << "recalled" << (throttled ? " (throttled)" : "") << " " << caps_recalled << " client caps." << dendl;
1797
1798 return result;
1799 }
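// Caller sketch (hypothetical; the real call sites live in the cache-trim
// paths, and `on_recalled` is an invented continuation): recall with a gather
// so the caller can wait for the clients' flush acks, assuming the usual
// bitwise operators are defined for RecallFlags.
//
//   MDSGatherBuilder gather(g_ceph_context);
//   auto [throttled, recalled] =
//       recall_client_state(&gather, RecallFlags::ENFORCE_MAX | RecallFlags::TRIM);
//   if (gather.has_subs()) {
//     gather.set_finisher(new MDSInternalContextWrapper(mds, on_recalled));
//     gather.activate();
//   }
//   // `throttled` means at least one session was skipped by a decay counter,
//   // so another pass will be needed once the counters cool down.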
1800
1801 void Server::force_clients_readonly()
1802 {
1803 dout(10) << "force_clients_readonly" << dendl;
1804 set<Session*> sessions;
1805 mds->sessionmap.get_client_session_set(sessions);
1806 for (set<Session*>::const_iterator p = sessions.begin();
1807 p != sessions.end();
1808 ++p) {
1809 Session *session = *p;
1810 if (!session->info.inst.name.is_client() ||
1811 !(session->is_open() || session->is_stale()))
1812 continue;
1813 mds->send_message_client(make_message<MClientSession>(CEPH_SESSION_FORCE_RO), session);
1814 }
1815 }
1816
1817 /*******
1818 * some generic stuff for finishing off requests
1819 */
1820 void Server::journal_and_reply(MDRequestRef& mdr, CInode *in, CDentry *dn, LogEvent *le, MDSLogContextBase *fin)
1821 {
1822 dout(10) << "journal_and_reply tracei " << in << " tracedn " << dn << dendl;
1823 ceph_assert(!mdr->has_completed);
1824
1825 // note trace items for eventual reply.
1826 mdr->tracei = in;
1827 if (in)
1828 mdr->pin(in);
1829
1830 mdr->tracedn = dn;
1831 if (dn)
1832 mdr->pin(dn);
1833
1834 early_reply(mdr, in, dn);
1835
1836 mdr->committing = true;
1837 submit_mdlog_entry(le, fin, mdr, __func__);
1838
1839 if (mdr->client_request && mdr->client_request->is_queued_for_replay()) {
1840 if (mds->queue_one_replay()) {
1841 dout(10) << " queued next replay op" << dendl;
1842 } else {
1843 dout(10) << " journaled last replay op" << dendl;
1844 }
1845 } else if (mdr->did_early_reply)
1846 mds->locker->drop_rdlocks_for_early_reply(mdr.get());
1847 else
1848 mdlog->flush();
1849 }
1850
1851 void Server::submit_mdlog_entry(LogEvent *le, MDSLogContextBase *fin, MDRequestRef& mdr,
1852 std::string_view event)
1853 {
1854 if (mdr) {
1855 string event_str("submit entry: ");
1856 event_str += event;
1857 mdr->mark_event(event_str);
1858 }
1859 mdlog->submit_entry(le, fin);
1860 }
1861
1862 /*
1863 * send response built from mdr contents and error code; clean up mdr
1864 */
1865 void Server::respond_to_request(MDRequestRef& mdr, int r)
1866 {
1867 if (mdr->client_request) {
1868 if (mdr->is_batch_op() && mdr->is_batch_head) {
1869 int mask = mdr->client_request->head.args.getattr.mask;
1870
1871 std::unique_ptr<BatchOp> bop;
1872 if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR) {
1873 dout(20) << __func__ << ": respond other getattr ops. " << *mdr << dendl;
1874 auto it = mdr->in[0]->batch_ops.find(mask);
1875 bop = std::move(it->second);
1876 mdr->in[0]->batch_ops.erase(it);
1877 } else {
1878 dout(20) << __func__ << ": respond other lookup ops. " << *mdr << dendl;
1879 auto it = mdr->dn[0].back()->batch_ops.find(mask);
1880 bop = std::move(it->second);
1881 mdr->dn[0].back()->batch_ops.erase(it);
1882 }
1883
1884 bop->respond(r);
1885 } else {
1886 reply_client_request(mdr, make_message<MClientReply>(*mdr->client_request, r));
1887 }
1888 } else if (mdr->internal_op > -1) {
1889 dout(10) << "respond_to_request on internal request " << mdr << dendl;
1890 if (!mdr->internal_op_finish)
1891 ceph_abort_msg("trying to respond to internal op without finisher");
1892 mdr->internal_op_finish->complete(r);
1893 mdcache->request_finish(mdr);
1894 }
1895 }
1896
1897 // statistics mds req op number and latency
1898 void Server::perf_gather_op_latency(const cref_t<MClientRequest> &req, utime_t lat)
1899 {
1900 int code = l_mdss_first;
1901 switch(req->get_op()) {
1902 case CEPH_MDS_OP_LOOKUPHASH:
1903 code = l_mdss_req_lookuphash_latency;
1904 break;
1905 case CEPH_MDS_OP_LOOKUPINO:
1906 code = l_mdss_req_lookupino_latency;
1907 break;
1908 case CEPH_MDS_OP_LOOKUPPARENT:
1909 code = l_mdss_req_lookupparent_latency;
1910 break;
1911 case CEPH_MDS_OP_LOOKUPNAME:
1912 code = l_mdss_req_lookupname_latency;
1913 break;
1914 case CEPH_MDS_OP_LOOKUP:
1915 code = l_mdss_req_lookup_latency;
1916 break;
1917 case CEPH_MDS_OP_LOOKUPSNAP:
1918 code = l_mdss_req_lookupsnap_latency;
1919 break;
1920 case CEPH_MDS_OP_GETATTR:
1921 code = l_mdss_req_getattr_latency;
1922 break;
1923 case CEPH_MDS_OP_SETATTR:
1924 code = l_mdss_req_setattr_latency;
1925 break;
1926 case CEPH_MDS_OP_SETLAYOUT:
1927 code = l_mdss_req_setlayout_latency;
1928 break;
1929 case CEPH_MDS_OP_SETDIRLAYOUT:
1930 code = l_mdss_req_setdirlayout_latency;
1931 break;
1932 case CEPH_MDS_OP_SETXATTR:
1933 code = l_mdss_req_setxattr_latency;
1934 break;
1935 case CEPH_MDS_OP_RMXATTR:
1936 code = l_mdss_req_rmxattr_latency;
1937 break;
1938 case CEPH_MDS_OP_READDIR:
1939 code = l_mdss_req_readdir_latency;
1940 break;
1941 case CEPH_MDS_OP_SETFILELOCK:
1942 code = l_mdss_req_setfilelock_latency;
1943 break;
1944 case CEPH_MDS_OP_GETFILELOCK:
1945 code = l_mdss_req_getfilelock_latency;
1946 break;
1947 case CEPH_MDS_OP_CREATE:
1948 code = l_mdss_req_create_latency;
1949 break;
1950 case CEPH_MDS_OP_OPEN:
1951 code = l_mdss_req_open_latency;
1952 break;
1953 case CEPH_MDS_OP_MKNOD:
1954 code = l_mdss_req_mknod_latency;
1955 break;
1956 case CEPH_MDS_OP_LINK:
1957 code = l_mdss_req_link_latency;
1958 break;
1959 case CEPH_MDS_OP_UNLINK:
1960 code = l_mdss_req_unlink_latency;
1961 break;
1962 case CEPH_MDS_OP_RMDIR:
1963 code = l_mdss_req_rmdir_latency;
1964 break;
1965 case CEPH_MDS_OP_RENAME:
1966 code = l_mdss_req_rename_latency;
1967 break;
1968 case CEPH_MDS_OP_MKDIR:
1969 code = l_mdss_req_mkdir_latency;
1970 break;
1971 case CEPH_MDS_OP_SYMLINK:
1972 code = l_mdss_req_symlink_latency;
1973 break;
1974 case CEPH_MDS_OP_LSSNAP:
1975 code = l_mdss_req_lssnap_latency;
1976 break;
1977 case CEPH_MDS_OP_MKSNAP:
1978 code = l_mdss_req_mksnap_latency;
1979 break;
1980 case CEPH_MDS_OP_RMSNAP:
1981 code = l_mdss_req_rmsnap_latency;
1982 break;
1983 case CEPH_MDS_OP_RENAMESNAP:
1984 code = l_mdss_req_renamesnap_latency;
1985 break;
1986 default: ceph_abort();
1987 }
1988 logger->tinc(code, lat);
1989 }
1990
1991 void Server::early_reply(MDRequestRef& mdr, CInode *tracei, CDentry *tracedn)
1992 {
1993 if (!g_conf()->mds_early_reply)
1994 return;
1995
1996 if (mdr->no_early_reply) {
1997 dout(10) << "early_reply - flag no_early_reply is set, not allowed." << dendl;
1998 return;
1999 }
2000
2001 if (mdr->has_more() && mdr->more()->has_journaled_slaves) {
2002 dout(10) << "early_reply - there are journaled slaves, not allowed." << dendl;
2003 return;
2004 }
2005
2006 if (mdr->alloc_ino) {
2007 dout(10) << "early_reply - allocated ino, not allowed" << dendl;
2008 return;
2009 }
2010
2011 const cref_t<MClientRequest> &req = mdr->client_request;
2012 entity_inst_t client_inst = req->get_source_inst();
2013 if (client_inst.name.is_mds())
2014 return;
2015
2016 if (req->is_replay()) {
2017 dout(10) << " no early reply on replay op" << dendl;
2018 return;
2019 }
2020
2021
2022 auto reply = make_message<MClientReply>(*req, 0);
2023 reply->set_unsafe();
2024
2025 // mark xlocks "done", indicating that we are exposing uncommitted changes.
2026 //
2027 // _rename_finish() does not send dentry link/unlink messages to replicas,
2028 // so do not mark xlocks on dentries "done"; those xlocks prevent dentries
2029 // that have projected linkages from getting new replicas.
2030 mds->locker->set_xlocks_done(mdr.get(), req->get_op() == CEPH_MDS_OP_RENAME);
2031
2032 dout(10) << "early_reply " << reply->get_result()
2033 << " (" << cpp_strerror(reply->get_result())
2034 << ") " << *req << dendl;
2035
2036 if (tracei || tracedn) {
2037 if (tracei)
2038 mdr->cap_releases.erase(tracei->vino());
2039 if (tracedn)
2040 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2041
2042 set_trace_dist(reply, tracei, tracedn, mdr);
2043 }
2044
2045 reply->set_extra_bl(mdr->reply_extra_bl);
2046 mds->send_message_client(reply, mdr->session);
2047
2048 mdr->did_early_reply = true;
2049
2050 mds->logger->inc(l_mds_reply);
2051 utime_t lat = ceph_clock_now() - req->get_recv_stamp();
2052 mds->logger->tinc(l_mds_reply_latency, lat);
2053 if (client_inst.name.is_client()) {
2054 mds->sessionmap.hit_session(mdr->session);
2055 }
2056 perf_gather_op_latency(req, lat);
2057 dout(20) << "lat " << lat << dendl;
2058
2059 mdr->mark_event("early_replied");
2060 }
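// Reply-sequencing sketch (a summary of the contract implied above, not new
// behavior): the early reply carries set_unsafe(), so the client treats the
// result as provisional and retains the request; once the update is
// journaled, reply_client_request() below sends the definitive (safe) reply.
//
//   client request --> [unsafe reply]   result usable, request kept around
//                  --> [safe reply]     request may be forgotten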
2061
2062 /*
2063 * send given reply
2064 * include a trace to tracei
2065 * clean up mdr
2066 */
2067 void Server::reply_client_request(MDRequestRef& mdr, const ref_t<MClientReply> &reply)
2068 {
2069 ceph_assert(mdr.get());
2070 const cref_t<MClientRequest> &req = mdr->client_request;
2071
2072 dout(7) << "reply_client_request " << reply->get_result()
2073 << " (" << cpp_strerror(reply->get_result())
2074 << ") " << *req << dendl;
2075
2076 mdr->mark_event("replying");
2077
2078 Session *session = mdr->session;
2079
2080 // note successful request in session map?
2081 //
2082 // setfilelock requests are special: they only modify state in MDS memory,
2083 // and that state is lost when the MDS fails. If a client re-sends a
2084 // completed setfilelock request, it means the client did not receive the
2085 // corresponding reply, so the MDS should re-execute the request.
2086 if (req->may_write() && req->get_op() != CEPH_MDS_OP_SETFILELOCK &&
2087 reply->get_result() == 0 && session) {
2088 inodeno_t created = mdr->alloc_ino ? mdr->alloc_ino : mdr->used_prealloc_ino;
2089 session->add_completed_request(mdr->reqid.tid, created);
2090 if (mdr->ls) {
2091 mdr->ls->touched_sessions.insert(session->info.inst.name);
2092 }
2093 }
2094
2095 // give any preallocated inos to the session
2096 apply_allocated_inos(mdr, session);
2097
2098 // get tracei/tracedn from mdr?
2099 CInode *tracei = mdr->tracei;
2100 CDentry *tracedn = mdr->tracedn;
2101
2102 bool is_replay = mdr->client_request->is_replay();
2103 bool did_early_reply = mdr->did_early_reply;
2104 entity_inst_t client_inst = req->get_source_inst();
2105
2106 if (!did_early_reply && !is_replay) {
2107
2108 mds->logger->inc(l_mds_reply);
2109 utime_t lat = ceph_clock_now() - mdr->client_request->get_recv_stamp();
2110 mds->logger->tinc(l_mds_reply_latency, lat);
2111 if (session && client_inst.name.is_client()) {
2112 mds->sessionmap.hit_session(session);
2113 }
2114 perf_gather_op_latency(req, lat);
2115 dout(20) << "lat " << lat << dendl;
2116
2117 if (tracei)
2118 mdr->cap_releases.erase(tracei->vino());
2119 if (tracedn)
2120 mdr->cap_releases.erase(tracedn->get_dir()->get_inode()->vino());
2121 }
2122
2123 // drop non-rdlocks before replying, so that we can issue leases
2124 mdcache->request_drop_non_rdlocks(mdr);
2125
2126 // reply at all?
2127 if (session && !client_inst.name.is_mds()) {
2128 // send reply.
2129 if (!did_early_reply && // don't issue leases if we sent an earlier reply already
2130 (tracei || tracedn)) {
2131 if (is_replay) {
2132 if (tracei)
2133 mdcache->try_reconnect_cap(tracei, session);
2134 } else {
2135 // include metadata in reply
2136 set_trace_dist(reply, tracei, tracedn, mdr);
2137 }
2138 }
2139
2140 // We can set the extra bl unconditionally: if it's already been sent in the
2141 // early_reply, set_extra_bl will have claimed it and reply_extra_bl is empty
2142 reply->set_extra_bl(mdr->reply_extra_bl);
2143
2144 reply->set_mdsmap_epoch(mds->mdsmap->get_epoch());
2145 mds->send_message_client(reply, session);
2146 }
2147
2148 if (req->is_queued_for_replay() &&
2149 (mdr->has_completed || reply->get_result() < 0)) {
2150 if (reply->get_result() < 0) {
2151 int r = reply->get_result();
2152 derr << "reply_client_request: failed to replay " << *req
2153 << " error " << r << " (" << cpp_strerror(r) << ")" << dendl;
2154 mds->clog->warn() << "failed to replay " << req->get_reqid() << " error " << r;
2155 }
2156 mds->queue_one_replay();
2157 }
2158
2159 // clean up request
2160 mdcache->request_finish(mdr);
2161
2162 // take a closer look at tracei, if it happens to be a remote link
2163 if (tracei &&
2164 tracedn &&
2165 tracedn->get_projected_linkage()->is_remote()) {
2166 mdcache->eval_remote(tracedn);
2167 }
2168 }
2169
2170 /*
2171 * pass inode OR dentry (not both, or we may get confused)
2172 *
2173 * trace is in reverse order (i.e. root inode comes last)
2174 */
2175 void Server::set_trace_dist(const ref_t<MClientReply> &reply,
2176 CInode *in, CDentry *dn,
2177 MDRequestRef& mdr)
2178 {
2179 // skip doing this for debugging purposes?
2180 if (g_conf()->mds_inject_traceless_reply_probability &&
2181 mdr->ls && !mdr->o_trunc &&
2182 (rand() % 10000 < g_conf()->mds_inject_traceless_reply_probability * 10000.0)) {
2183 dout(5) << "deliberately skipping trace for " << *reply << dendl;
2184 return;
2185 }
2186
2187 // inode, dentry, dir, ..., inode
2188 bufferlist bl;
2189 mds_rank_t whoami = mds->get_nodeid();
2190 Session *session = mdr->session;
2191 snapid_t snapid = mdr->snapid;
2192 utime_t now = ceph_clock_now();
2193
2194 dout(20) << "set_trace_dist snapid " << snapid << dendl;
2195
2196 // realm
2197 if (snapid == CEPH_NOSNAP) {
2198 SnapRealm *realm;
2199 if (in)
2200 realm = in->find_snaprealm();
2201 else
2202 realm = dn->get_dir()->get_inode()->find_snaprealm();
2203 reply->snapbl = realm->get_snap_trace();
2204 dout(10) << "set_trace_dist snaprealm " << *realm << " len=" << reply->snapbl.length() << dendl;
2205 }
2206
2207 // dir + dentry?
2208 if (dn) {
2209 reply->head.is_dentry = 1;
2210 CDir *dir = dn->get_dir();
2211 CInode *diri = dir->get_inode();
2212
2213 diri->encode_inodestat(bl, session, NULL, snapid);
2214 dout(20) << "set_trace_dist added diri " << *diri << dendl;
2215
2216 #ifdef MDS_VERIFY_FRAGSTAT
2217 if (dir->is_complete())
2218 dir->verify_fragstat();
2219 #endif
2220 DirStat ds;
2221 ds.frag = dir->get_frag();
2222 ds.auth = dir->get_dir_auth().first;
2223 if (dir->is_auth() && !mdcache->forward_all_reqs_to_auth())
2224 dir->get_dist_spec(ds.dist, whoami);
2225
2226 dir->encode_dirstat(bl, session->info, ds);
2227 dout(20) << "set_trace_dist added dir " << *dir << dendl;
2228
2229 encode(dn->get_name(), bl);
2230
2231 int lease_mask = 0;
2232 CDentry::linkage_t *dnl = dn->get_linkage(mdr->get_client(), mdr);
2233 if (dnl->is_primary()) {
2234 ceph_assert(dnl->get_inode() == in);
2235 lease_mask = CEPH_LEASE_PRIMARY_LINK;
2236 } else {
2237 if (dnl->is_remote())
2238 ceph_assert(dnl->get_remote_ino() == in->ino());
2239 else
2240 ceph_assert(!in);
2241 }
2242 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, bl);
2243 dout(20) << "set_trace_dist added dn " << snapid << " " << *dn << dendl;
2244 } else
2245 reply->head.is_dentry = 0;
2246
2247 // inode
2248 if (in) {
2249 in->encode_inodestat(bl, session, NULL, snapid, 0, mdr->getattr_caps);
2250 dout(20) << "set_trace_dist added in " << *in << dendl;
2251 reply->head.is_target = 1;
2252 } else
2253 reply->head.is_target = 0;
2254
2255 reply->set_trace(bl);
2256 }
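// Trace layout produced above (a sketch; the authoritative decoder lives in
// the client code): for a reply carrying both a dentry and a target inode,
// bl is, in order --
//
//   diri inodestat | dirstat | dentry name | lease | target inodestat
//
// with head.is_dentry / head.is_target flagging which parts are present, and
// the snap trace travelling separately in reply->snapbl.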
2257
2258 void Server::handle_client_request(const cref_t<MClientRequest> &req)
2259 {
2260 dout(4) << "handle_client_request " << *req << dendl;
2261
2262 if (mds->logger)
2263 mds->logger->inc(l_mds_request);
2264 if (logger)
2265 logger->inc(l_mdss_handle_client_request);
2266
2267 if (!mdcache->is_open()) {
2268 dout(5) << "waiting for root" << dendl;
2269 mdcache->wait_for_open(new C_MDS_RetryMessage(mds, req));
2270 return;
2271 }
2272
2273 bool sessionclosed_isok = replay_unsafe_with_closed_session;
2274 // active session?
2275 Session *session = 0;
2276 if (req->get_source().is_client()) {
2277 session = mds->get_session(req);
2278 if (!session) {
2279 dout(5) << "no session for " << req->get_source() << ", dropping" << dendl;
2280 } else if ((session->is_closed() && (!mds->is_clientreplay() || !sessionclosed_isok)) ||
2281 session->is_closing() ||
2282 session->is_killing()) {
2283 dout(5) << "session closed|closing|killing, dropping" << dendl;
2284 session = NULL;
2285 }
2286 if (!session) {
2287 if (req->is_queued_for_replay())
2288 mds->queue_one_replay();
2289 return;
2290 }
2291 }
2292
2293 // old mdsmap?
2294 if (req->get_mdsmap_epoch() < mds->mdsmap->get_epoch()) {
2295 // send it? hrm, this isn't ideal; they may get a lot of copies if
2296 // they have a high request rate.
2297 }
2298
2299 // completed request?
2300 bool has_completed = false;
2301 if (req->is_replay() || req->get_retry_attempt()) {
2302 ceph_assert(session);
2303 inodeno_t created;
2304 if (session->have_completed_request(req->get_reqid().tid, &created)) {
2305 has_completed = true;
2306 if (!session->is_open())
2307 return;
2308 // Don't send a traceless reply if the completed request created a
2309 // new inode. Treat the request as a lookup request instead.
2310 if (req->is_replay() ||
2311 ((created == inodeno_t() || !mds->is_clientreplay()) &&
2312 req->get_op() != CEPH_MDS_OP_OPEN &&
2313 req->get_op() != CEPH_MDS_OP_CREATE)) {
2314 dout(5) << "already completed " << req->get_reqid() << dendl;
2315 auto reply = make_message<MClientReply>(*req, 0);
2316 if (created != inodeno_t()) {
2317 bufferlist extra;
2318 encode(created, extra);
2319 reply->set_extra_bl(extra);
2320 }
2321 mds->send_message_client(reply, session);
2322
2323 if (req->is_queued_for_replay())
2324 mds->queue_one_replay();
2325
2326 return;
2327 }
2328 if (req->get_op() != CEPH_MDS_OP_OPEN &&
2329 req->get_op() != CEPH_MDS_OP_CREATE) {
2330 dout(10) << " completed request which created new inode " << created
2331 << ", convert it to lookup request" << dendl;
2332 req->head.op = req->get_dentry_wanted() ? CEPH_MDS_OP_LOOKUP : CEPH_MDS_OP_GETATTR;
2333 req->head.args.getattr.mask = CEPH_STAT_CAP_INODE_ALL;
2334 }
2335 }
2336 }
2337
2338 // trim completed_request list
2339 if (req->get_oldest_client_tid() > 0) {
2340 dout(15) << " oldest_client_tid=" << req->get_oldest_client_tid() << dendl;
2341 ceph_assert(session);
2342 if (session->trim_completed_requests(req->get_oldest_client_tid())) {
2343 // The session's 'completed_requests' was dirtied; mark it to be
2344 // potentially flushed at segment expiry.
2345 mdlog->get_current_segment()->touched_sessions.insert(session->info.inst.name);
2346
2347 if (session->get_num_trim_requests_warnings() > 0 &&
2348 session->get_num_completed_requests() * 2 < g_conf()->mds_max_completed_requests)
2349 session->reset_num_trim_requests_warnings();
2350 } else {
2351 if (session->get_num_completed_requests() >=
2352 (g_conf()->mds_max_completed_requests << session->get_num_trim_requests_warnings())) {
2353 session->inc_num_trim_requests_warnings();
2354 stringstream ss;
2355 ss << "client." << session->get_client() << " does not advance its oldest_client_tid ("
2356 << req->get_oldest_client_tid() << "), "
2357 << session->get_num_completed_requests()
2358 << " completed requests recorded in session\n";
2359 mds->clog->warn() << ss.str();
2360 dout(20) << __func__ << " " << ss.str() << dendl;
2361 }
2362 }
2363 }
2364
2365 // register + dispatch
2366 MDRequestRef mdr = mdcache->request_start(req);
2367 if (!mdr.get())
2368 return;
2369
2370 if (session) {
2371 mdr->session = session;
2372 session->requests.push_back(&mdr->item_session_request);
2373 }
2374
2375 if (has_completed)
2376 mdr->has_completed = true;
2377
2378 // process embedded cap releases?
2379 // (only if NOT replay!)
2380 if (!req->releases.empty() && req->get_source().is_client() && !req->is_replay()) {
2381 client_t client = req->get_source().num();
2382 for (const auto &r : req->releases) {
2383 mds->locker->process_request_cap_release(mdr, client, r.item, r.dname);
2384 }
2385 req->releases.clear();
2386 }
2387
2388 dispatch_client_request(mdr);
2389 return;
2390 }
2391
2392 void Server::handle_osd_map()
2393 {
2394 /* Note that we check the OSDMAP_FULL flag directly rather than
2395 * using osdmap_full_flag(), because we want to know "is the flag set"
2396 * rather than "does the flag apply to us?" */
2397 mds->objecter->with_osdmap([this](const OSDMap& o) {
2398 auto pi = o.get_pg_pool(mds->mdsmap->get_metadata_pool());
2399 is_full = pi && pi->has_flag(pg_pool_t::FLAG_FULL);
2400 dout(7) << __func__ << ": full = " << is_full << " epoch = "
2401 << o.get_epoch() << dendl;
2402 });
2403 }
2404
2405 void Server::clear_batch_ops(const MDRequestRef& mdr)
2406 {
2407 int mask = mdr->client_request->head.args.getattr.mask;
2408 if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR && mdr->in[0]) {
2409 mdr->in[0]->batch_ops.erase(mask);
2410 } else if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP && mdr->dn[0].size()) {
2411 mdr->dn[0].back()->batch_ops.erase(mask);
2412 }
2413 }
2414
2415 void Server::dispatch_client_request(MDRequestRef& mdr)
2416 {
2417 // we shouldn't be waiting on anyone.
2418 ceph_assert(!mdr->has_more() || mdr->more()->waiting_on_slave.empty());
2419
2420 if (mdr->killed) {
2421 dout(10) << "request " << *mdr << " was killed" << dendl;
2422 // if the mdr is a "batch_op" and it has followers, pick a follower as
2423 // the new "head of the batch ops" and continue processing with it.
2424 if (mdr->is_batch_op() && mdr->is_batch_head ) {
2425 if (!mdr->batch_reqs.empty()) {
2426 MDRequestRef new_batch_head;
2427 for (auto itr = mdr->batch_reqs.cbegin(); itr != mdr->batch_reqs.cend();) {
2428 auto req = *itr;
2429 itr = mdr->batch_reqs.erase(itr);
2430 if (!req->killed) {
2431 new_batch_head = req;
2432 break;
2433 }
2434 }
2435
2436 if (!new_batch_head) {
2437 clear_batch_ops(mdr);
2438 return;
2439 }
2440
2441 new_batch_head->batch_reqs = std::move(mdr->batch_reqs);
2442
2443 mdr = new_batch_head;
2444 mdr->is_batch_head = true;
2445 int mask = mdr->client_request->head.args.getattr.mask;
2446 if (mdr->client_request->get_op() == CEPH_MDS_OP_GETATTR) {
2447 auto& fin = mdr->in[0]->batch_ops[mask];
2448 fin->set_request(new_batch_head);
2449 } else if (mdr->client_request->get_op() == CEPH_MDS_OP_LOOKUP) {
2450 auto& fin = mdr->dn[0].back()->batch_ops[mask];
2451 fin->set_request(new_batch_head);
2452 }
2453 } else {
2454 clear_batch_ops(mdr);
2455 return;
2456 }
2457 } else {
2458 return;
2459 }
2460 } else if (mdr->aborted) {
2461 mdr->aborted = false;
2462 mdcache->request_kill(mdr);
2463 return;
2464 }
2465
2466 const cref_t<MClientRequest> &req = mdr->client_request;
2467
2468 if (logger) logger->inc(l_mdss_dispatch_client_request);
2469
2470 dout(7) << "dispatch_client_request " << *req << dendl;
2471
2472 if (req->may_write() && mdcache->is_readonly()) {
2473 dout(10) << " read-only FS" << dendl;
2474 respond_to_request(mdr, -EROFS);
2475 return;
2476 }
2477 if (mdr->has_more() && mdr->more()->slave_error) {
2478 dout(10) << " got error from slaves" << dendl;
2479 respond_to_request(mdr, mdr->more()->slave_error);
2480 return;
2481 }
2482
2483 if (is_full) {
2484 if (req->get_op() == CEPH_MDS_OP_SETLAYOUT ||
2485 req->get_op() == CEPH_MDS_OP_SETDIRLAYOUT ||
2487 req->get_op() == CEPH_MDS_OP_RMXATTR ||
2488 req->get_op() == CEPH_MDS_OP_SETXATTR ||
2489 req->get_op() == CEPH_MDS_OP_CREATE ||
2490 req->get_op() == CEPH_MDS_OP_SYMLINK ||
2491 req->get_op() == CEPH_MDS_OP_MKSNAP ||
2492 ((req->get_op() == CEPH_MDS_OP_LINK ||
2493 req->get_op() == CEPH_MDS_OP_RENAME) &&
2494 (!mdr->has_more() || mdr->more()->witnessed.empty())) // haven't started slave request
2495 ) {
2496
2497 dout(20) << __func__ << ": full, responding ENOSPC to op " << ceph_mds_op_name(req->get_op()) << dendl;
2498 respond_to_request(mdr, -ENOSPC);
2499 return;
2500 } else {
2501 dout(20) << __func__ << ": full, permitting op " << ceph_mds_op_name(req->get_op()) << dendl;
2502 }
2503 }
2504
2505 switch (req->get_op()) {
2506 case CEPH_MDS_OP_LOOKUPHASH:
2507 case CEPH_MDS_OP_LOOKUPINO:
2508 handle_client_lookup_ino(mdr, false, false);
2509 break;
2510 case CEPH_MDS_OP_LOOKUPPARENT:
2511 handle_client_lookup_ino(mdr, true, false);
2512 break;
2513 case CEPH_MDS_OP_LOOKUPNAME:
2514 handle_client_lookup_ino(mdr, false, true);
2515 break;
2516
2517 // inodes ops.
2518 case CEPH_MDS_OP_LOOKUP:
2519 handle_client_getattr(mdr, true);
2520 break;
2521
2522 case CEPH_MDS_OP_LOOKUPSNAP:
2523 // lookupsnap does not reference a CDentry; treat it as a getattr
2524 case CEPH_MDS_OP_GETATTR:
2525 handle_client_getattr(mdr, false);
2526 break;
2527
2528 case CEPH_MDS_OP_SETATTR:
2529 handle_client_setattr(mdr);
2530 break;
2531 case CEPH_MDS_OP_SETLAYOUT:
2532 handle_client_setlayout(mdr);
2533 break;
2534 case CEPH_MDS_OP_SETDIRLAYOUT:
2535 handle_client_setdirlayout(mdr);
2536 break;
2537 case CEPH_MDS_OP_SETXATTR:
2538 handle_client_setxattr(mdr);
2539 break;
2540 case CEPH_MDS_OP_RMXATTR:
2541 handle_client_removexattr(mdr);
2542 break;
2543
2544 case CEPH_MDS_OP_READDIR:
2545 handle_client_readdir(mdr);
2546 break;
2547
2548 case CEPH_MDS_OP_SETFILELOCK:
2549 handle_client_file_setlock(mdr);
2550 break;
2551
2552 case CEPH_MDS_OP_GETFILELOCK:
2553 handle_client_file_readlock(mdr);
2554 break;
2555
2556 // funky.
2557 case CEPH_MDS_OP_CREATE:
2558 if (mdr->has_completed)
2559 handle_client_open(mdr); // already created.. just open
2560 else
2561 handle_client_openc(mdr);
2562 break;
2563
2564 case CEPH_MDS_OP_OPEN:
2565 handle_client_open(mdr);
2566 break;
2567
2568 // namespace.
2569 // no prior locks.
2570 case CEPH_MDS_OP_MKNOD:
2571 handle_client_mknod(mdr);
2572 break;
2573 case CEPH_MDS_OP_LINK:
2574 handle_client_link(mdr);
2575 break;
2576 case CEPH_MDS_OP_UNLINK:
2577 case CEPH_MDS_OP_RMDIR:
2578 handle_client_unlink(mdr);
2579 break;
2580 case CEPH_MDS_OP_RENAME:
2581 handle_client_rename(mdr);
2582 break;
2583 case CEPH_MDS_OP_MKDIR:
2584 handle_client_mkdir(mdr);
2585 break;
2586 case CEPH_MDS_OP_SYMLINK:
2587 handle_client_symlink(mdr);
2588 break;
2589
2590
2591 // snaps
2592 case CEPH_MDS_OP_LSSNAP:
2593 handle_client_lssnap(mdr);
2594 break;
2595 case CEPH_MDS_OP_MKSNAP:
2596 handle_client_mksnap(mdr);
2597 break;
2598 case CEPH_MDS_OP_RMSNAP:
2599 handle_client_rmsnap(mdr);
2600 break;
2601 case CEPH_MDS_OP_RENAMESNAP:
2602 handle_client_renamesnap(mdr);
2603 break;
2604
2605 default:
2606 dout(1) << " unknown client op " << req->get_op() << dendl;
2607 respond_to_request(mdr, -EOPNOTSUPP);
2608 }
2609 }
2610
2611
2612 // ---------------------------------------
2613 // SLAVE REQUESTS
2614
2615 void Server::handle_slave_request(const cref_t<MMDSSlaveRequest> &m)
2616 {
2617 dout(4) << "handle_slave_request " << m->get_reqid() << " from " << m->get_source() << dendl;
2618 mds_rank_t from = mds_rank_t(m->get_source().num());
2619
2620 if (logger) logger->inc(l_mdss_handle_slave_request);
2621
2622 // reply?
2623 if (m->is_reply())
2624 return handle_slave_request_reply(m);
2625
2626 // the purpose of rename notify is to enforce causal message ordering, making
2627 // sure bystanders have received all messages from the rename srcdn's auth MDS.
2628 if (m->get_op() == MMDSSlaveRequest::OP_RENAMENOTIFY) {
2629 auto reply = make_message<MMDSSlaveRequest>(m->get_reqid(), m->get_attempt(), MMDSSlaveRequest::OP_RENAMENOTIFYACK);
2630 mds->send_message(reply, m->get_connection());
2631 return;
2632 }
2633
2634 CDentry *straydn = NULL;
2635 if (m->straybl.length() > 0) {
2636 mdcache->decode_replica_stray(straydn, m->straybl, from);
2637 ceph_assert(straydn);
2638 m->straybl.clear();
2639 }
2640
2641 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2642 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2643 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2644 return;
2645 }
2646
2647 // am i a new slave?
2648 MDRequestRef mdr;
2649 if (mdcache->have_request(m->get_reqid())) {
2650 // existing?
2651 mdr = mdcache->request_get(m->get_reqid());
2652
2653 // is my request newer?
2654 if (mdr->attempt > m->get_attempt()) {
2655 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " > " << m->get_attempt()
2656 << ", dropping " << *m << dendl;
2657 return;
2658 }
2659
2660 if (mdr->attempt < m->get_attempt()) {
2661 // mine is old, close it out
2662 dout(10) << "local request " << *mdr << " attempt " << mdr->attempt << " < " << m->get_attempt()
2663 << ", closing out" << dendl;
2664 mdcache->request_finish(mdr);
2665 mdr.reset();
2666 } else if (mdr->slave_to_mds != from) {
2667 dout(10) << "local request " << *mdr << " not slave to mds." << from << dendl;
2668 return;
2669 }
2670
2671 // may get these while mdr->slave_request is non-null
2672 if (m->get_op() == MMDSSlaveRequest::OP_DROPLOCKS) {
2673 mds->locker->drop_locks(mdr.get());
2674 return;
2675 }
2676 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
2677 if (m->is_abort()) {
2678 mdr->aborted = true;
2679 if (mdr->slave_request) {
2680 // only abort on-going xlock, wrlock and auth pin
2681 ceph_assert(!mdr->slave_did_prepare());
2682 } else {
2683 mdcache->request_finish(mdr);
2684 }
2685 } else {
2686 if (m->inode_export.length() > 0)
2687 mdr->more()->inode_import = m->inode_export;
2688 // finish off request.
2689 mdcache->request_finish(mdr);
2690 }
2691 return;
2692 }
2693 }
2694 if (!mdr.get()) {
2695 // new?
2696 if (m->get_op() == MMDSSlaveRequest::OP_FINISH) {
2697 dout(10) << "missing slave request for " << m->get_reqid()
2698 << " OP_FINISH, must have lost race with a forward" << dendl;
2699 return;
2700 }
2701 mdr = mdcache->request_start_slave(m->get_reqid(), m->get_attempt(), m);
2702 mdr->set_op_stamp(m->op_stamp);
2703 }
2704 ceph_assert(mdr->slave_request == 0); // only one at a time, please!
2705
2706 if (straydn) {
2707 mdr->pin(straydn);
2708 mdr->straydn = straydn;
2709 }
2710
2711 if (mds->is_clientreplay() && !mds->mdsmap->is_clientreplay(from) &&
2712 mdr->locks.empty()) {
2713 dout(3) << "not active yet, waiting" << dendl;
2714 mds->wait_for_active(new C_MDS_RetryMessage(mds, m));
2715 return;
2716 }
2717
2718 mdr->reset_slave_request(m);
2719
2720 dispatch_slave_request(mdr);
2721 }
2722
2723 void Server::handle_slave_request_reply(const cref_t<MMDSSlaveRequest> &m)
2724 {
2725 mds_rank_t from = mds_rank_t(m->get_source().num());
2726
2727 if (!mds->is_clientreplay() && !mds->is_active() && !mds->is_stopping()) {
2728 metareqid_t r = m->get_reqid();
2729 if (!mdcache->have_uncommitted_master(r, from)) {
2730 dout(10) << "handle_slave_request_reply ignoring slave reply from mds."
2731 << from << " reqid " << r << dendl;
2732 return;
2733 }
2734 dout(3) << "not clientreplay|active yet, waiting" << dendl;
2735 mds->wait_for_replay(new C_MDS_RetryMessage(mds, m));
2736 return;
2737 }
2738
2739 if (m->get_op() == MMDSSlaveRequest::OP_COMMITTED) {
2740 metareqid_t r = m->get_reqid();
2741 mdcache->committed_master_slave(r, from);
2742 return;
2743 }
2744
2745 MDRequestRef mdr = mdcache->request_get(m->get_reqid());
2746 if (m->get_attempt() != mdr->attempt) {
2747 dout(10) << "handle_slave_request_reply " << *mdr << " ignoring reply from other attempt "
2748 << m->get_attempt() << dendl;
2749 return;
2750 }
2751
2752 switch (m->get_op()) {
2753 case MMDSSlaveRequest::OP_XLOCKACK:
2754 {
2755 // identify lock, master request
2756 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2757 m->get_object_info());
2758 mdr->more()->slaves.insert(from);
2759 lock->decode_locked_state(m->get_lock_data());
2760 dout(10) << "got remote xlock on " << *lock << " on " << *lock->get_parent() << dendl;
2761 mdr->emplace_lock(lock, MutationImpl::LockOp::XLOCK);
2762 mdr->finish_locking(lock);
2763 lock->get_xlock(mdr, mdr->get_client());
2764
2765 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2766 mdr->more()->waiting_on_slave.erase(from);
2767 ceph_assert(mdr->more()->waiting_on_slave.empty());
2768 mdcache->dispatch_request(mdr);
2769 }
2770 break;
2771
2772 case MMDSSlaveRequest::OP_WRLOCKACK:
2773 {
2774 // identify lock, master request
2775 SimpleLock *lock = mds->locker->get_lock(m->get_lock_type(),
2776 m->get_object_info());
2777 mdr->more()->slaves.insert(from);
2778 dout(10) << "got remote wrlock on " << *lock << " on " << *lock->get_parent() << dendl;
2779 auto it = mdr->emplace_lock(lock, MutationImpl::LockOp::REMOTE_WRLOCK, from);
2780 ceph_assert(it->is_remote_wrlock());
2781 ceph_assert(it->wrlock_target == from);
2782
2783 mdr->finish_locking(lock);
2784
2785 ceph_assert(mdr->more()->waiting_on_slave.count(from));
2786 mdr->more()->waiting_on_slave.erase(from);
2787 ceph_assert(mdr->more()->waiting_on_slave.empty());
2788 mdcache->dispatch_request(mdr);
2789 }
2790 break;
2791
2792 case MMDSSlaveRequest::OP_AUTHPINACK:
2793 handle_slave_auth_pin_ack(mdr, m);
2794 break;
2795
2796 case MMDSSlaveRequest::OP_LINKPREPACK:
2797 handle_slave_link_prep_ack(mdr, m);
2798 break;
2799
2800 case MMDSSlaveRequest::OP_RMDIRPREPACK:
2801 handle_slave_rmdir_prep_ack(mdr, m);
2802 break;
2803
2804 case MMDSSlaveRequest::OP_RENAMEPREPACK:
2805 handle_slave_rename_prep_ack(mdr, m);
2806 break;
2807
2808 case MMDSSlaveRequest::OP_RENAMENOTIFYACK:
2809 handle_slave_rename_notify_ack(mdr, m);
2810 break;
2811
2812 default:
2813 ceph_abort();
2814 }
2815 }
2816
2817 void Server::dispatch_slave_request(MDRequestRef& mdr)
2818 {
2819 dout(7) << "dispatch_slave_request " << *mdr << " " << *mdr->slave_request << dendl;
2820
2821 if (mdr->aborted) {
2822 dout(7) << " abort flag set, finishing" << dendl;
2823 mdcache->request_finish(mdr);
2824 return;
2825 }
2826
2827 if (logger) logger->inc(l_mdss_dispatch_slave_request);
2828
2829 int op = mdr->slave_request->get_op();
2830 switch (op) {
2831 case MMDSSlaveRequest::OP_XLOCK:
2832 case MMDSSlaveRequest::OP_WRLOCK:
2833 {
2834 // identify object
2835 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2836 mdr->slave_request->get_object_info());
2837
2838 if (!lock) {
2839 dout(10) << "don't have object, dropping" << dendl;
2840 ceph_abort(); // can this happen if we auth pinned properly?
2841 }
2842 if (op == MMDSSlaveRequest::OP_XLOCK && !lock->get_parent()->is_auth()) {
2843 dout(10) << "not auth for remote xlock attempt, dropping on "
2844 << *lock << " on " << *lock->get_parent() << dendl;
2845 } else {
2846 // use acquire_locks so that we get auth_pinning.
2847 MutationImpl::LockOpVec lov;
2848 for (const auto& p : mdr->locks) {
2849 if (p.is_xlock())
2850 lov.add_xlock(p.lock);
2851 else if (p.is_wrlock())
2852 lov.add_wrlock(p.lock);
2853 }
2854
2855 int replycode = 0;
2856 switch (op) {
2857 case MMDSSlaveRequest::OP_XLOCK:
2858 lov.add_xlock(lock);
2859 replycode = MMDSSlaveRequest::OP_XLOCKACK;
2860 break;
2861 case MMDSSlaveRequest::OP_WRLOCK:
2862 lov.add_wrlock(lock);
2863 replycode = MMDSSlaveRequest::OP_WRLOCKACK;
2864 break;
2865 }
2866
2867 if (!mds->locker->acquire_locks(mdr, lov))
2868 return;
2869
2870 // ack
2871 auto r = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, replycode);
2872 r->set_lock_type(lock->get_type());
2873 lock->get_parent()->set_object_info(r->get_object_info());
2874 if (replycode == MMDSSlaveRequest::OP_XLOCKACK)
2875 lock->encode_locked_state(r->get_lock_data());
2876 mds->send_message(r, mdr->slave_request->get_connection());
2877 }
2878
2879 // done.
2880 mdr->reset_slave_request();
2881 }
2882 break;
2883
2884 case MMDSSlaveRequest::OP_UNXLOCK:
2885 case MMDSSlaveRequest::OP_UNWRLOCK:
2886 {
2887 SimpleLock *lock = mds->locker->get_lock(mdr->slave_request->get_lock_type(),
2888 mdr->slave_request->get_object_info());
2889 ceph_assert(lock);
2890 auto it = mdr->locks.find(lock);
2891 ceph_assert(it != mdr->locks.end());
2892 bool need_issue = false;
2893 switch (op) {
2894 case MMDSSlaveRequest::OP_UNXLOCK:
2895 mds->locker->xlock_finish(it, mdr.get(), &need_issue);
2896 break;
2897 case MMDSSlaveRequest::OP_UNWRLOCK:
2898 mds->locker->wrlock_finish(it, mdr.get(), &need_issue);
2899 break;
2900 }
2901 if (need_issue)
2902 mds->locker->issue_caps(static_cast<CInode*>(lock->get_parent()));
2903
2904 // done. no ack necessary.
2905 mdr->reset_slave_request();
2906 }
2907 break;
2908
2909 case MMDSSlaveRequest::OP_AUTHPIN:
2910 handle_slave_auth_pin(mdr);
2911 break;
2912
2913 case MMDSSlaveRequest::OP_LINKPREP:
2914 case MMDSSlaveRequest::OP_UNLINKPREP:
2915 handle_slave_link_prep(mdr);
2916 break;
2917
2918 case MMDSSlaveRequest::OP_RMDIRPREP:
2919 handle_slave_rmdir_prep(mdr);
2920 break;
2921
2922 case MMDSSlaveRequest::OP_RENAMEPREP:
2923 handle_slave_rename_prep(mdr);
2924 break;
2925
2926 default:
2927 ceph_abort();
2928 }
2929 }
2930
2931 void Server::handle_slave_auth_pin(MDRequestRef& mdr)
2932 {
2933 dout(10) << "handle_slave_auth_pin " << *mdr << dendl;
2934
2935 // build list of objects
2936 list<MDSCacheObject*> objects;
2937 CInode *auth_pin_freeze = NULL;
2938 bool nonblocking = mdr->slave_request->is_nonblocking();
2939 bool fail = false, wouldblock = false, readonly = false;
2940 ref_t<MMDSSlaveRequest> reply;
2941
2942 if (mdcache->is_readonly()) {
2943 dout(10) << " read-only FS" << dendl;
2944 readonly = true;
2945 fail = true;
2946 }
2947
2948 if (!fail) {
2949 for (const auto &oi : mdr->slave_request->get_authpins()) {
2950 MDSCacheObject *object = mdcache->get_object(oi);
2951 if (!object) {
2952 dout(10) << " don't have " << oi << dendl;
2953 fail = true;
2954 break;
2955 }
2956
2957 objects.push_back(object);
2958 if (oi == mdr->slave_request->get_authpin_freeze())
2959 auth_pin_freeze = static_cast<CInode*>(object);
2960 }
2961 }
2962
2963 // can we auth pin them?
2964 if (!fail) {
2965 for (const auto& obj : objects) {
2966 if (!obj->is_auth()) {
2967 dout(10) << " not auth for " << *obj << dendl;
2968 fail = true;
2969 break;
2970 }
2971 if (mdr->is_auth_pinned(obj))
2972 continue;
2973 if (!mdr->can_auth_pin(obj)) {
2974 if (nonblocking) {
2975 dout(10) << " can't auth_pin (freezing?) " << *obj << " nonblocking" << dendl;
2976 fail = true;
2977 wouldblock = true;
2978 break;
2979 }
2980 // wait
2981 dout(10) << " waiting for authpinnable on " << *obj << dendl;
2982 obj->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
2983 mdr->drop_local_auth_pins();
2984
2985 mds->locker->notify_freeze_waiter(obj);
2986 goto blocked;
2987 }
2988 }
2989 }
2990
2991 if (!fail) {
2992 /* we hold a frozen auth pin on the wrong inode; unfreeze it */
2993 if (mdr->has_more() && mdr->more()->is_freeze_authpin &&
2994 mdr->more()->rename_inode != auth_pin_freeze)
2995 mdr->unfreeze_auth_pin(true);
2996
2997 /* handle_slave_rename_prep() calls freeze_inode() to wait for all other
2998 * operations on the source inode to complete. This happens after all locks for
2999 * the rename operation are acquired. But to acquire locks, we need to auth pin
3000 * the locks' parent objects first. So there is an ABBA deadlock if someone auth
3001 * pins the source inode after the locks are acquired and before
3002 * Server::handle_slave_rename_prep() is called. The solution is to freeze the
3003 * inode, preventing other MDRequests from getting new auth pins.
3004 */
3005 if (auth_pin_freeze) {
3006 dout(10) << " freezing auth pin on " << *auth_pin_freeze << dendl;
3007 if (!mdr->freeze_auth_pin(auth_pin_freeze)) {
3008 auth_pin_freeze->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
3009 mds->mdlog->flush();
3010 goto blocked;
3011 }
3012 }
3013 }
3014
3015 reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
3016
3017 if (fail) {
3018 mdr->drop_local_auth_pins(); // just in case
3019 if (readonly)
3020 reply->mark_error_rofs();
3021 if (wouldblock)
3022 reply->mark_error_wouldblock();
3023 } else {
3024 // auth pin!
3025 for (const auto& obj : objects) {
3026 dout(10) << "auth_pinning " << *obj << dendl;
3027 mdr->auth_pin(obj);
3028 }
3029 // return list of my auth_pins (if any)
3030 for (const auto &p : mdr->object_states) {
3031 if (!p.second.auth_pinned)
3032 continue;
3033 MDSCacheObjectInfo info;
3034 p.first->set_object_info(info);
3035 reply->get_authpins().push_back(info);
3036 if (p.first == (MDSCacheObject*)auth_pin_freeze)
3037 auth_pin_freeze->set_object_info(reply->get_authpin_freeze());
3038 }
3039 }
3040
3041 mds->send_message_mds(reply, mdr->slave_to_mds);
3042
3043 // clean up this request
3044 mdr->reset_slave_request();
3045 return;
3046
3047 blocked:
3048 if (mdr->slave_request->should_notify_blocking()) {
3049 reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_AUTHPINACK);
3050 reply->mark_req_blocked();
3051 mds->send_message_mds(reply, mdr->slave_to_mds);
3052 mdr->slave_request->clear_notify_blocking();
3053 }
3054 return;
3055 }
3056
3057 void Server::handle_slave_auth_pin_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
3058 {
3059 dout(10) << "handle_slave_auth_pin_ack on " << *mdr << " " << *ack << dendl;
3060 mds_rank_t from = mds_rank_t(ack->get_source().num());
3061
3062 if (ack->is_req_blocked()) {
3063 mdr->disable_lock_cache();
3064 // slave auth pin is blocked, drop locks to avoid deadlock
3065 mds->locker->drop_locks(mdr.get(), nullptr);
3066 return;
3067 }
3068
3069 // added auth pins?
3070 set<MDSCacheObject*> pinned;
3071 for (const auto &oi : ack->get_authpins()) {
3072 MDSCacheObject *object = mdcache->get_object(oi);
3073 ceph_assert(object); // we pinned it
3074 dout(10) << " remote has pinned " << *object << dendl;
3075 mdr->set_remote_auth_pinned(object, from);
3076 if (oi == ack->get_authpin_freeze())
3077 mdr->set_remote_frozen_auth_pin(static_cast<CInode *>(object));
3078 pinned.insert(object);
3079 }
3080
3081 // removed frozen auth pin ?
3082 if (mdr->more()->is_remote_frozen_authpin &&
3083 ack->get_authpin_freeze() == MDSCacheObjectInfo()) {
3084 auto stat_p = mdr->find_object_state(mdr->more()->rename_inode);
3085 ceph_assert(stat_p);
3086 if (stat_p->remote_auth_pinned == from) {
3087 mdr->more()->is_remote_frozen_authpin = false;
3088 }
3089 }
3090
3091 // removed auth pins?
3092 for (auto& p : mdr->object_states) {
3093 if (p.second.remote_auth_pinned == MDS_RANK_NONE)
3094 continue;
3095 MDSCacheObject* object = p.first;
3096 if (p.second.remote_auth_pinned == from && pinned.count(object) == 0) {
3097 dout(10) << " remote has unpinned " << *object << dendl;
3098 mdr->_clear_remote_auth_pinned(p.second);
3099 }
3100 }
3101
3102 // note slave
3103 mdr->more()->slaves.insert(from);
3104
3105 // clear from waiting list
3106 auto ret = mdr->more()->waiting_on_slave.erase(from);
3107 ceph_assert(ret);
3108
3109 if (ack->is_error_rofs()) {
3110 mdr->more()->slave_error = -EROFS;
3111 } else if (ack->is_error_wouldblock()) {
3112 mdr->more()->slave_error = -EWOULDBLOCK;
3113 }
3114
3115 // go again?
3116 if (mdr->more()->waiting_on_slave.empty())
3117 mdcache->dispatch_request(mdr);
3118 else
3119 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
3120 }
3121
3122
3123 // ---------------------------------------
3124 // HELPERS
3125
3126
3127 /**
3128 * check whether we are permitted to complete a request
3129 *
3130 * Check whether we have permission to perform the operation specified
3131 * by mask on the given inode, based on the capability in the mdr's
3132 * session.
3133 */
3134 bool Server::check_access(MDRequestRef& mdr, CInode *in, unsigned mask)
3135 {
3136 if (mdr->session) {
3137 int r = mdr->session->check_access(
3138 in, mask,
3139 mdr->client_request->get_caller_uid(),
3140 mdr->client_request->get_caller_gid(),
3141 &mdr->client_request->get_caller_gid_list(),
3142 mdr->client_request->head.args.setattr.uid,
3143 mdr->client_request->head.args.setattr.gid);
3144 if (r < 0) {
3145 respond_to_request(mdr, r);
3146 return false;
3147 }
3148 }
3149 return true;
3150 }
3151
3152 /**
3153 * check whether fragment has reached maximum size
3154 *
3155 */
3156 bool Server::check_fragment_space(MDRequestRef &mdr, CDir *dir)
3157 {
3158 const auto size = dir->get_frag_size();
3159 if (size >= g_conf()->mds_bal_fragment_size_max) {
3160 dout(10) << "fragment " << *dir << " size exceeds " << g_conf()->mds_bal_fragment_size_max << " (ENOSPC)" << dendl;
3161 respond_to_request(mdr, -ENOSPC);
3162 return false;
3163 }
3164
3165 return true;
3166 }
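// Numeric sketch (the default is an assumption; check your build's option
// table): with mds_bal_fragment_size_max = 100000, a create into a dirfrag
// already holding 100000 dentries fails with ENOSPC here rather than growing
// the fragment further; splitting the fragment is the balancer's job.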
3167
3168 CDentry* Server::prepare_stray_dentry(MDRequestRef& mdr, CInode *in)
3169 {
3170 CDentry *straydn = mdr->straydn;
3171 if (straydn) {
3172 string straydname;
3173 in->name_stray_dentry(straydname);
3174 ceph_assert(straydn->get_name() == straydname);
3175 return straydn;
3176 }
3177
3178 CDir *straydir = mdcache->get_stray_dir(in);
3179
3180 if (!mdr->client_request->is_replay() &&
3181 !check_fragment_space(mdr, straydir))
3182 return NULL;
3183
3184 straydn = mdcache->get_or_create_stray_dentry(in);
3185 mdr->straydn = straydn;
3186 mdr->pin(straydn);
3187 return straydn;
3188 }
3189
3190 /** prepare_new_inode
3191 *
3192 * create a new inode. set c/m/atime. hit dir pop.
3193 */
3194 CInode* Server::prepare_new_inode(MDRequestRef& mdr, CDir *dir, inodeno_t useino, unsigned mode,
3195 file_layout_t *layout)
3196 {
3197 CInode *in = new CInode(mdcache);
3198
3199 // Server::prepare_force_open_sessions() can re-open a session in the closing
3200 // state. In that corner case, the session's prealloc_inos are being freed.
3201 // To simplify the code, we disallow using/refilling the session's prealloc_inos
3202 // while the session is opening.
3203 bool allow_prealloc_inos = mdr->session->is_open();
3204
3205 // assign ino
3206 if (allow_prealloc_inos && (mdr->used_prealloc_ino = in->inode.ino = mdr->session->take_ino(useino))) {
3207 mds->sessionmap.mark_projected(mdr->session);
3208 dout(10) << "prepare_new_inode used_prealloc " << mdr->used_prealloc_ino
3209 << " (" << mdr->session->info.prealloc_inos
3210 << ", " << mdr->session->info.prealloc_inos.size() << " left)"
3211 << dendl;
3212 } else {
3213 mdr->alloc_ino =
3214 in->inode.ino = mds->inotable->project_alloc_id(useino);
3215 dout(10) << "prepare_new_inode alloc " << mdr->alloc_ino << dendl;
3216 }
3217
3218 if (useino && useino != in->inode.ino) {
3219 dout(0) << "WARNING: client specified " << useino << " and I allocated " << in->inode.ino << dendl;
3220 mds->clog->error() << mdr->client_request->get_source()
3221 << " specified ino " << useino
3222 << " but mds." << mds->get_nodeid() << " allocated " << in->inode.ino;
3223 //ceph_abort(); // just for now.
3224 }
3225
3226 if (allow_prealloc_inos &&
3227 mdr->session->get_num_projected_prealloc_inos() < g_conf()->mds_client_prealloc_inos / 2) {
3228 int need = g_conf()->mds_client_prealloc_inos - mdr->session->get_num_projected_prealloc_inos();
3229 mds->inotable->project_alloc_ids(mdr->prealloc_inos, need);
3230 ceph_assert(mdr->prealloc_inos.size()); // or else fix projected increment semantics
3231 mdr->session->pending_prealloc_inos.insert(mdr->prealloc_inos);
3232 mds->sessionmap.mark_projected(mdr->session);
3233 dout(10) << "prepare_new_inode prealloc " << mdr->prealloc_inos << dendl;
3234 }
3235
3236 in->inode.version = 1;
3237 in->inode.xattr_version = 1;
3238 in->inode.nlink = 1; // FIXME
3239
3240 in->inode.mode = mode;
3241
3242 // FIPS zeroization audit 20191117: this memset is not security related.
3243 memset(&in->inode.dir_layout, 0, sizeof(in->inode.dir_layout));
3244 if (in->inode.is_dir()) {
3245 in->inode.dir_layout.dl_dir_hash = g_conf()->mds_default_dir_hash;
3246 } else if (layout) {
3247 in->inode.layout = *layout;
3248 } else {
3249 in->inode.layout = mdcache->default_file_layout;
3250 }
3251
3252 in->inode.truncate_size = -1ull; // not truncated, yet!
3253 in->inode.truncate_seq = 1; /* starting with 1, 0 is kept for no-truncation logic */
3254
3255 CInode *diri = dir->get_inode();
3256
3257 dout(10) << oct << " dir mode 0" << diri->inode.mode << " new mode 0" << mode << dec << dendl;
3258
3259 if (diri->inode.mode & S_ISGID) {
3260 dout(10) << " dir is setgid" << dendl;
3261 in->inode.gid = diri->inode.gid;
3262 if (S_ISDIR(mode)) {
3263 dout(10) << " new dir also setgid" << dendl;
3264 in->inode.mode |= S_ISGID;
3265 }
3266 } else
3267 in->inode.gid = mdr->client_request->get_caller_gid();
3268
3269 in->inode.uid = mdr->client_request->get_caller_uid();
3270
3271 in->inode.btime = in->inode.ctime = in->inode.mtime = in->inode.atime =
3272 mdr->get_op_stamp();
3273
3274 in->inode.change_attr = 0;
3275
3276 const cref_t<MClientRequest> &req = mdr->client_request;
3277 if (req->get_data().length()) {
3278 auto p = req->get_data().cbegin();
3279
3280 // xattrs on new inode?
3281 CInode::mempool_xattr_map xattrs;
3282 decode_noshare(xattrs, p);
3283 for (const auto &p : xattrs) {
3284 dout(10) << "prepare_new_inode setting xattr " << p.first << dendl;
3285 auto em = in->xattrs.emplace(std::piecewise_construct, std::forward_as_tuple(p.first), std::forward_as_tuple(p.second));
3286 if (!em.second)
3287 em.first->second = p.second;
3288 }
3289 }
3290
3291 if (!mds->mdsmap->get_inline_data_enabled() ||
3292 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA))
3293 in->inode.inline_data.version = CEPH_INLINE_NONE;
3294
3295 mdcache->add_inode(in); // add
3296 dout(10) << "prepare_new_inode " << *in << dendl;
3297 return in;
3298 }
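//
// Worked example of the refill branch above (assuming the default
// mds_client_prealloc_inos = 1000): once the session's projected pool
// drops below 1000/2 = 500, the MDS projects need = 1000 - remaining
// fresh ids from the inotable, topping the pool back up to 1000 in a
// single allocation.
//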
3299
3300 void Server::journal_allocated_inos(MDRequestRef& mdr, EMetaBlob *blob)
3301 {
3302 dout(20) << "journal_allocated_inos sessionmapv " << mds->sessionmap.get_projected()
3303 << " inotablev " << mds->inotable->get_projected_version()
3304 << dendl;
3305 blob->set_ino_alloc(mdr->alloc_ino,
3306 mdr->used_prealloc_ino,
3307 mdr->prealloc_inos,
3308 mdr->client_request->get_source(),
3309 mds->sessionmap.get_projected(),
3310 mds->inotable->get_projected_version());
3311 }
3312
3313 void Server::apply_allocated_inos(MDRequestRef& mdr, Session *session)
3314 {
3315 dout(10) << "apply_allocated_inos " << mdr->alloc_ino
3316 << " / " << mdr->prealloc_inos
3317 << " / " << mdr->used_prealloc_ino << dendl;
3318
3319 if (mdr->alloc_ino) {
3320 mds->inotable->apply_alloc_id(mdr->alloc_ino);
3321 }
3322 if (mdr->prealloc_inos.size()) {
3323 ceph_assert(session);
3324 session->pending_prealloc_inos.subtract(mdr->prealloc_inos);
3325 session->info.prealloc_inos.insert(mdr->prealloc_inos);
3326 mds->sessionmap.mark_dirty(session, !mdr->used_prealloc_ino);
3327 mds->inotable->apply_alloc_ids(mdr->prealloc_inos);
3328 }
3329 if (mdr->used_prealloc_ino) {
3330 ceph_assert(session);
3331 session->info.used_inos.erase(mdr->used_prealloc_ino);
3332 mds->sessionmap.mark_dirty(session);
3333 }
3334 }
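//
// The three helpers above form the allocate/journal/apply cycle for new
// inode numbers; a rough sketch of the order a create operation runs
// them in:
//
//   CInode *in = prepare_new_inode(mdr, dir, useino, mode, &layout);
//   journal_allocated_inos(mdr, &le->metablob);  // record in the EUpdate
//   // ... once the log entry is safe:
//   apply_allocated_inos(mdr, session);          // commit table + session
//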
3335
3336 class C_MDS_TryFindInode : public ServerContext {
3337 MDRequestRef mdr;
3338 public:
3339 C_MDS_TryFindInode(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3340 void finish(int r) override {
3341 if (r == -ESTALE) // :( find_ino_peers failed
3342 server->respond_to_request(mdr, r);
3343 else
3344 server->dispatch_client_request(mdr);
3345 }
3346 };
3347
3348 class CF_MDS_MDRContextFactory : public MDSContextFactory {
3349 public:
3350 CF_MDS_MDRContextFactory(MDCache *cache, MDRequestRef &mdr, bool dl) :
3351 mdcache(cache), mdr(mdr), drop_locks(dl) {}
3352 MDSContext *build() {
3353 if (drop_locks) {
3354 mdcache->mds->locker->drop_locks(mdr.get(), nullptr);
3355 mdr->drop_local_auth_pins();
3356 }
3357 return new C_MDS_RetryRequest(mdcache, mdr);
3358 }
3359 private:
3360 MDCache *mdcache;
3361 MDRequestRef mdr;
3362 bool drop_locks;
3363 };
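//
// Illustrative use of the factory: path_traverse() calls build() at each
// point where it must wait, so with drop_locks=true every wait releases
// the mdr's locks and auth pins before the retry is queued:
//
//   CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
//   int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
//   // r > 0: a C_MDS_RetryRequest built by cf re-dispatches us later
//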
3364
3365 /* If this returns null, the request has been handled
3366 * as appropriate: forwarded on, or the client's been replied to */
3367 CInode* Server::rdlock_path_pin_ref(MDRequestRef& mdr,
3368 bool want_auth,
3369 bool no_want_auth)
3370 {
3371 const filepath& refpath = mdr->get_filepath();
3372 dout(10) << "rdlock_path_pin_ref " << *mdr << " " << refpath << dendl;
3373
3374 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3375 return mdr->in[0];
3376
3377 // traverse
3378 CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
3379 int flags = 0;
3380 if (refpath.is_last_snap()) {
3381 if (!no_want_auth)
3382 want_auth = true;
3383 } else {
3384 flags |= MDS_TRAVERSE_RDLOCK_PATH | MDS_TRAVERSE_RDLOCK_SNAP;
3385 }
3386 if (want_auth)
3387 flags |= MDS_TRAVERSE_WANT_AUTH;
3388 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0], &mdr->in[0]);
3389 if (r > 0)
3390 return nullptr; // delayed
3391 if (r < 0) { // error
3392 if (r == -ENOENT && !mdr->dn[0].empty()) {
3393 if (mdr->client_request &&
3394 mdr->client_request->get_dentry_wanted())
3395 mdr->tracedn = mdr->dn[0].back();
3396 respond_to_request(mdr, r);
3397 } else if (r == -ESTALE) {
3398 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3399 MDSContext *c = new C_MDS_TryFindInode(this, mdr);
3400 mdcache->find_ino_peers(refpath.get_ino(), c);
3401 } else {
3402 dout(10) << "FAIL on error " << r << dendl;
3403 respond_to_request(mdr, r);
3404 }
3405 return nullptr;
3406 }
3407 CInode *ref = mdr->in[0];
3408 dout(10) << "ref is " << *ref << dendl;
3409
3410 if (want_auth) {
3411 // auth_pin?
3412 // do NOT proceed if freezing, as cap release may defer in that case, and
3413 // we could deadlock when we try to lock @ref.
3414 // if we're already auth_pinned, continue; the release has already been processed.
3415 if (ref->is_frozen() || ref->is_frozen_auth_pin() ||
3416 (ref->is_freezing() && !mdr->is_auth_pinned(ref))) {
3417 dout(7) << "waiting for !frozen/authpinnable on " << *ref << dendl;
3418 ref->add_waiter(CInode::WAIT_UNFREEZE, cf.build());
3419 if (mdr->is_any_remote_auth_pin())
3420 mds->locker->notify_freeze_waiter(ref);
3421 return nullptr;
3422 }
3423 mdr->auth_pin(ref);
3424 }
3425
3426 // set and pin ref
3427 mdr->pin(ref);
3428 return ref;
3429 }
3430
3431
3432 /** rdlock_path_xlock_dentry
3433 * traverse path to the directory that could/would contain dentry.
3434 * make sure i am auth for that dentry, forward as necessary.
3435 * create null dentry in place (or use existing if okexist).
3436 * get rdlocks on traversed dentries, xlock on new dentry.
3437 */
3438 CDentry* Server::rdlock_path_xlock_dentry(MDRequestRef& mdr,
3439 bool create, bool okexist, bool want_layout)
3440 {
3441 const filepath& refpath = mdr->get_filepath();
3442 dout(10) << "rdlock_path_xlock_dentry " << *mdr << " " << refpath << dendl;
3443
3444 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3445 return mdr->dn[0].back();
3446
3447 // figure parent dir vs dname
3448 if (refpath.depth() == 0) {
3449 dout(7) << "invalid path (zero length)" << dendl;
3450 respond_to_request(mdr, -EINVAL);
3451 return nullptr;
3452 }
3453
3454 if (refpath.is_last_snap()) {
3455 respond_to_request(mdr, -EROFS);
3456 return nullptr;
3457 }
3458
3459 if (refpath.is_last_dot_or_dotdot()) {
3460 dout(7) << "invalid path (last dot or dot_dot)" << dendl;
3461 if (create)
3462 respond_to_request(mdr, -EEXIST);
3463 else
3464 respond_to_request(mdr, -ENOTEMPTY);
3465 return nullptr;
3466 }
3467
3468 // traverse to parent dir
3469 CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
3470 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_RDLOCK_PATH |
3471 MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_XLOCK_DENTRY |
3472 MDS_TRAVERSE_WANT_AUTH;
3473 if (refpath.depth() == 1 && !mdr->lock_cache_disabled)
3474 flags |= MDS_TRAVERSE_CHECK_LOCKCACHE;
3475 if (create)
3476 flags |= MDS_TRAVERSE_RDLOCK_AUTHLOCK;
3477 if (want_layout)
3478 flags |= MDS_TRAVERSE_WANT_DIRLAYOUT;
3479 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3480 if (r > 0)
3481 return nullptr; // delayed
3482 if (r < 0) {
3483 if (r == -ESTALE) {
3484 dout(10) << "FAIL on ESTALE but attempting recovery" << dendl;
3485 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3486 return nullptr;
3487 }
3488 respond_to_request(mdr, r);
3489 return nullptr;
3490 }
3491
3492 CDentry *dn = mdr->dn[0].back();
3493 CDir *dir = dn->get_dir();
3494 CInode *diri = dir->get_inode();
3495
3496 if (!mdr->reqid.name.is_mds()) {
3497 if (diri->is_system() && !diri->is_root()) {
3498 respond_to_request(mdr, -EROFS);
3499 return nullptr;
3500 }
3501 }
3502
3503 if (!diri->is_base() && diri->get_projected_parent_dir()->inode->is_stray()) {
3504 respond_to_request(mdr, -ENOENT);
3505 return nullptr;
3506 }
3507
3508 CDentry::linkage_t *dnl = dn->get_projected_linkage();
3509 if (dnl->is_null()) {
3510 if (!create && okexist) {
3511 respond_to_request(mdr, -ENOENT);
3512 return nullptr;
3513 }
3514
3515 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3516 dn->first = std::max(dn->first, next_snap);
3517 } else {
3518 if (!okexist) {
3519 respond_to_request(mdr, -EEXIST);
3520 return nullptr;
3521 }
3522 mdr->in[0] = dnl->get_inode();
3523 }
3524
3525 return dn;
3526 }
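//
// Usage sketch: handle_client_openc() below calls
//
//   CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
//
// i.e. create the null dentry, tolerate an existing one unless O_EXCL,
// and capture the parent directory's layout for the new file. A null
// return always means the mdr was forwarded, queued, or answered.
//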
3527
3528 /** rdlock_two_paths_xlock_destdn
3529 * traverse two paths and lock the two paths in proper order.
3530 * The order of taking locks is:
3531 * 1. Lock directory inodes or dentries according to which trees they
3532 * are under. Lock objects under fs root before objects under mdsdir.
3533 * 2. Lock directory inodes or dentries according to their depth, in
3534 * ascending order.
3535 * 3. Lock directory inodes or dentries according to inode numbers or
3536 * dentries' parent inode numbers, in ascending order.
3537 * 4. Lock dentries in the same directory in order of their keys.
3538 * 5. Lock non-directory inodes according to inode numbers, in ascending
3539 * order.
3540 */
3541 std::pair<CDentry*, CDentry*>
3542 Server::rdlock_two_paths_xlock_destdn(MDRequestRef& mdr, bool xlock_srcdn)
3543 {
3544
3545 const filepath& refpath = mdr->get_filepath();
3546 const filepath& refpath2 = mdr->get_filepath2();
3547
3548 dout(10) << "rdlock_two_paths_xlock_destdn " << *mdr << " " << refpath << " " << refpath2 << dendl;
3549
3550 if (mdr->locking_state & MutationImpl::PATH_LOCKED)
3551 return std::make_pair(mdr->dn[0].back(), mdr->dn[1].back());
3552
3553 if (refpath.depth() != 1 || refpath2.depth() != 1) {
3554 respond_to_request(mdr, -EINVAL);
3555 return std::make_pair(nullptr, nullptr);
3556 }
3557
3558 if (refpath.is_last_snap() || refpath2.is_last_snap()) {
3559 respond_to_request(mdr, -EROFS);
3560 return std::make_pair(nullptr, nullptr);
3561 }
3562
3563 // traverse to parent dir
3564 CF_MDS_MDRContextFactory cf(mdcache, mdr, true);
3565 int flags = MDS_TRAVERSE_RDLOCK_SNAP | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_WANT_AUTH;
3566 int r = mdcache->path_traverse(mdr, cf, refpath, flags, &mdr->dn[0]);
3567 if (r != 0) {
3568 if (r == -ESTALE) {
3569 dout(10) << "ESTALE on path, attempting recovery" << dendl;
3570 mdcache->find_ino_peers(refpath.get_ino(), new C_MDS_TryFindInode(this, mdr));
3571 } else if (r < 0) {
3572 respond_to_request(mdr, r);
3573 }
3574 return std::make_pair(nullptr, nullptr);
3575 }
3576
3577 flags = MDS_TRAVERSE_RDLOCK_SNAP2 | MDS_TRAVERSE_WANT_DENTRY | MDS_TRAVERSE_DISCOVER;
3578 r = mdcache->path_traverse(mdr, cf, refpath2, flags, &mdr->dn[1]);
3579 if (r != 0) {
3580 if (r == -ESTALE) {
3581 dout(10) << "ESTALE on path2, attempting recovery" << dendl;
3582 mdcache->find_ino_peers(refpath2.get_ino(), new C_MDS_TryFindInode(this, mdr));
3583 } else if (r < 0) {
3584 respond_to_request(mdr, r);
3585 }
3586 return std::make_pair(nullptr, nullptr);
3587 }
3588
3589 CDentry *srcdn = mdr->dn[1].back();
3590 CDir *srcdir = srcdn->get_dir();
3591 CDentry *destdn = mdr->dn[0].back();
3592 CDir *destdir = destdn->get_dir();
3593
3594 if (!mdr->reqid.name.is_mds()) {
3595 if ((srcdir->get_inode()->is_system() && !srcdir->get_inode()->is_root()) ||
3596 (destdir->get_inode()->is_system() && !destdir->get_inode()->is_root())) {
3597 respond_to_request(mdr, -EROFS);
3598 return std::make_pair(nullptr, nullptr);
3599 }
3600 }
3601
3602 if (!destdir->get_inode()->is_base() &&
3603 destdir->get_inode()->get_projected_parent_dir()->inode->is_stray()) {
3604 respond_to_request(mdr, -ENOENT);
3605 return std::make_pair(nullptr, nullptr);
3606 }
3607
3608 MutationImpl::LockOpVec lov;
3609 if (srcdir->get_inode() == destdir->get_inode()) {
3610 lov.add_wrlock(&destdir->inode->filelock);
3611 lov.add_wrlock(&destdir->inode->nestlock);
3612 if (xlock_srcdn && srcdir != destdir) {
3613 mds_rank_t srcdir_auth = srcdir->authority().first;
3614 if (srcdir_auth != mds->get_nodeid()) {
3615 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3616 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3617 }
3618 }
3619
3620 if (srcdn->get_name() > destdn->get_name())
3621 lov.add_xlock(&destdn->lock);
3622
3623 if (xlock_srcdn)
3624 lov.add_xlock(&srcdn->lock);
3625 else
3626 lov.add_rdlock(&srcdn->lock);
3627
3628 if (srcdn->get_name() < destdn->get_name())
3629 lov.add_xlock(&destdn->lock);
3630 } else {
3631 int cmp = mdr->compare_paths();
3632 bool lock_destdir_first =
3633 (cmp < 0 || (cmp == 0 && destdir->ino() < srcdir->ino()));
3634
3635 if (lock_destdir_first) {
3636 lov.add_wrlock(&destdir->inode->filelock);
3637 lov.add_wrlock(&destdir->inode->nestlock);
3638 lov.add_xlock(&destdn->lock);
3639 }
3640
3641 if (xlock_srcdn) {
3642 mds_rank_t srcdir_auth = srcdir->authority().first;
3643 if (srcdir_auth == mds->get_nodeid()) {
3644 lov.add_wrlock(&srcdir->inode->filelock);
3645 lov.add_wrlock(&srcdir->inode->nestlock);
3646 } else {
3647 lov.add_remote_wrlock(&srcdir->inode->filelock, srcdir_auth);
3648 lov.add_remote_wrlock(&srcdir->inode->nestlock, srcdir_auth);
3649 }
3650 lov.add_xlock(&srcdn->lock);
3651 } else {
3652 lov.add_rdlock(&srcdn->lock);
3653 }
3654
3655 if (!lock_destdir_first) {
3656 lov.add_wrlock(&destdir->inode->filelock);
3657 lov.add_wrlock(&destdir->inode->nestlock);
3658 lov.add_xlock(&destdn->lock);
3659 }
3660 }
3661
3662 CInode *auth_pin_freeze = nullptr;
3663 // XXX any better way to do this?
3664 if (xlock_srcdn && !srcdn->is_auth()) {
3665 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
3666 auth_pin_freeze = srcdnl->is_primary() ? srcdnl->get_inode() : nullptr;
3667 }
3668 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
3669 return std::make_pair(nullptr, nullptr);
3670
3671 if (srcdn->get_projected_linkage()->is_null()) {
3672 respond_to_request(mdr, -ENOENT);
3673 return std::make_pair(nullptr, nullptr);
3674 }
3675
3676 if (destdn->get_projected_linkage()->is_null()) {
3677 snapid_t next_snap = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
3678 destdn->first = std::max(destdn->first, next_snap);
3679 }
3680
3681 mdr->locking_state |= MutationImpl::PATH_LOCKED;
3682
3683 return std::make_pair(destdn, srcdn);
3684 }
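//
// Concrete example of the ordering rules above (paths are illustrative):
// renaming "/a/x" onto "/b/y", with both parents under the fs root at
// equal depth, falls through to rule 3: the parent with the smaller
// inode number gets its wrlocks (and its dentry's xlock) first. Only
// when src and dest share one parent does rule 4 apply and the dentry
// keys ("x" vs "y") decide which dentry lock is taken first.
//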
3685
3686 /**
3687 * try_open_auth_dirfrag -- open dirfrag, or forward to dirfrag auth
3688 *
3689 * @param diri base inode
3690 * @param fg the exact frag we want
3691 * @param mdr request
3692 * @returns the dirfrag pointer, or NULL if the request was delayed or forwarded (mdr has been taken care of)
3693 */
3694 CDir* Server::try_open_auth_dirfrag(CInode *diri, frag_t fg, MDRequestRef& mdr)
3695 {
3696 CDir *dir = diri->get_dirfrag(fg);
3697
3698 if (dir) {
3699 // am i auth for the dirfrag?
3700 if (!dir->is_auth()) {
3701 mds_rank_t auth = dir->authority().first;
3702 dout(7) << "try_open_auth_dirfrag: not auth for " << *dir
3703 << ", fw to mds." << auth << dendl;
3704 mdcache->request_forward(mdr, auth);
3705 return nullptr;
3706 }
3707 } else {
3708 // not open and inode not mine?
3709 if (!diri->is_auth()) {
3710 mds_rank_t inauth = diri->authority().first;
3711 dout(7) << "try_open_auth_dirfrag: not open, not inode auth, fw to mds." << inauth << dendl;
3712 mdcache->request_forward(mdr, inauth);
3713 return nullptr;
3714 }
3715
3716 // not open and inode frozen?
3717 if (diri->is_frozen()) {
3718 dout(10) << "try_open_auth_dirfrag: dir inode is frozen, waiting " << *diri << dendl;
3719 ceph_assert(diri->get_parent_dir());
3720 diri->add_waiter(CInode::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
3721 return nullptr;
3722 }
3723
3724 // invent?
3725 dir = diri->get_or_open_dirfrag(mdcache, fg);
3726 }
3727
3728 return dir;
3729 }
3730
3731
3732 // ===============================================================================
3733 // STAT
3734
3735 void Server::handle_client_getattr(MDRequestRef& mdr, bool is_lookup)
3736 {
3737 const cref_t<MClientRequest> &req = mdr->client_request;
3738
3739 if (req->get_filepath().depth() == 0 && is_lookup) {
3740 // refpath can't be empty for lookup but it can for
3741 // getattr (we do getattr with empty refpath for mount of '/')
3742 respond_to_request(mdr, -EINVAL);
3743 return;
3744 }
3745
3746 bool want_auth = false;
3747 int mask = req->head.args.getattr.mask;
3748 if (mask & CEPH_STAT_RSTAT)
3749 want_auth = true; // set want_auth for CEPH_STAT_RSTAT mask
3750
3751 CInode *ref = rdlock_path_pin_ref(mdr, want_auth, false);
3752 if (!ref)
3753 return;
3754
3755 mdr->getattr_caps = mask;
3756
3757 if (mdr->snapid == CEPH_NOSNAP && !mdr->is_batch_head && mdr->is_batch_op()) {
3758 if (!is_lookup) {
3759 auto em = ref->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3760 if (em.second) {
3761 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr, mdcache);
3762 } else {
3763 dout(20) << __func__ << ": GETATTR op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3764 em.first->second->add_request(mdr);
3765 return;
3766 }
3767 } else {
3768 CDentry* dn = mdr->dn[0].back();
3769 auto em = dn->batch_ops.emplace(std::piecewise_construct, std::forward_as_tuple(mask), std::forward_as_tuple());
3770 if (em.second) {
3771 em.first->second = std::make_unique<Batch_Getattr_Lookup>(this, mdr, mdcache);
3772 mdr->pin(dn);
3773 } else {
3774 dout(20) << __func__ << ": LOOKUP op, wait for previous same getattr ops to respond. " << *mdr << dendl;
3775 em.first->second->add_request(mdr);
3776 return;
3777 }
3778 }
3779 mdr->is_batch_head = true;
3780 }
3781
3782 /*
3783 * if client currently holds the EXCL cap on a field, do not rdlock
3784 * it; client's stat() will result in valid info if _either_ EXCL
3785 * cap is held or MDS rdlocks and reads the value here.
3786 *
3787 * handling this case here is easier than weakening rdlock
3788 * semantics... that would cause problems elsewhere.
3789 */
3790 client_t client = mdr->get_client();
3791 int issued = 0;
3792 Capability *cap = ref->get_client_cap(client);
3793 if (cap && (mdr->snapid == CEPH_NOSNAP ||
3794 mdr->snapid <= cap->client_follows))
3795 issued = cap->issued();
3796
3797 // FIXME
3798 MutationImpl::LockOpVec lov;
3799 if ((mask & CEPH_CAP_LINK_SHARED) && !(issued & CEPH_CAP_LINK_EXCL))
3800 lov.add_rdlock(&ref->linklock);
3801 if ((mask & CEPH_CAP_AUTH_SHARED) && !(issued & CEPH_CAP_AUTH_EXCL))
3802 lov.add_rdlock(&ref->authlock);
3803 if ((mask & CEPH_CAP_XATTR_SHARED) && !(issued & CEPH_CAP_XATTR_EXCL))
3804 lov.add_rdlock(&ref->xattrlock);
3805 if ((mask & CEPH_CAP_FILE_SHARED) && !(issued & CEPH_CAP_FILE_EXCL)) {
3806 // Don't wait on unstable filelock if client is allowed to read file size.
3807 // This can reduce the response time of getattr in the case that multiple
3808 // clients do stat(2) and there are writers.
3809 // The downside of this optimization is that mds may not issue Fs caps along
3810 // with getattr reply. Client may need to send more getattr requests.
3811 if (mdr->is_rdlocked(&ref->filelock)) {
3812 lov.add_rdlock(&ref->filelock);
3813 } else if (ref->filelock.is_stable() ||
3814 ref->filelock.get_num_wrlocks() > 0 ||
3815 !ref->filelock.can_read(mdr->get_client())) {
3816 lov.add_rdlock(&ref->filelock);
3817 mdr->locking_state &= ~MutationImpl::ALL_LOCKED;
3818 }
3819 }
3820
3821 if (!mds->locker->acquire_locks(mdr, lov))
3822 return;
3823
3824 if (!check_access(mdr, ref, MAY_READ))
3825 return;
3826
3827 utime_t now = ceph_clock_now();
3828 mdr->set_mds_stamp(now);
3829
3830 // note which caps are requested, so we return at least a snapshot
3831 // value for them. (currently this matters for xattrs and inline data)
3832 mdr->getattr_caps = mask;
3833
3834 mds->balancer->hit_inode(ref, META_POP_IRD, req->get_source().num());
3835
3836 // reply
3837 dout(10) << "reply to stat on " << *req << dendl;
3838 mdr->tracei = ref;
3839 if (is_lookup)
3840 mdr->tracedn = mdr->dn[0].back();
3841 respond_to_request(mdr, 0);
3842 }
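//
// Example of the mask -> rdlock mapping above (illustrative): a stat(2)
// asking for CEPH_CAP_AUTH_SHARED|CEPH_CAP_FILE_SHARED from a client
// that currently holds Fx (CEPH_CAP_FILE_EXCL) only rdlocks authlock;
// whatever size/mtime the client caches under its exclusive cap is at
// least as fresh as anything the MDS could read here.
//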
3843
3844 struct C_MDS_LookupIno2 : public ServerContext {
3845 MDRequestRef mdr;
3846 C_MDS_LookupIno2(Server *s, MDRequestRef& r) : ServerContext(s), mdr(r) {}
3847 void finish(int r) override {
3848 server->_lookup_ino_2(mdr, r);
3849 }
3850 };
3851
3852 /*
3853 * filepath: ino
3854 */
3855 void Server::handle_client_lookup_ino(MDRequestRef& mdr,
3856 bool want_parent, bool want_dentry)
3857 {
3858 const cref_t<MClientRequest> &req = mdr->client_request;
3859
3860 if ((uint64_t)req->head.args.lookupino.snapid > 0)
3861 return _lookup_snap_ino(mdr);
3862
3863 inodeno_t ino = req->get_filepath().get_ino();
3864 CInode *in = mdcache->get_inode(ino);
3865 if (in && in->state_test(CInode::STATE_PURGING)) {
3866 respond_to_request(mdr, -ESTALE);
3867 return;
3868 }
3869 if (!in) {
3870 mdcache->open_ino(ino, (int64_t)-1, new C_MDS_LookupIno2(this, mdr), false);
3871 return;
3872 }
3873
3874 if (mdr && in->snaprealm && !in->snaprealm->have_past_parents_open() &&
3875 !in->snaprealm->open_parents(new C_MDS_RetryRequest(mdcache, mdr))) {
3876 return;
3877 }
3878
3879 // check for nothing (not read or write); this still applies the
3880 // path check.
3881 if (!check_access(mdr, in, 0))
3882 return;
3883
3884 CDentry *dn = in->get_projected_parent_dn();
3885 CInode *diri = dn ? dn->get_dir()->inode : NULL;
3886
3887 MutationImpl::LockOpVec lov;
3888 if (dn && (want_parent || want_dentry)) {
3889 mdr->pin(dn);
3890 lov.add_rdlock(&dn->lock);
3891 }
3892
3893 unsigned mask = req->head.args.lookupino.mask;
3894 if (mask) {
3895 Capability *cap = in->get_client_cap(mdr->get_client());
3896 int issued = 0;
3897 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
3898 issued = cap->issued();
3899 // FIXME
3900 // permission bits, ACL/security xattrs
3901 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
3902 lov.add_rdlock(&in->authlock);
3903 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
3904 lov.add_rdlock(&in->xattrlock);
3905
3906 mdr->getattr_caps = mask;
3907 }
3908
3909 if (!lov.empty()) {
3910 if (!mds->locker->acquire_locks(mdr, lov))
3911 return;
3912
3913 if (diri != NULL) {
3914 // need read access to directory inode
3915 if (!check_access(mdr, diri, MAY_READ))
3916 return;
3917 }
3918 }
3919
3920 if (want_parent) {
3921 if (in->is_base()) {
3922 respond_to_request(mdr, -EINVAL);
3923 return;
3924 }
3925 if (!diri || diri->is_stray()) {
3926 respond_to_request(mdr, -ESTALE);
3927 return;
3928 }
3929 dout(10) << "reply to lookup_parent " << *in << dendl;
3930 mdr->tracei = diri;
3931 respond_to_request(mdr, 0);
3932 } else {
3933 if (want_dentry) {
3934 inodeno_t dirino = req->get_filepath2().get_ino();
3935 if (!diri || (dirino != inodeno_t() && diri->ino() != dirino)) {
3936 respond_to_request(mdr, -ENOENT);
3937 return;
3938 }
3939 dout(10) << "reply to lookup_name " << *in << dendl;
3940 } else
3941 dout(10) << "reply to lookup_ino " << *in << dendl;
3942
3943 mdr->tracei = in;
3944 if (want_dentry)
3945 mdr->tracedn = dn;
3946 respond_to_request(mdr, 0);
3947 }
3948 }
3949
3950 void Server::_lookup_snap_ino(MDRequestRef& mdr)
3951 {
3952 const cref_t<MClientRequest> &req = mdr->client_request;
3953
3954 vinodeno_t vino;
3955 vino.ino = req->get_filepath().get_ino();
3956 vino.snapid = (__u64)req->head.args.lookupino.snapid;
3957 inodeno_t parent_ino = (__u64)req->head.args.lookupino.parent;
3958 __u32 hash = req->head.args.lookupino.hash;
3959
3960 dout(7) << "lookup_snap_ino " << vino << " parent " << parent_ino << " hash " << hash << dendl;
3961
3962 CInode *in = mdcache->lookup_snap_inode(vino);
3963 if (!in) {
3964 in = mdcache->get_inode(vino.ino);
3965 if (in) {
3966 if (in->state_test(CInode::STATE_PURGING) ||
3967 !in->has_snap_data(vino.snapid)) {
3968 if (in->is_dir() || !parent_ino) {
3969 respond_to_request(mdr, -ESTALE);
3970 return;
3971 }
3972 in = NULL;
3973 }
3974 }
3975 }
3976
3977 if (in) {
3978 dout(10) << "reply to lookup_snap_ino " << *in << dendl;
3979 mdr->snapid = vino.snapid;
3980 mdr->tracei = in;
3981 respond_to_request(mdr, 0);
3982 return;
3983 }
3984
3985 CInode *diri = NULL;
3986 if (parent_ino) {
3987 diri = mdcache->get_inode(parent_ino);
3988 if (!diri) {
3989 mdcache->open_ino(parent_ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr));
3990 return;
3991 }
3992
3993 if (!diri->is_dir()) {
3994 respond_to_request(mdr, -EINVAL);
3995 return;
3996 }
3997
3998 MutationImpl::LockOpVec lov;
3999 lov.add_rdlock(&diri->dirfragtreelock);
4000 if (!mds->locker->acquire_locks(mdr, lov))
4001 return;
4002
4003 frag_t frag = diri->dirfragtree[hash];
4004 CDir *dir = try_open_auth_dirfrag(diri, frag, mdr);
4005 if (!dir)
4006 return;
4007
4008 if (!dir->is_complete()) {
4009 if (dir->is_frozen()) {
4010 mds->locker->drop_locks(mdr.get());
4011 mdr->drop_local_auth_pins();
4012 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4013 return;
4014 }
4015 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4016 return;
4017 }
4018
4019 respond_to_request(mdr, -ESTALE);
4020 } else {
4021 mdcache->open_ino(vino.ino, mds->mdsmap->get_metadata_pool(), new C_MDS_LookupIno2(this, mdr), false);
4022 }
4023 }
4024
4025 void Server::_lookup_ino_2(MDRequestRef& mdr, int r)
4026 {
4027 inodeno_t ino = mdr->client_request->get_filepath().get_ino();
4028 dout(10) << "_lookup_ino_2 " << mdr.get() << " ino " << ino << " r=" << r << dendl;
4029
4030 // `r` is a rank if >=0, else an error code
4031 if (r >= 0) {
4032 mds_rank_t dest_rank(r);
4033 if (dest_rank == mds->get_nodeid())
4034 dispatch_client_request(mdr);
4035 else
4036 mdcache->request_forward(mdr, dest_rank);
4037 return;
4038 }
4039
4040 // give up
4041 if (r == -ENOENT || r == -ENODATA)
4042 r = -ESTALE;
4043 respond_to_request(mdr, r);
4044 }
4045
4046
4047 /* This function takes responsibility for the passed mdr*/
4048 void Server::handle_client_open(MDRequestRef& mdr)
4049 {
4050 const cref_t<MClientRequest> &req = mdr->client_request;
4051 dout(7) << "open on " << req->get_filepath() << dendl;
4052
4053 int flags = req->head.args.open.flags;
4054 int cmode = ceph_flags_to_mode(flags);
4055 if (cmode < 0) {
4056 respond_to_request(mdr, -EINVAL);
4057 return;
4058 }
4059
4060 bool need_auth = !file_mode_is_readonly(cmode) ||
4061 (flags & (CEPH_O_TRUNC | CEPH_O_DIRECTORY));
4062
4063 if ((cmode & CEPH_FILE_MODE_WR) && mdcache->is_readonly()) {
4064 dout(7) << "read-only FS" << dendl;
4065 respond_to_request(mdr, -EROFS);
4066 return;
4067 }
4068
4069 CInode *cur = rdlock_path_pin_ref(mdr, need_auth);
4070 if (!cur)
4071 return;
4072
4073 if (cur->is_frozen() || cur->state_test(CInode::STATE_EXPORTINGCAPS)) {
4074 ceph_assert(!need_auth);
4075 mdr->locking_state &= ~(MutationImpl::PATH_LOCKED | MutationImpl::ALL_LOCKED);
4076 cur = rdlock_path_pin_ref(mdr, true); // re-resolve; don't shadow the outer cur
4077 if (!cur)
4078 return;
4079 }
4080
4081 if (!cur->inode.is_file()) {
4082 // can only open non-regular inode with mode FILE_MODE_PIN, at least for now.
4083 cmode = CEPH_FILE_MODE_PIN;
4084 // the inode is symlink and client wants to follow it, ignore the O_TRUNC flag.
4085 if (cur->inode.is_symlink() && !(flags & CEPH_O_NOFOLLOW))
4086 flags &= ~CEPH_O_TRUNC;
4087 }
4088
4089 dout(10) << "open flags = " << flags
4090 << ", filemode = " << cmode
4091 << ", need_auth = " << need_auth
4092 << dendl;
4093
4094 // regular file?
4095 /*if (!cur->inode.is_file() && !cur->inode.is_dir()) {
4096 dout(7) << "not a file or dir " << *cur << dendl;
4097 respond_to_request(mdr, -ENXIO); // FIXME what error do we want?
4098 return;
4099 }*/
4100 if ((flags & CEPH_O_DIRECTORY) && !cur->inode.is_dir() && !cur->inode.is_symlink()) {
4101 dout(7) << "specified O_DIRECTORY on non-directory " << *cur << dendl;
4102 respond_to_request(mdr, -EINVAL);
4103 return;
4104 }
4105
4106 if ((flags & CEPH_O_TRUNC) && !cur->inode.is_file()) {
4107 dout(7) << "specified O_TRUNC on !(file|symlink) " << *cur << dendl;
4108 // return -EISDIR for a directory, -EINVAL for any other non-regular inode
4109 respond_to_request(mdr, cur->inode.is_dir() ? -EISDIR : -EINVAL);
4110 return;
4111 }
4112
4113 if (cur->inode.inline_data.version != CEPH_INLINE_NONE &&
4114 !mdr->session->get_connection()->has_feature(CEPH_FEATURE_MDS_INLINE_DATA)) {
4115 dout(7) << "old client cannot open inline data file " << *cur << dendl;
4116 respond_to_request(mdr, -EPERM);
4117 return;
4118 }
4119
4120 // snapped data is read only
4121 if (mdr->snapid != CEPH_NOSNAP &&
4122 ((cmode & CEPH_FILE_MODE_WR) || req->may_write())) {
4123 dout(7) << "snap " << mdr->snapid << " is read-only " << *cur << dendl;
4124 respond_to_request(mdr, -EROFS);
4125 return;
4126 }
4127
4128 MutationImpl::LockOpVec lov;
4129
4130 unsigned mask = req->head.args.open.mask;
4131 if (mask) {
4132 Capability *cap = cur->get_client_cap(mdr->get_client());
4133 int issued = 0;
4134 if (cap && (mdr->snapid == CEPH_NOSNAP || mdr->snapid <= cap->client_follows))
4135 issued = cap->issued();
4136 // permission bits, ACL/security xattrs
4137 if ((mask & CEPH_CAP_AUTH_SHARED) && (issued & CEPH_CAP_AUTH_EXCL) == 0)
4138 lov.add_rdlock(&cur->authlock);
4139 if ((mask & CEPH_CAP_XATTR_SHARED) && (issued & CEPH_CAP_XATTR_EXCL) == 0)
4140 lov.add_rdlock(&cur->xattrlock);
4141
4142 mdr->getattr_caps = mask;
4143 }
4144
4145 // O_TRUNC
4146 if ((flags & CEPH_O_TRUNC) && !mdr->has_completed) {
4147 ceph_assert(cur->is_auth());
4148
4149 lov.add_xlock(&cur->filelock);
4150 if (!mds->locker->acquire_locks(mdr, lov))
4151 return;
4152
4153 if (!check_access(mdr, cur, MAY_WRITE))
4154 return;
4155
4156 // wait for pending truncate?
4157 const auto pi = cur->get_projected_inode();
4158 if (pi->is_truncating()) {
4159 dout(10) << " waiting for pending truncate from " << pi->truncate_from
4160 << " to " << pi->truncate_size << " to complete on " << *cur << dendl;
4161 mds->locker->drop_locks(mdr.get());
4162 mdr->drop_local_auth_pins();
4163 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4164 return;
4165 }
4166
4167 do_open_truncate(mdr, cmode);
4168 return;
4169 }
4170
4171 // sync filelock if snapped.
4172 // this makes us wait for writers to flushsnaps, ensuring we get accurate metadata,
4173 // and that data itself is flushed so that we can read the snapped data off disk.
4174 if (mdr->snapid != CEPH_NOSNAP && !cur->is_dir()) {
4175 lov.add_rdlock(&cur->filelock);
4176 }
4177
4178 if (!mds->locker->acquire_locks(mdr, lov))
4179 return;
4180
4181 mask = MAY_READ;
4182 if (cmode & CEPH_FILE_MODE_WR)
4183 mask |= MAY_WRITE;
4184 if (!check_access(mdr, cur, mask))
4185 return;
4186
4187 utime_t now = ceph_clock_now();
4188 mdr->set_mds_stamp(now);
4189
4190 if (cur->is_file() || cur->is_dir()) {
4191 if (mdr->snapid == CEPH_NOSNAP) {
4192 // register new cap
4193 Capability *cap = mds->locker->issue_new_caps(cur, cmode, mdr, nullptr);
4194 if (cap)
4195 dout(12) << "open issued caps " << ccap_string(cap->pending())
4196 << " for " << req->get_source()
4197 << " on " << *cur << dendl;
4198 } else {
4199 int caps = ceph_caps_for_mode(cmode);
4200 dout(12) << "open issued IMMUTABLE SNAP caps " << ccap_string(caps)
4201 << " for " << req->get_source()
4202 << " snapid " << mdr->snapid
4203 << " on " << *cur << dendl;
4204 mdr->snap_caps = caps;
4205 }
4206 }
4207
4208 // increase max_size?
4209 if (cmode & CEPH_FILE_MODE_WR)
4210 mds->locker->check_inode_max_size(cur);
4211
4212 // make sure this inode gets into the journal
4213 if (cur->is_auth() && cur->last == CEPH_NOSNAP &&
4214 mdcache->open_file_table.should_log_open(cur)) {
4215 EOpen *le = new EOpen(mds->mdlog);
4216 mdlog->start_entry(le);
4217 le->add_clean_inode(cur);
4218 mdlog->submit_entry(le);
4219 }
4220
4221 // hit pop
4222 if (cmode & CEPH_FILE_MODE_WR)
4223 mds->balancer->hit_inode(cur, META_POP_IWR);
4224 else
4225 mds->balancer->hit_inode(cur, META_POP_IRD,
4226 mdr->client_request->get_source().num());
4227
4228 CDentry *dn = nullptr;
4229 if (req->get_dentry_wanted()) {
4230 ceph_assert(mdr->dn[0].size());
4231 dn = mdr->dn[0].back();
4232 }
4233
4234 mdr->tracei = cur;
4235 mdr->tracedn = dn;
4236 respond_to_request(mdr, 0);
4237 }
4238
4239 class C_MDS_openc_finish : public ServerLogContext {
4240 CDentry *dn;
4241 CInode *newi;
4242 public:
4243 C_MDS_openc_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
4244 ServerLogContext(s, r), dn(d), newi(ni) {}
4245 void finish(int r) override {
4246 ceph_assert(r == 0);
4247
4248 dn->pop_projected_linkage();
4249
4250 // dirty inode, dn, dir
4251 newi->inode.version--; // a bit hacky, see C_MDS_mknod_finish
4252 newi->mark_dirty(newi->inode.version+1, mdr->ls);
4253 newi->mark_dirty_parent(mdr->ls, true);
4254
4255 mdr->apply();
4256
4257 get_mds()->locker->share_inode_max_size(newi);
4258
4259 MDRequestRef null_ref;
4260 get_mds()->mdcache->send_dentry_link(dn, null_ref);
4261
4262 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
4263
4264 server->respond_to_request(mdr, 0);
4265
4266 ceph_assert(g_conf()->mds_kill_openc_at != 1);
4267 }
4268 };
4269
4270 /* This function takes responsibility for the passed mdr*/
4271 void Server::handle_client_openc(MDRequestRef& mdr)
4272 {
4273 const cref_t<MClientRequest> &req = mdr->client_request;
4274 client_t client = mdr->get_client();
4275
4276 dout(7) << "open w/ O_CREAT on " << req->get_filepath() << dendl;
4277
4278 int cmode = ceph_flags_to_mode(req->head.args.open.flags);
4279 if (cmode < 0) {
4280 respond_to_request(mdr, -EINVAL);
4281 return;
4282 }
4283
4284 bool excl = req->head.args.open.flags & CEPH_O_EXCL;
4285 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, !excl, true);
4286 if (!dn)
4287 return;
4288
4289 CDentry::linkage_t *dnl = dn->get_projected_linkage();
4290 if (!excl && !dnl->is_null()) {
4291 // it existed.
4292 mds->locker->xlock_downgrade(&dn->lock, mdr.get());
4293
4294 MutationImpl::LockOpVec lov;
4295 lov.add_rdlock(&dnl->get_inode()->snaplock);
4296 if (!mds->locker->acquire_locks(mdr, lov))
4297 return;
4298
4299 handle_client_open(mdr);
4300 return;
4301 }
4302
4303 ceph_assert(dnl->is_null());
4304
4305 // set layout
4306 file_layout_t layout;
4307 if (mdr->dir_layout != file_layout_t())
4308 layout = mdr->dir_layout;
4309 else
4310 layout = mdcache->default_file_layout;
4311
4312 // What kind of client caps are required to complete this operation
4313 uint64_t access = MAY_WRITE;
4314
4315 const auto default_layout = layout;
4316
4317 // fill in any special params from client
4318 if (req->head.args.open.stripe_unit)
4319 layout.stripe_unit = req->head.args.open.stripe_unit;
4320 if (req->head.args.open.stripe_count)
4321 layout.stripe_count = req->head.args.open.stripe_count;
4322 if (req->head.args.open.object_size)
4323 layout.object_size = req->head.args.open.object_size;
4324 if (req->get_connection()->has_feature(CEPH_FEATURE_CREATEPOOLID) &&
4325 (__s32)req->head.args.open.pool >= 0) {
4326 layout.pool_id = req->head.args.open.pool;
4327
4328 // make sure we have as new a map as the client
4329 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
4330 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
4331 return;
4332 }
4333 }
4334
4335 // If client doesn't have capability to modify layout pools, then
4336 // only permit this request if the requested pool matches what the
4337 // file would have inherited anyway from its parent.
4338 if (default_layout != layout) {
4339 access |= MAY_SET_VXATTR;
4340 }
4341
4342 if (!layout.is_valid()) {
4343 dout(10) << " invalid initial file layout" << dendl;
4344 respond_to_request(mdr, -EINVAL);
4345 return;
4346 }
4347 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
4348 dout(10) << " invalid data pool " << layout.pool_id << dendl;
4349 respond_to_request(mdr, -EINVAL);
4350 return;
4351 }
4352
4353 // created null dn.
4354 CDir *dir = dn->get_dir();
4355 CInode *diri = dir->get_inode();
4356 if (!check_access(mdr, diri, access))
4357 return;
4358 if (!check_fragment_space(mdr, dir))
4359 return;
4360
4361 if (mdr->dn[0].size() == 1)
4362 mds->locker->create_lock_cache(mdr, diri, &mdr->dir_layout);
4363
4364 // create inode.
4365 CInode *in = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino),
4366 req->head.args.open.mode | S_IFREG, &layout);
4367 ceph_assert(in);
4368
4369 // it's a file.
4370 dn->push_projected_linkage(in);
4371
4372 in->inode.version = dn->pre_dirty();
4373 if (layout.pool_id != mdcache->default_file_layout.pool_id)
4374 in->inode.add_old_pool(mdcache->default_file_layout.pool_id);
4375 in->inode.update_backtrace();
4376 in->inode.rstat.rfiles = 1;
4377
4378 SnapRealm *realm = diri->find_snaprealm();
4379 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
4380 ceph_assert(follows >= realm->get_newest_seq());
4381
4382 ceph_assert(dn->first == follows+1);
4383 in->first = dn->first;
4384
4385 // do the open
4386 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
4387 in->authlock.set_state(LOCK_EXCL);
4388 in->xattrlock.set_state(LOCK_EXCL);
4389
4390 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
4391 in->inode.client_ranges[client].range.first = 0;
4392 in->inode.client_ranges[client].range.last = in->inode.layout.stripe_unit;
4393 in->inode.client_ranges[client].follows = follows;
4394 cap->mark_clientwriteable();
4395 }
4396
4397 // prepare finisher
4398 mdr->ls = mdlog->get_current_segment();
4399 EUpdate *le = new EUpdate(mdlog, "openc");
4400 mdlog->start_entry(le);
4401 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
4402 journal_allocated_inos(mdr, &le->metablob);
4403 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
4404 le->metablob.add_primary_dentry(dn, in, true, true, true);
4405
4406 // make sure this inode gets into the journal
4407 le->metablob.add_opened_ino(in->ino());
4408
4409 C_MDS_openc_finish *fin = new C_MDS_openc_finish(this, mdr, dn, in);
4410
4411 if (mdr->session->info.has_feature(CEPHFS_FEATURE_DELEG_INO)) {
4412 openc_response_t ocresp;
4413
4414 dout(10) << "adding created_ino and delegated_inos" << dendl;
4415 ocresp.created_ino = in->inode.ino;
4416
4417 if (delegate_inos_pct && !req->is_queued_for_replay()) {
4418 // Try to delegate some prealloc_inos to the client, if it's down to half the max
4419 unsigned frac = 100 / delegate_inos_pct;
4420 if (mdr->session->delegated_inos.size() < (unsigned)g_conf()->mds_client_prealloc_inos / frac / 2)
4421 mdr->session->delegate_inos(g_conf()->mds_client_prealloc_inos / frac, ocresp.delegated_inos);
4422 }
4423
4424 encode(ocresp, mdr->reply_extra_bl);
4425 } else if (mdr->client_request->get_connection()->has_feature(CEPH_FEATURE_REPLY_CREATE_INODE)) {
4426 dout(10) << "adding ino to reply to indicate inode was created" << dendl;
4427 // add the file created flag onto the reply if create_flags features is supported
4428 encode(in->inode.ino, mdr->reply_extra_bl);
4429 }
4430
4431 journal_and_reply(mdr, in, dn, le, fin);
4432
4433 // We hit_dir (via hit_inode) in our finish callback, but by then we might
4434 // have overshot the split size (multiple opencs in flight), so here is
4435 // an early chance to split the dir if this openc makes it oversized.
4436 mds->balancer->maybe_fragment(dir, false);
4437 }
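//
// Worked example of the delegation maths above (assuming the defaults
// mds_client_delegate_inos_pct = 50 and mds_client_prealloc_inos =
// 1000): frac = 100/50 = 2, so once a session holds fewer than
// 1000/2/2 = 250 delegated inos, the openc reply hands it another
// batch of up to 1000/2 = 500.
//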
4438
4439
4440
4441 void Server::handle_client_readdir(MDRequestRef& mdr)
4442 {
4443 const cref_t<MClientRequest> &req = mdr->client_request;
4444 client_t client = req->get_source().num();
4445 MutationImpl::LockOpVec lov;
4446 CInode *diri = rdlock_path_pin_ref(mdr, false, true);
4447 if (!diri) return;
4448
4449 // it's a directory, right?
4450 if (!diri->is_dir()) {
4451 // not a dir
4452 dout(10) << "reply to " << *req << " readdir -ENOTDIR" << dendl;
4453 respond_to_request(mdr, -ENOTDIR);
4454 return;
4455 }
4456
4457 lov.add_rdlock(&diri->filelock);
4458 lov.add_rdlock(&diri->dirfragtreelock);
4459
4460 if (!mds->locker->acquire_locks(mdr, lov))
4461 return;
4462
4463 if (!check_access(mdr, diri, MAY_READ))
4464 return;
4465
4466 // which frag?
4467 frag_t fg = (__u32)req->head.args.readdir.frag;
4468 unsigned req_flags = (__u32)req->head.args.readdir.flags;
4469 string offset_str = req->get_path2();
4470
4471 __u32 offset_hash = 0;
4472 if (!offset_str.empty())
4473 offset_hash = ceph_frag_value(diri->hash_dentry_name(offset_str));
4474 else
4475 offset_hash = (__u32)req->head.args.readdir.offset_hash;
4476
4477 dout(10) << " frag " << fg << " offset '" << offset_str << "'"
4478 << " offset_hash " << offset_hash << " flags " << req_flags << dendl;
4479
4480 // does the frag exist?
4481 if (diri->dirfragtree[fg.value()] != fg) {
4482 frag_t newfg;
4483 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4484 if (fg.contains((unsigned)offset_hash)) {
4485 newfg = diri->dirfragtree[offset_hash];
4486 } else {
4487 // client actually wants next frag
4488 newfg = diri->dirfragtree[fg.value()];
4489 }
4490 } else {
4491 offset_str.clear();
4492 newfg = diri->dirfragtree[fg.value()];
4493 }
4494 dout(10) << " adjust frag " << fg << " -> " << newfg << " " << diri->dirfragtree << dendl;
4495 fg = newfg;
4496 }
4497
4498 CDir *dir = try_open_auth_dirfrag(diri, fg, mdr);
4499 if (!dir) return;
4500
4501 // ok!
4502 dout(10) << "handle_client_readdir on " << *dir << dendl;
4503 ceph_assert(dir->is_auth());
4504
4505 if (!dir->is_complete()) {
4506 if (dir->is_frozen()) {
4507 dout(7) << "dir is frozen " << *dir << dendl;
4508 mds->locker->drop_locks(mdr.get());
4509 mdr->drop_local_auth_pins();
4510 dir->add_waiter(CDir::WAIT_UNFREEZE, new C_MDS_RetryRequest(mdcache, mdr));
4511 return;
4512 }
4513 // fetch
4514 dout(10) << " incomplete dir contents for readdir on " << *dir << ", fetching" << dendl;
4515 dir->fetch(new C_MDS_RetryRequest(mdcache, mdr), true);
4516 return;
4517 }
4518
4519 #ifdef MDS_VERIFY_FRAGSTAT
4520 dir->verify_fragstat();
4521 #endif
4522
4523 utime_t now = ceph_clock_now();
4524 mdr->set_mds_stamp(now);
4525
4526 snapid_t snapid = mdr->snapid;
4527 dout(10) << "snapid " << snapid << dendl;
4528
4529 SnapRealm *realm = diri->find_snaprealm();
4530
4531 unsigned max = req->head.args.readdir.max_entries;
4532 if (!max)
4533 max = dir->get_num_any(); // whatever, something big.
4534 unsigned max_bytes = req->head.args.readdir.max_bytes;
4535 if (!max_bytes)
4536 // make sure at least one item can be encoded
4537 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
4538
4539 // start final blob
4540 bufferlist dirbl;
4541 DirStat ds;
4542 ds.frag = dir->get_frag();
4543 ds.auth = dir->get_dir_auth().first;
4544 if (dir->is_auth() && !mdcache->forward_all_reqs_to_auth())
4545 dir->get_dist_spec(ds.dist, mds->get_nodeid());
4546
4547 dir->encode_dirstat(dirbl, mdr->session->info, ds);
4548
4549 // count bytes available.
4550 // this isn't perfect, but we should capture the main variable/unbounded size items!
4551 int front_bytes = dirbl.length() + sizeof(__u32) + sizeof(__u8)*2;
4552 int bytes_left = max_bytes - front_bytes;
4553 bytes_left -= realm->get_snap_trace().length();
4554
4555 // build dir contents
4556 bufferlist dnbl;
4557 __u32 numfiles = 0;
4558 bool start = !offset_hash && offset_str.empty();
4559 // skip all dns < dentry_key_t(snapid, offset_str, offset_hash)
4560 dentry_key_t skip_key(snapid, offset_str.c_str(), offset_hash);
4561 auto it = start ? dir->begin() : dir->lower_bound(skip_key);
4562 bool end = (it == dir->end());
4563 for (; !end && numfiles < max; end = (it == dir->end())) {
4564 CDentry *dn = it->second;
4565 ++it;
4566
4567 if (dn->state_test(CDentry::STATE_PURGING))
4568 continue;
4569
4570 bool dnp = dn->use_projected(client, mdr);
4571 CDentry::linkage_t *dnl = dnp ? dn->get_projected_linkage() : dn->get_linkage();
4572
4573 if (dnl->is_null())
4574 continue;
4575
4576 if (dn->last < snapid || dn->first > snapid) {
4577 dout(20) << "skipping non-overlapping snap " << *dn << dendl;
4578 continue;
4579 }
4580
4581 if (!start) {
4582 dentry_key_t offset_key(dn->last, offset_str.c_str(), offset_hash);
4583 if (!(offset_key < dn->key()))
4584 continue;
4585 }
4586
4587 CInode *in = dnl->get_inode();
4588
4589 if (in && in->ino() == CEPH_INO_CEPH)
4590 continue;
4591
4592 // remote link?
4593 // better for the MDS to do the work, if we think the client will stat any of these files.
4594 if (dnl->is_remote() && !in) {
4595 in = mdcache->get_inode(dnl->get_remote_ino());
4596 if (in) {
4597 dn->link_remote(dnl, in);
4598 } else if (dn->state_test(CDentry::STATE_BADREMOTEINO)) {
4599 dout(10) << "skipping bad remote ino on " << *dn << dendl;
4600 continue;
4601 } else {
4602 // touch everything i _do_ have
4603 for (auto &p : *dir) {
4604 if (!p.second->get_linkage()->is_null())
4605 mdcache->lru.lru_touch(p.second);
4606 }
4607
4608 // already issued caps and leases, reply immediately.
4609 if (dnbl.length() > 0) {
4610 mdcache->open_remote_dentry(dn, dnp, new C_MDSInternalNoop);
4611 dout(10) << " open remote dentry after caps were issued, stopping at "
4612 << dnbl.length() << " < " << bytes_left << dendl;
4613 break;
4614 }
4615
4616 mds->locker->drop_locks(mdr.get());
4617 mdr->drop_local_auth_pins();
4618 mdcache->open_remote_dentry(dn, dnp, new C_MDS_RetryRequest(mdcache, mdr));
4619 return;
4620 }
4621 }
4622 ceph_assert(in);
4623
4624 if ((int)(dnbl.length() + dn->get_name().length() + sizeof(__u32) + sizeof(LeaseStat)) > bytes_left) {
4625 dout(10) << " ran out of room, stopping at " << dnbl.length() << " < " << bytes_left << dendl;
4626 break;
4627 }
4628
4629 unsigned start_len = dnbl.length();
4630
4631 // dentry
4632 dout(12) << "including dn " << *dn << dendl;
4633 encode(dn->get_name(), dnbl);
4634 int lease_mask = dnl->is_primary() ? CEPH_LEASE_PRIMARY_LINK : 0;
4635 mds->locker->issue_client_lease(dn, mdr, lease_mask, now, dnbl);
4636
4637 // inode
4638 dout(12) << "including inode " << *in << dendl;
4639 int r = in->encode_inodestat(dnbl, mdr->session, realm, snapid, bytes_left - (int)dnbl.length());
4640 if (r < 0) {
4641 // chop off dn->name, lease
4642 dout(10) << " ran out of room, stopping at " << start_len << " < " << bytes_left << dendl;
4643 bufferlist keep;
4644 keep.substr_of(dnbl, 0, start_len);
4645 dnbl.swap(keep);
4646 break;
4647 }
4648 ceph_assert(r >= 0);
4649 numfiles++;
4650
4651 // touch dn
4652 mdcache->lru.lru_touch(dn);
4653 }
4654
4655 __u16 flags = 0;
4656 if (end) {
4657 flags = CEPH_READDIR_FRAG_END;
4658 if (start)
4659 flags |= CEPH_READDIR_FRAG_COMPLETE; // FIXME: what purpose does this serve
4660 }
4661 // clients only understand the END and COMPLETE flags?
4662 if (req_flags & CEPH_READDIR_REPLY_BITFLAGS) {
4663 flags |= CEPH_READDIR_HASH_ORDER | CEPH_READDIR_OFFSET_HASH;
4664 }
4665
4666 // finish final blob
4667 encode(numfiles, dirbl);
4668 encode(flags, dirbl);
4669 dirbl.claim_append(dnbl);
4670
4671 // yay, reply
4672 dout(10) << "reply to " << *req << " readdir num=" << numfiles
4673 << " bytes=" << dirbl.length()
4674 << " start=" << (int)start
4675 << " end=" << (int)end
4676 << dendl;
4677 mdr->reply_extra_bl = dirbl;
4678
4679 // bump popularity. NOTE: this doesn't quite capture it.
4680 mds->balancer->hit_dir(dir, META_POP_IRD, -1, numfiles);
4681
4682 // reply
4683 mdr->tracei = diri;
4684 respond_to_request(mdr, 0);
4685 }
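//
// Readdir paging sketch (client side, illustrative; last_name stands in
// for the final entry of the previous batch): a client walks a large
// directory by re-issuing the request with the cursor fields from the
// previous reply until CEPH_READDIR_FRAG_END is set on the last frag:
//
//   req->head.args.readdir.frag        = fg;           // current fragment
//   req->head.args.readdir.offset_hash = offset_hash;  // resume point (hash order)
//   req->set_filepath2(filepath(last_name, 0));        // resume point (name)
//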
4686
4687
4688
4689 // ===============================================================================
4690 // INODE UPDATES
4691
4692
4693 /*
4694 * finisher for basic inode updates
4695 */
4696 class C_MDS_inode_update_finish : public ServerLogContext {
4697 CInode *in;
4698 bool truncating_smaller, changed_ranges, new_realm;
4699 public:
4700 C_MDS_inode_update_finish(Server *s, MDRequestRef& r, CInode *i,
4701 bool sm=false, bool cr=false, bool nr=false) :
4702 ServerLogContext(s, r), in(i),
4703 truncating_smaller(sm), changed_ranges(cr), new_realm(nr) { }
4704 void finish(int r) override {
4705 ceph_assert(r == 0);
4706
4707 // apply
4708 in->pop_and_dirty_projected_inode(mdr->ls);
4709 mdr->apply();
4710
4711 MDSRank *mds = get_mds();
4712
4713 // notify any clients
4714 if (truncating_smaller && in->inode.is_truncating()) {
4715 mds->locker->issue_truncate(in);
4716 mds->mdcache->truncate_inode(in, mdr->ls);
4717 }
4718
4719 if (new_realm) {
4720 int op = CEPH_SNAP_OP_SPLIT;
4721 mds->mdcache->send_snap_update(in, 0, op);
4722 mds->mdcache->do_realm_invalidate_and_update_notify(in, op);
4723 }
4724
4725 get_mds()->balancer->hit_inode(in, META_POP_IWR);
4726
4727 server->respond_to_request(mdr, 0);
4728
4729 if (changed_ranges)
4730 get_mds()->locker->share_inode_max_size(in);
4731 }
4732 };
4733
4734 void Server::handle_client_file_setlock(MDRequestRef& mdr)
4735 {
4736 const cref_t<MClientRequest> &req = mdr->client_request;
4737 MutationImpl::LockOpVec lov;
4738
4739 // get the inode to operate on, and set up any locks needed for that
4740 CInode *cur = rdlock_path_pin_ref(mdr, true);
4741 if (!cur)
4742 return;
4743
4744 lov.add_xlock(&cur->flocklock);
4745 /* acquire_locks will return true if it gets the locks. If it fails,
4746 it will redeliver this request at a later date, so drop the request.
4747 */
4748 if (!mds->locker->acquire_locks(mdr, lov)) {
4749 dout(10) << "handle_client_file_setlock could not get locks!" << dendl;
4750 return;
4751 }
4752
4753 // copy the lock change into a ceph_filelock so we can store/apply it
4754 ceph_filelock set_lock;
4755 set_lock.start = req->head.args.filelock_change.start;
4756 set_lock.length = req->head.args.filelock_change.length;
4757 set_lock.client = req->get_orig_source().num();
4758 set_lock.owner = req->head.args.filelock_change.owner;
4759 set_lock.pid = req->head.args.filelock_change.pid;
4760 set_lock.type = req->head.args.filelock_change.type;
4761 bool will_wait = req->head.args.filelock_change.wait;
4762
4763 dout(10) << "handle_client_file_setlock: " << set_lock << dendl;
4764
4765 ceph_lock_state_t *lock_state = NULL;
4766 bool interrupt = false;
4767
4768 // get the appropriate lock state
4769 switch (req->head.args.filelock_change.rule) {
4770 case CEPH_LOCK_FLOCK_INTR:
4771 interrupt = true;
4772 // fall-thru
4773 case CEPH_LOCK_FLOCK:
4774 lock_state = cur->get_flock_lock_state();
4775 break;
4776
4777 case CEPH_LOCK_FCNTL_INTR:
4778 interrupt = true;
4779 // fall-thru
4780 case CEPH_LOCK_FCNTL:
4781 lock_state = cur->get_fcntl_lock_state();
4782 break;
4783
4784 default:
4785 dout(10) << "got unknown lock type " << set_lock.type
4786 << ", dropping request!" << dendl;
4787 respond_to_request(mdr, -EOPNOTSUPP);
4788 return;
4789 }
4790
4791 dout(10) << " state prior to lock change: " << *lock_state << dendl;
4792 if (CEPH_LOCK_UNLOCK == set_lock.type) {
4793 list<ceph_filelock> activated_locks;
4794 MDSContext::vec waiters;
4795 if (lock_state->is_waiting(set_lock)) {
4796 dout(10) << " unlock removing waiting lock " << set_lock << dendl;
4797 lock_state->remove_waiting(set_lock);
4798 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4799 } else if (!interrupt) {
4800 dout(10) << " unlock attempt on " << set_lock << dendl;
4801 lock_state->remove_lock(set_lock, activated_locks);
4802 cur->take_waiting(CInode::WAIT_FLOCK, waiters);
4803 }
4804 mds->queue_waiters(waiters);
4805
4806 respond_to_request(mdr, 0);
4807 } else {
4808 dout(10) << " lock attempt on " << set_lock << dendl;
4809 bool deadlock = false;
4810 if (mdr->more()->flock_was_waiting &&
4811 !lock_state->is_waiting(set_lock)) {
4812 dout(10) << " was waiting for lock but not anymore, must have been canceled " << set_lock << dendl;
4813 respond_to_request(mdr, -EINTR);
4814 } else if (!lock_state->add_lock(set_lock, will_wait, mdr->more()->flock_was_waiting, &deadlock)) {
4815 dout(10) << " it failed on this attempt" << dendl;
4816 // couldn't set lock right now
4817 if (deadlock) {
4818 respond_to_request(mdr, -EDEADLK);
4819 } else if (!will_wait) {
4820 respond_to_request(mdr, -EWOULDBLOCK);
4821 } else {
4822 dout(10) << " added to waiting list" << dendl;
4823 ceph_assert(lock_state->is_waiting(set_lock));
4824 mdr->more()->flock_was_waiting = true;
4825 mds->locker->drop_locks(mdr.get());
4826 mdr->drop_local_auth_pins();
4827 mdr->mark_event("failed to add lock, waiting");
4828 mdr->mark_nowarn();
4829 cur->add_waiter(CInode::WAIT_FLOCK, new C_MDS_RetryRequest(mdcache, mdr));
4830 }
4831 } else
4832 respond_to_request(mdr, 0);
4833 }
4834 dout(10) << " state after lock change: " << *lock_state << dendl;
4835 }
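//
// Client-visible outcomes of the setlock path above, in brief:
//   unlock                 -> 0 (blocked waiters on the range re-queued)
//   lock, granted          -> 0
//   lock, would block      -> -EWOULDBLOCK, or wait if will_wait is set
//   lock, deadlock         -> -EDEADLK
//   wait that was canceled -> -EINTR
//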
4836
4837 void Server::handle_client_file_readlock(MDRequestRef& mdr)
4838 {
4839 const cref_t<MClientRequest> &req = mdr->client_request;
4840 MutationImpl::LockOpVec lov;
4841
4842 // get the inode to operate on, and set up any locks needed for that
4843 CInode *cur = rdlock_path_pin_ref(mdr, true);
4844 if (!cur)
4845 return;
4846
4847 /* acquire_locks will return true if it gets the locks. If it fails,
4848 it will redeliver this request at a later date, so drop the request.
4849 */
4850 lov.add_rdlock(&cur->flocklock);
4851 if (!mds->locker->acquire_locks(mdr, lov)) {
4852 dout(10) << "handle_client_file_readlock could not get locks!" << dendl;
4853 return;
4854 }
4855
4856 // copy the lock change into a ceph_filelock so we can store/apply it
4857 ceph_filelock checking_lock;
4858 checking_lock.start = req->head.args.filelock_change.start;
4859 checking_lock.length = req->head.args.filelock_change.length;
4860 checking_lock.client = req->get_orig_source().num();
4861 checking_lock.owner = req->head.args.filelock_change.owner;
4862 checking_lock.pid = req->head.args.filelock_change.pid;
4863 checking_lock.type = req->head.args.filelock_change.type;
4864
4865 // get the appropriate lock state
4866 ceph_lock_state_t *lock_state = NULL;
4867 switch (req->head.args.filelock_change.rule) {
4868 case CEPH_LOCK_FLOCK:
4869 lock_state = cur->get_flock_lock_state();
4870 break;
4871
4872 case CEPH_LOCK_FCNTL:
4873 lock_state = cur->get_fcntl_lock_state();
4874 break;
4875
4876 default:
4877 dout(10) << "got unknown lock type " << checking_lock.type << dendl;
4878 respond_to_request(mdr, -EINVAL);
4879 return;
4880 }
4881 lock_state->look_for_lock(checking_lock);
4882
4883 bufferlist lock_bl;
4884 encode(checking_lock, lock_bl);
4885
4886 mdr->reply_extra_bl = lock_bl;
4887 respond_to_request(mdr, 0);
4888 }
4889
4890 void Server::handle_client_setattr(MDRequestRef& mdr)
4891 {
4892 const cref_t<MClientRequest> &req = mdr->client_request;
4893 MutationImpl::LockOpVec lov;
4894 CInode *cur = rdlock_path_pin_ref(mdr, true);
4895 if (!cur) return;
4896
4897 if (mdr->snapid != CEPH_NOSNAP) {
4898 respond_to_request(mdr, -EROFS);
4899 return;
4900 }
4901 if (cur->ino() < MDS_INO_SYSTEM_BASE && !cur->is_base()) {
4902 respond_to_request(mdr, -EPERM);
4903 return;
4904 }
4905
4906 __u32 mask = req->head.args.setattr.mask;
4907 __u32 access_mask = MAY_WRITE;
4908
4909 // xlock inode
4910 if (mask & (CEPH_SETATTR_MODE|CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_BTIME|CEPH_SETATTR_KILL_SGUID))
4911 lov.add_xlock(&cur->authlock);
4912 if (mask & (CEPH_SETATTR_MTIME|CEPH_SETATTR_ATIME|CEPH_SETATTR_SIZE))
4913 lov.add_xlock(&cur->filelock);
4914 if (mask & CEPH_SETATTR_CTIME)
4915 lov.add_wrlock(&cur->versionlock);
4916
4917 if (!mds->locker->acquire_locks(mdr, lov))
4918 return;
4919
4920 if ((mask & CEPH_SETATTR_UID) && (cur->inode.uid != req->head.args.setattr.uid))
4921 access_mask |= MAY_CHOWN;
4922
4923 if ((mask & CEPH_SETATTR_GID) && (cur->inode.gid != req->head.args.setattr.gid))
4924 access_mask |= MAY_CHGRP;
4925
4926 if (!check_access(mdr, cur, access_mask))
4927 return;
4928
4929 // trunc from bigger -> smaller?
4930 auto pip = cur->get_projected_inode();
4931
4932 uint64_t old_size = std::max<uint64_t>(pip->size, req->head.args.setattr.old_size);
4933
4934 // ENOSPC on growing file while full, but allow shrinks
4935 if (is_full && req->head.args.setattr.size > old_size) {
4936 dout(20) << __func__ << ": full, responding ENOSPC to setattr with larger size" << dendl;
4937 respond_to_request(mdr, -ENOSPC);
4938 return;
4939 }
4940
4941 bool truncating_smaller = false;
4942 if (mask & CEPH_SETATTR_SIZE) {
4943 truncating_smaller = req->head.args.setattr.size < old_size;
4944 if (truncating_smaller && pip->is_truncating()) {
4945 dout(10) << " waiting for pending truncate from " << pip->truncate_from
4946 << " to " << pip->truncate_size << " to complete on " << *cur << dendl;
4947 mds->locker->drop_locks(mdr.get());
4948 mdr->drop_local_auth_pins();
4949 cur->add_waiter(CInode::WAIT_TRUNC, new C_MDS_RetryRequest(mdcache, mdr));
4950 return;
4951 }
4952 }
4953
4954 bool changed_ranges = false;
4955
4956 // project update
4957 mdr->ls = mdlog->get_current_segment();
4958 EUpdate *le = new EUpdate(mdlog, "setattr");
4959 mdlog->start_entry(le);
4960
4961 auto &pi = cur->project_inode();
4962
4963 if (mask & CEPH_SETATTR_UID)
4964 pi.inode.uid = req->head.args.setattr.uid;
4965 if (mask & CEPH_SETATTR_GID)
4966 pi.inode.gid = req->head.args.setattr.gid;
4967
4968 if (mask & CEPH_SETATTR_MODE)
4969 pi.inode.mode = (pi.inode.mode & ~07777) | (req->head.args.setattr.mode & 07777);
4970 else if ((mask & (CEPH_SETATTR_UID|CEPH_SETATTR_GID|CEPH_SETATTR_KILL_SGUID)) &&
4971 S_ISREG(pi.inode.mode) &&
4972 (pi.inode.mode & (S_IXUSR|S_IXGRP|S_IXOTH))) {
4973 pi.inode.mode &= ~(S_ISUID|S_ISGID);
4974 }
4975
4976 if (mask & CEPH_SETATTR_MTIME)
4977 pi.inode.mtime = req->head.args.setattr.mtime;
4978 if (mask & CEPH_SETATTR_ATIME)
4979 pi.inode.atime = req->head.args.setattr.atime;
4980 if (mask & CEPH_SETATTR_BTIME)
4981 pi.inode.btime = req->head.args.setattr.btime;
4982 if (mask & (CEPH_SETATTR_ATIME | CEPH_SETATTR_MTIME | CEPH_SETATTR_BTIME))
4983 pi.inode.time_warp_seq++; // maybe not a timewarp, but still a serialization point.
4984 if (mask & CEPH_SETATTR_SIZE) {
4985 if (truncating_smaller) {
4986 pi.inode.truncate(old_size, req->head.args.setattr.size);
4987 le->metablob.add_truncate_start(cur->ino());
4988 } else {
4989 pi.inode.size = req->head.args.setattr.size;
4990 pi.inode.rstat.rbytes = pi.inode.size;
4991 }
4992 pi.inode.mtime = mdr->get_op_stamp();
4993
4994 // adjust client's max_size?
4995 CInode::mempool_inode::client_range_map new_ranges;
4996 bool max_increased = false;
4997 mds->locker->calc_new_client_ranges(cur, pi.inode.size, true, &new_ranges, &max_increased);
4998 if (pi.inode.client_ranges != new_ranges) {
4999 dout(10) << " client_ranges " << pi.inode.client_ranges << " -> " << new_ranges << dendl;
5000 pi.inode.client_ranges = new_ranges;
5001 changed_ranges = true;
5002 }
5003 }
5004
5005 pi.inode.version = cur->pre_dirty();
5006 pi.inode.ctime = mdr->get_op_stamp();
5007 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5008 pi.inode.rstat.rctime = mdr->get_op_stamp();
5009 pi.inode.change_attr++;
5010
5011 // log + wait
5012 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5013 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5014 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5015
5016 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5017 truncating_smaller, changed_ranges));
5018
5019 // flush immediately if there are readers/writers waiting
5020 if (mdr->is_xlocked(&cur->filelock) &&
5021 (cur->get_caps_wanted() & (CEPH_CAP_FILE_RD|CEPH_CAP_FILE_WR)))
5022 mds->mdlog->flush();
5023 }
5024
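// handle_client_setattr() above guards size changes twice: a grow is
// refused with ENOSPC while the filesystem is full, and a shrink queues a
// truncate (after waiting for any in-flight one). A minimal sketch of the
// grow/shrink decision, assuming plain integers in place of the projected
// inode fields:
static int setattr_size_policy(bool full, uint64_t cur_size,
                               uint64_t client_old_size, uint64_t new_size,
                               bool *truncate_smaller)
{
  // the effective old size is the larger of our projected size and the
  // size the client saw when it issued the request
  uint64_t old_size = std::max(cur_size, client_old_size);
  if (full && new_size > old_size)
    return -ENOSPC;                          // growing while full
  *truncate_smaller = new_size < old_size;   // shrink: journal a truncate
  return 0;
}
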
5025 /* Takes responsibility for mdr */
5026 void Server::do_open_truncate(MDRequestRef& mdr, int cmode)
5027 {
5028 CInode *in = mdr->in[0];
5029 client_t client = mdr->get_client();
5030 ceph_assert(in);
5031
5032 dout(10) << "do_open_truncate " << *in << dendl;
5033
5034 SnapRealm *realm = in->find_snaprealm();
5035 Capability *cap = mds->locker->issue_new_caps(in, cmode, mdr, realm);
5036
5037 mdr->ls = mdlog->get_current_segment();
5038 EUpdate *le = new EUpdate(mdlog, "open_truncate");
5039 mdlog->start_entry(le);
5040
5041 // prepare
5042 auto &pi = in->project_inode();
5043 pi.inode.version = in->pre_dirty();
5044 pi.inode.mtime = pi.inode.ctime = mdr->get_op_stamp();
5045 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5046 pi.inode.rstat.rctime = mdr->get_op_stamp();
5047 pi.inode.change_attr++;
5048
5049 uint64_t old_size = std::max<uint64_t>(pi.inode.size, mdr->client_request->head.args.open.old_size);
5050 if (old_size > 0) {
5051 pi.inode.truncate(old_size, 0);
5052 le->metablob.add_truncate_start(in->ino());
5053 }
5054
5055 bool changed_ranges = false;
5056 if (cap && (cmode & CEPH_FILE_MODE_WR)) {
5057 pi.inode.client_ranges[client].range.first = 0;
5058 pi.inode.client_ranges[client].range.last = pi.inode.get_layout_size_increment();
5059 pi.inode.client_ranges[client].follows = realm->get_newest_seq();
5060 changed_ranges = true;
5061 cap->mark_clientwriteable();
5062 }
5063
5064 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
5065
5066 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
5067 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
5068
5069 // make sure ino gets into the journal
5070 le->metablob.add_opened_ino(in->ino());
5071
5072 mdr->o_trunc = true;
5073
5074 CDentry *dn = 0;
5075 if (mdr->client_request->get_dentry_wanted()) {
5076 ceph_assert(mdr->dn[0].size());
5077 dn = mdr->dn[0].back();
5078 }
5079
5080 journal_and_reply(mdr, in, dn, le, new C_MDS_inode_update_finish(this, mdr, in, old_size > 0,
5081 changed_ranges));
5082 // Although the `open` part can give an early reply, the truncation won't
5083 // happen until our EUpdate is persistent; to give the client a prompt
5084 // response we must also flush that event.
5085 mdlog->flush();
5086 }
5087
5088
5089 /* This function cleans up the passed mdr */
5090 void Server::handle_client_setlayout(MDRequestRef& mdr)
5091 {
5092 const cref_t<MClientRequest> &req = mdr->client_request;
5093 CInode *cur = rdlock_path_pin_ref(mdr, true);
5094 if (!cur) return;
5095
5096 if (mdr->snapid != CEPH_NOSNAP) {
5097 respond_to_request(mdr, -EROFS);
5098 return;
5099 }
5100 if (!cur->is_file()) {
5101 respond_to_request(mdr, -EINVAL);
5102 return;
5103 }
5104 if (cur->get_projected_inode()->size ||
5105 cur->get_projected_inode()->truncate_seq > 1) {
5106 respond_to_request(mdr, -ENOTEMPTY);
5107 return;
5108 }
5109
5110 // validate layout
5111 file_layout_t layout = cur->get_projected_inode()->layout;
5112 // save existing layout for later
5113 const auto old_layout = layout;
5114
5115 int access = MAY_WRITE;
5116
5117 if (req->head.args.setlayout.layout.fl_object_size > 0)
5118 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5119 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5120 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5121 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5122 layout.stripe_count = req->head.args.setlayout.layout.fl_stripe_count;
5123 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5124 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5125
5126 // make sure we have as new a map as the client
5127 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5128 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5129 return;
5130 }
5131 }
5132
5133 // Don't permit layout modifications without 'p' caps
5134 if (layout != old_layout) {
5135 access |= MAY_SET_VXATTR;
5136 }
5137
5138 if (!layout.is_valid()) {
5139 dout(10) << "bad layout" << dendl;
5140 respond_to_request(mdr, -EINVAL);
5141 return;
5142 }
5143 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5144 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5145 respond_to_request(mdr, -EINVAL);
5146 return;
5147 }
5148
5149 MutationImpl::LockOpVec lov;
5150 lov.add_xlock(&cur->filelock);
5151 if (!mds->locker->acquire_locks(mdr, lov))
5152 return;
5153
5154 if (!check_access(mdr, cur, access))
5155 return;
5156
5157 // project update
5158 auto &pi = cur->project_inode();
5159 pi.inode.layout = layout;
5160 // add the old pool to the inode
5161 pi.inode.add_old_pool(old_layout.pool_id);
5162 pi.inode.version = cur->pre_dirty();
5163 pi.inode.ctime = mdr->get_op_stamp();
5164 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5165 pi.inode.rstat.rctime = mdr->get_op_stamp();
5166 pi.inode.change_attr++;
5167
5168 // log + wait
5169 mdr->ls = mdlog->get_current_segment();
5170 EUpdate *le = new EUpdate(mdlog, "setlayout");
5171 mdlog->start_entry(le);
5172 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5173 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5174 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5175
5176 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5177 }
5178
5179 bool Server::xlock_policylock(MDRequestRef& mdr, CInode *in, bool want_layout, bool xlock_snaplock)
5180 {
5181 if (mdr->locking_state & MutationImpl::ALL_LOCKED)
5182 return true;
5183
5184 MutationImpl::LockOpVec lov;
5185 lov.add_xlock(&in->policylock);
5186 if (xlock_snaplock)
5187 lov.add_xlock(&in->snaplock);
5188 else
5189 lov.add_rdlock(&in->snaplock);
5190 if (!mds->locker->acquire_locks(mdr, lov))
5191 return false;
5192
5193 if (want_layout && in->get_projected_inode()->has_layout()) {
5194 mdr->dir_layout = in->get_projected_inode()->layout;
5195 want_layout = false;
5196 }
5197 if (CDentry *pdn = in->get_projected_parent_dn(); pdn) {
5198 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 0, want_layout))
5199 return false;
5200 }
5201
5202 mdr->locking_state |= MutationImpl::ALL_LOCKED;
5203 return true;
5204 }
5205
5206 CInode* Server::try_get_auth_inode(MDRequestRef& mdr, inodeno_t ino)
5207 {
5208 CInode *in = mdcache->get_inode(ino);
5209 if (!in || in->state_test(CInode::STATE_PURGING)) {
5210 respond_to_request(mdr, -ESTALE);
5211 return nullptr;
5212 }
5213 if (!in->is_auth()) {
5214 mdcache->request_forward(mdr, in->authority().first);
5215 return nullptr;
5216 }
5217
5218 return in;
5219 }
5220
5221 void Server::handle_client_setdirlayout(MDRequestRef& mdr)
5222 {
5223 const cref_t<MClientRequest> &req = mdr->client_request;
5224
5225 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5226 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5227 if (!cur)
5228 return;
5229
5230 if (!cur->is_dir()) {
5231 respond_to_request(mdr, -ENOTDIR);
5232 return;
5233 }
5234
5235 if (!xlock_policylock(mdr, cur, true))
5236 return;
5237
5238 // validate layout
5239 const auto old_pi = cur->get_projected_inode();
5240 file_layout_t layout;
5241 if (old_pi->has_layout())
5242 layout = old_pi->layout;
5243 else if (mdr->dir_layout != file_layout_t())
5244 layout = mdr->dir_layout;
5245 else
5246 layout = mdcache->default_file_layout;
5247
5248 // Level of access required to complete
5249 int access = MAY_WRITE;
5250
5251 const auto old_layout = layout;
5252
5253 if (req->head.args.setlayout.layout.fl_object_size > 0)
5254 layout.object_size = req->head.args.setlayout.layout.fl_object_size;
5255 if (req->head.args.setlayout.layout.fl_stripe_unit > 0)
5256 layout.stripe_unit = req->head.args.setlayout.layout.fl_stripe_unit;
5257 if (req->head.args.setlayout.layout.fl_stripe_count > 0)
5258 layout.stripe_count = req->head.args.setlayout.layout.fl_stripe_count;
5259 if (req->head.args.setlayout.layout.fl_pg_pool > 0) {
5260 layout.pool_id = req->head.args.setlayout.layout.fl_pg_pool;
5261 // make sure we have as new a map as the client
5262 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
5263 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
5264 return;
5265 }
5266 }
5267
5268 if (layout != old_layout) {
5269 access |= MAY_SET_VXATTR;
5270 }
5271
5272 if (!layout.is_valid()) {
5273 dout(10) << "bad layout" << dendl;
5274 respond_to_request(mdr, -EINVAL);
5275 return;
5276 }
5277 if (!mds->mdsmap->is_data_pool(layout.pool_id)) {
5278 dout(10) << " invalid data pool " << layout.pool_id << dendl;
5279 respond_to_request(mdr, -EINVAL);
5280 return;
5281 }
5282
5283 if (!check_access(mdr, cur, access))
5284 return;
5285
5286 auto &pi = cur->project_inode();
5287 pi.inode.layout = layout;
5288 pi.inode.version = cur->pre_dirty();
5289
5290 // log + wait
5291 mdr->ls = mdlog->get_current_segment();
5292 EUpdate *le = new EUpdate(mdlog, "setlayout");
5293 mdlog->start_entry(le);
5294 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5295 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5296 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5297
5298 mdr->no_early_reply = true;
5299 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5300 }
5301
5302 // XATTRS
5303
5304 int Server::parse_layout_vxattr(string name, string value, const OSDMap& osdmap,
5305 file_layout_t *layout, bool validate)
5306 {
5307 dout(20) << "parse_layout_vxattr name " << name << " value '" << value << "'" << dendl;
5308 try {
5309 if (name == "layout") {
5310 string::iterator begin = value.begin();
5311 string::iterator end = value.end();
5312 keys_and_values<string::iterator> p; // create instance of parser
5313 std::map<string, string> m; // map to receive results
5314 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5315 return -EINVAL;
5316 }
5317 string left(begin, end);
5318 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5319 if (begin != end)
5320 return -EINVAL;
5321 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5322 // Skip validation on each attr; we do it once at the end (this avoids
5323 // rejecting intermediate states if the overall result is ok)
5324 int r = parse_layout_vxattr(string("layout.") + q->first, q->second,
5325 osdmap, layout, false);
5326 if (r < 0)
5327 return r;
5328 }
5329 } else if (name == "layout.object_size") {
5330 layout->object_size = boost::lexical_cast<unsigned>(value);
5331 } else if (name == "layout.stripe_unit") {
5332 layout->stripe_unit = boost::lexical_cast<unsigned>(value);
5333 } else if (name == "layout.stripe_count") {
5334 layout->stripe_count = boost::lexical_cast<unsigned>(value);
5335 } else if (name == "layout.pool") {
5336 try {
5337 layout->pool_id = boost::lexical_cast<unsigned>(value);
5338 } catch (boost::bad_lexical_cast const&) {
5339 int64_t pool = osdmap.lookup_pg_pool_name(value);
5340 if (pool < 0) {
5341 dout(10) << " unknown pool " << value << dendl;
5342 return -ENOENT;
5343 }
5344 layout->pool_id = pool;
5345 }
5346 } else if (name == "layout.pool_namespace") {
5347 layout->pool_ns = value;
5348 } else {
5349 dout(10) << " unknown layout vxattr " << name << dendl;
5350 return -EINVAL;
5351 }
5352 } catch (boost::bad_lexical_cast const&) {
5353 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5354 return -EINVAL;
5355 }
5356
5357 if (validate && !layout->is_valid()) {
5358 dout(10) << "bad layout" << dendl;
5359 return -EINVAL;
5360 }
5361 if (!mds->mdsmap->is_data_pool(layout->pool_id)) {
5362 dout(10) << " invalid data pool " << layout->pool_id << dendl;
5363 return -EINVAL;
5364 }
5365 return 0;
5366 }
5367
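// parse_layout_vxattr() above accepts either a single field
// ("layout.stripe_unit") or a combined "layout" value holding several
// key=value pairs, which the keys_and_values spirit grammar splits up.
// A rough sketch of that splitting for space-separated input such as
// "stripe_unit=1048576 stripe_count=2 pool=cephfs_data" (the real
// grammar is stricter, e.g. about which key characters it allows):
static bool parse_kv_pairs(const std::string& value,
                           std::map<std::string, std::string>* out)
{
  size_t pos = 0;
  while (pos < value.size()) {
    size_t sp = value.find(' ', pos);
    if (sp == std::string::npos)
      sp = value.size();
    std::string tok = value.substr(pos, sp - pos);
    if (!tok.empty()) {
      size_t eq = tok.find('=');
      if (eq == std::string::npos || eq == 0)
        return false;                        // malformed pair
      (*out)[tok.substr(0, eq)] = tok.substr(eq + 1);
    }
    pos = sp + 1;
  }
  return true;
}
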
5368 int Server::parse_quota_vxattr(string name, string value, quota_info_t *quota)
5369 {
5370 dout(20) << "parse_quota_vxattr name " << name << " value '" << value << "'" << dendl;
5371 try {
5372 if (name == "quota") {
5373 string::iterator begin = value.begin();
5374 string::iterator end = value.end();
5375 if (begin == end) {
5376 // keep quota unchanged. (for create_quota_realm())
5377 return 0;
5378 }
5379 keys_and_values<string::iterator> p; // create instance of parser
5380 std::map<string, string> m; // map to receive results
5381 if (!qi::parse(begin, end, p, m)) { // returns true if successful
5382 return -EINVAL;
5383 }
5384 string left(begin, end);
5385 dout(10) << " parsed " << m << " left '" << left << "'" << dendl;
5386 if (begin != end)
5387 return -EINVAL;
5388 for (map<string,string>::iterator q = m.begin(); q != m.end(); ++q) {
5389 int r = parse_quota_vxattr(string("quota.") + q->first, q->second, quota);
5390 if (r < 0)
5391 return r;
5392 }
5393 } else if (name == "quota.max_bytes") {
5394 int64_t q = boost::lexical_cast<int64_t>(value);
5395 if (q < 0)
5396 return -EINVAL;
5397 quota->max_bytes = q;
5398 } else if (name == "quota.max_files") {
5399 int64_t q = boost::lexical_cast<int64_t>(value);
5400 if (q < 0)
5401 return -EINVAL;
5402 quota->max_files = q;
5403 } else {
5404 dout(10) << " unknown quota vxattr " << name << dendl;
5405 return -EINVAL;
5406 }
5407 } catch (boost::bad_lexical_cast const&) {
5408 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5409 return -EINVAL;
5410 }
5411
5412 if (!quota->is_valid()) {
5413 dout(10) << "bad quota" << dendl;
5414 return -EINVAL;
5415 }
5416 return 0;
5417 }
5418
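// parse_quota_vxattr() above handles the values a client sets via, e.g.,
// `setfattr -n ceph.quota.max_bytes -v 104857600 /some/dir`. Both limits
// follow the same rule: parse as a signed integer, reject negatives, and
// treat 0 as "no limit". A sketch of that shared rule:
static int parse_quota_limit(const std::string& value, int64_t* out)
{
  int64_t q;
  try {
    q = boost::lexical_cast<int64_t>(value);
  } catch (boost::bad_lexical_cast const&) {
    return -EINVAL;    // not an integer at all
  }
  if (q < 0)
    return -EINVAL;    // quotas cannot be negative
  *out = q;            // 0 disables the limit
  return 0;
}
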
5419 void Server::create_quota_realm(CInode *in)
5420 {
5421 dout(10) << __func__ << " " << *in << dendl;
5422
5423 auto req = make_message<MClientRequest>(CEPH_MDS_OP_SETXATTR);
5424 req->set_filepath(filepath(in->ino()));
5425 req->set_string2("ceph.quota");
5426 // empty vxattr value
5427 req->set_tid(mds->issue_tid());
5428
5429 mds->send_message_mds(req, in->authority().first);
5430 }
5431
5432 /*
5433 * Verify that the file layout attribute carried by the client
5434 * is well-formed.
5435 * Return 0 on success, otherwise this function takes
5436 * responsibility for the passed mdr.
5437 */
5438 int Server::check_layout_vxattr(MDRequestRef& mdr,
5439 string name,
5440 string value,
5441 file_layout_t *layout)
5442 {
5443 const cref_t<MClientRequest> &req = mdr->client_request;
5444 epoch_t epoch;
5445 int r;
5446
5447 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5448 r = parse_layout_vxattr(name, value, osdmap, layout);
5449 epoch = osdmap.get_epoch();
5450 });
5451
5452 if (r == -ENOENT) {
5453
5454 // we don't have the specified pool; make sure our map
5455 // is at least as new as the client's.
5456 epoch_t req_epoch = req->get_osdmap_epoch();
5457
5458 if (req_epoch > epoch) {
5459
5460 // well, our map is older; wait for a newer osdmap.
5461 Context *fin = new C_IO_Wrapper(mds, new C_MDS_RetryRequest(mdcache, mdr));
5462
5463 if (!mds->objecter->wait_for_map(req_epoch, fin))
5464 return r; // wait, fin will retry this request later
5465
5466 delete fin;
5467
5468 // now we have at least as new a map as the client, try again.
5469 mds->objecter->with_osdmap([&](const OSDMap& osdmap) {
5470 r = parse_layout_vxattr(name, value, osdmap, layout);
5471 epoch = osdmap.get_epoch();
5472 });
5473
5474 ceph_assert(epoch >= req_epoch); // otherwise wait_for_map() told a lie
5475
5476 } else if (req_epoch == 0 && !mdr->waited_for_osdmap) {
5477
5478 // For compatibility with clients running old code, we still need to
5479 // get the latest map. Once COMPACT_VERSION of MClientRequest is >= 3,
5480 // this code can be removed.
5481 mdr->waited_for_osdmap = true;
5482 mds->objecter->wait_for_latest_osdmap(new C_IO_Wrapper(
5483 mds, new C_MDS_RetryRequest(mdcache, mdr)));
5484 return r;
5485 }
5486 }
5487
5488 if (r < 0) {
5489
5490 if (r == -ENOENT)
5491 r = -EINVAL;
5492
5493 respond_to_request(mdr, r);
5494 return r;
5495 }
5496
5497 // all is well
5498 return 0;
5499 }
5500
5501 void Server::handle_set_vxattr(MDRequestRef& mdr, CInode *cur)
5502 {
5503 const cref_t<MClientRequest> &req = mdr->client_request;
5504 string name(req->get_path2());
5505 bufferlist bl = req->get_data();
5506 string value (bl.c_str(), bl.length());
5507 dout(10) << "handle_set_vxattr " << name
5508 << " val " << value.length()
5509 << " bytes on " << *cur
5510 << dendl;
5511
5512 CInode::mempool_inode *pip = nullptr;
5513 string rest;
5514
5515 if (!check_access(mdr, cur, MAY_SET_VXATTR)) {
5516 return;
5517 }
5518
5519 bool new_realm = false;
5520 if (name.compare(0, 15, "ceph.dir.layout") == 0) {
5521 if (!cur->is_dir()) {
5522 respond_to_request(mdr, -EINVAL);
5523 return;
5524 }
5525
5526 if (!xlock_policylock(mdr, cur, true))
5527 return;
5528
5529 file_layout_t layout;
5530 if (cur->get_projected_inode()->has_layout())
5531 layout = cur->get_projected_inode()->layout;
5532 else if (mdr->dir_layout != file_layout_t())
5533 layout = mdr->dir_layout;
5534 else
5535 layout = mdcache->default_file_layout;
5536
5537 rest = name.substr(name.find("layout"));
5538 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5539 return;
5540
5541 auto &pi = cur->project_inode();
5542 pi.inode.layout = layout;
5543 mdr->no_early_reply = true;
5544 pip = &pi.inode;
5545 } else if (name.compare(0, 16, "ceph.file.layout") == 0) {
5546 if (!cur->is_file()) {
5547 respond_to_request(mdr, -EINVAL);
5548 return;
5549 }
5550 if (cur->get_projected_inode()->size ||
5551 cur->get_projected_inode()->truncate_seq > 1) {
5552 respond_to_request(mdr, -ENOTEMPTY);
5553 return;
5554 }
5555 file_layout_t layout = cur->get_projected_inode()->layout;
5556 rest = name.substr(name.find("layout"));
5557 if (check_layout_vxattr(mdr, rest, value, &layout) < 0)
5558 return;
5559
5560 MutationImpl::LockOpVec lov;
5561 lov.add_xlock(&cur->filelock);
5562 if (!mds->locker->acquire_locks(mdr, lov))
5563 return;
5564
5565 auto &pi = cur->project_inode();
5566 int64_t old_pool = pi.inode.layout.pool_id;
5567 pi.inode.add_old_pool(old_pool);
5568 pi.inode.layout = layout;
5569 pip = &pi.inode;
5570 } else if (name.compare(0, 10, "ceph.quota") == 0) {
5571 if (!cur->is_dir() || cur->is_root()) {
5572 respond_to_request(mdr, -EINVAL);
5573 return;
5574 }
5575
5576 quota_info_t quota = cur->get_projected_inode()->quota;
5577
5578 rest = name.substr(name.find("quota"));
5579 int r = parse_quota_vxattr(rest, value, &quota);
5580 if (r < 0) {
5581 respond_to_request(mdr, r);
5582 return;
5583 }
5584
5585 if (quota.is_enable() && !cur->get_projected_srnode())
5586 new_realm = true;
5587
5588 if (!xlock_policylock(mdr, cur, false, new_realm))
5589 return;
5590
5591 auto &pi = cur->project_inode(false, new_realm);
5592 pi.inode.quota = quota;
5593
5594 if (new_realm) {
5595 SnapRealm *realm = cur->find_snaprealm();
5596 auto seq = realm->get_newest_seq();
5597 auto &newsnap = *pi.snapnode;
5598 newsnap.created = seq;
5599 newsnap.seq = seq;
5600 }
5601 mdr->no_early_reply = true;
5602 pip = &pi.inode;
5603
5604 client_t exclude_ct = mdr->get_client();
5605 mdcache->broadcast_quota_to_client(cur, exclude_ct, true);
5606 } else if (name == "ceph.dir.pin"sv) {
5607 if (!cur->is_dir() || cur->is_root()) {
5608 respond_to_request(mdr, -EINVAL);
5609 return;
5610 }
5611
5612 mds_rank_t rank;
5613 try {
5614 rank = boost::lexical_cast<mds_rank_t>(value);
5615 if (rank < 0) rank = MDS_RANK_NONE;
5616 } catch (boost::bad_lexical_cast const&) {
5617 dout(10) << "bad vxattr value, unable to parse int for " << name << dendl;
5618 respond_to_request(mdr, -EINVAL);
5619 return;
5620 }
5621
5622 if (!xlock_policylock(mdr, cur))
5623 return;
5624
5625 auto &pi = cur->project_inode();
5626 cur->set_export_pin(rank);
5627 pip = &pi.inode;
5628 } else if (name == "ceph.dir.pin.random"sv) {
5629 if (!cur->is_dir() || cur->is_root()) {
5630 respond_to_request(mdr, -EINVAL);
5631 return;
5632 }
5633
5634 double val;
5635 try {
5636 val = boost::lexical_cast<double>(value);
5637 } catch (boost::bad_lexical_cast const&) {
5638 dout(10) << "bad vxattr value, unable to parse float for " << name << dendl;
5639 respond_to_request(mdr, -EINVAL);
5640 return;
5641 }
5642
5643 if (val < 0.0 || 1.0 < val) {
5644 respond_to_request(mdr, -EDOM);
5645 return;
5646 } else if (mdcache->export_ephemeral_random_max < val) {
5647 respond_to_request(mdr, -EINVAL);
5648 return;
5649 }
5650
5651 if (!xlock_policylock(mdr, cur))
5652 return;
5653
5654 auto &pi = cur->project_inode();
5655 cur->setxattr_ephemeral_rand(val);
5656 pip = &pi.inode;
5657 } else if (name == "ceph.dir.pin.distributed"sv) {
5658 if (!cur->is_dir() || cur->is_root()) {
5659 respond_to_request(mdr, -EINVAL);
5660 return;
5661 }
5662
5663 bool val;
5664 try {
5665 val = boost::lexical_cast<bool>(value);
5666 } catch (boost::bad_lexical_cast const&) {
5667 dout(10) << "bad vxattr value, unable to parse bool for " << name << dendl;
5668 respond_to_request(mdr, -EINVAL);
5669 return;
5670 }
5671
5672 if (!xlock_policylock(mdr, cur))
5673 return;
5674
5675 auto &pi = cur->project_inode();
5676 cur->setxattr_ephemeral_dist(val);
5677 pip = &pi.inode;
5678 } else {
5679 dout(10) << " unknown vxattr " << name << dendl;
5680 respond_to_request(mdr, -EINVAL);
5681 return;
5682 }
5683
5684 pip->change_attr++;
5685 pip->ctime = mdr->get_op_stamp();
5686 if (mdr->get_op_stamp() > pip->rstat.rctime)
5687 pip->rstat.rctime = mdr->get_op_stamp();
5688 pip->version = cur->pre_dirty();
5689 if (cur->is_file())
5690 pip->update_backtrace();
5691
5692 // log + wait
5693 mdr->ls = mdlog->get_current_segment();
5694 EUpdate *le = new EUpdate(mdlog, "set vxattr layout");
5695 mdlog->start_entry(le);
5696 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5697 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5698 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5699
5700 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur,
5701 false, false, new_realm));
5702 return;
5703 }
5704
5705 void Server::handle_remove_vxattr(MDRequestRef& mdr, CInode *cur)
5706 {
5707 const cref_t<MClientRequest> &req = mdr->client_request;
5708 string name(req->get_path2());
5709
5710 dout(10) << __func__ << " " << name << " on " << *cur << dendl;
5711
5712 if (name == "ceph.dir.layout") {
5713 if (!cur->is_dir()) {
5714 respond_to_request(mdr, -ENODATA);
5715 return;
5716 }
5717 if (cur->is_root()) {
5718 dout(10) << "can't remove layout policy on the root directory" << dendl;
5719 respond_to_request(mdr, -EINVAL);
5720 return;
5721 }
5722
5723 if (!cur->get_projected_inode()->has_layout()) {
5724 respond_to_request(mdr, -ENODATA);
5725 return;
5726 }
5727
5728 MutationImpl::LockOpVec lov;
5729 lov.add_xlock(&cur->policylock);
5730 if (!mds->locker->acquire_locks(mdr, lov))
5731 return;
5732
5733 auto &pi = cur->project_inode();
5734 pi.inode.clear_layout();
5735 pi.inode.version = cur->pre_dirty();
5736
5737 // log + wait
5738 mdr->ls = mdlog->get_current_segment();
5739 EUpdate *le = new EUpdate(mdlog, "remove dir layout vxattr");
5740 mdlog->start_entry(le);
5741 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5742 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5743 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5744
5745 mdr->no_early_reply = true;
5746 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5747 return;
5748 } else if (name == "ceph.dir.layout.pool_namespace"
5749 || name == "ceph.file.layout.pool_namespace") {
5750 // Namespace is the only layout field that has a meaningful
5751 // null/none value (empty string, meaning the default layout). Removing
5752 // it is equivalent to a setxattr with an empty string: pass the empty
5753 // payload of the rmxattr request through to do this.
5754 handle_set_vxattr(mdr, cur);
5755 return;
5756 }
5757
5758 respond_to_request(mdr, -ENODATA);
5759 }
5760
5761 class C_MDS_inode_xattr_update_finish : public ServerLogContext {
5762 CInode *in;
5763 public:
5764
5765 C_MDS_inode_xattr_update_finish(Server *s, MDRequestRef& r, CInode *i) :
5766 ServerLogContext(s, r), in(i) { }
5767 void finish(int r) override {
5768 ceph_assert(r == 0);
5769
5770 // apply
5771 in->pop_and_dirty_projected_inode(mdr->ls);
5772
5773 mdr->apply();
5774
5775 get_mds()->balancer->hit_inode(in, META_POP_IWR);
5776
5777 server->respond_to_request(mdr, 0);
5778 }
5779 };
5780
5781 void Server::handle_client_setxattr(MDRequestRef& mdr)
5782 {
5783 const cref_t<MClientRequest> &req = mdr->client_request;
5784 string name(req->get_path2());
5785
5786 // magic ceph.* namespace?
5787 if (name.compare(0, 5, "ceph.") == 0) {
5788 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5789 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5790 if (!cur)
5791 return;
5792
5793 handle_set_vxattr(mdr, cur);
5794 return;
5795 }
5796
5797 CInode *cur = rdlock_path_pin_ref(mdr, true);
5798 if (!cur)
5799 return;
5800
5801 if (mdr->snapid != CEPH_NOSNAP) {
5802 respond_to_request(mdr, -EROFS);
5803 return;
5804 }
5805
5806 int flags = req->head.args.setxattr.flags;
5807
5808 MutationImpl::LockOpVec lov;
5809 lov.add_xlock(&cur->xattrlock);
5810 if (!mds->locker->acquire_locks(mdr, lov))
5811 return;
5812
5813 if (!check_access(mdr, cur, MAY_WRITE))
5814 return;
5815
5816 auto pxattrs = cur->get_projected_xattrs();
5817 size_t len = req->get_data().length();
5818 size_t inc = len + name.length();
5819
5820 // check xattrs kv pairs size
5821 size_t cur_xattrs_size = 0;
5822 for (const auto& p : *pxattrs) {
5823 if ((flags & CEPH_XATTR_REPLACE) && (name.compare(p.first) == 0)) {
5824 continue;
5825 }
5826 cur_xattrs_size += p.first.length() + p.second.length();
5827 }
5828
5829 if (cur_xattrs_size + inc > g_conf()->mds_max_xattr_pairs_size) {
5830 dout(10) << "xattr kv pairs size too big. cur_xattrs_size "
5831 << cur_xattrs_size << ", inc " << inc << dendl;
5832 respond_to_request(mdr, -ENOSPC);
5833 return;
5834 }
5835
5836 if ((flags & CEPH_XATTR_CREATE) && pxattrs->count(mempool::mds_co::string(name))) {
5837 dout(10) << "setxattr '" << name << "' XATTR_CREATE and EEXIST on " << *cur << dendl;
5838 respond_to_request(mdr, -EEXIST);
5839 return;
5840 }
5841 if ((flags & CEPH_XATTR_REPLACE) && !pxattrs->count(mempool::mds_co::string(name))) {
5842 dout(10) << "setxattr '" << name << "' XATTR_REPLACE and ENODATA on " << *cur << dendl;
5843 respond_to_request(mdr, -ENODATA);
5844 return;
5845 }
5846
5847 dout(10) << "setxattr '" << name << "' len " << len << " on " << *cur << dendl;
5848
5849 // project update
5850 auto &pi = cur->project_inode(true);
5851 pi.inode.version = cur->pre_dirty();
5852 pi.inode.ctime = mdr->get_op_stamp();
5853 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5854 pi.inode.rstat.rctime = mdr->get_op_stamp();
5855 pi.inode.change_attr++;
5856 pi.inode.xattr_version++;
5857 auto &px = *pi.xattrs;
5858 if ((flags & CEPH_XATTR_REMOVE)) {
5859 px.erase(mempool::mds_co::string(name));
5860 } else {
5861 bufferptr b = buffer::create(len);
5862 if (len)
5863 req->get_data().begin().copy(len, b.c_str());
5864 auto em = px.emplace(std::piecewise_construct, std::forward_as_tuple(mempool::mds_co::string(name)), std::forward_as_tuple(b));
5865 if (!em.second)
5866 em.first->second = b;
5867 }
5868
5869 // log + wait
5870 mdr->ls = mdlog->get_current_segment();
5871 EUpdate *le = new EUpdate(mdlog, "setxattr");
5872 mdlog->start_entry(le);
5873 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5874 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5875 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5876
5877 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5878 }
5879
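// handle_client_setxattr() above charges every projected xattr its
// key+value bytes, skips the entry being replaced so it is not counted
// twice, and rejects the request with ENOSPC if the new pair would push
// the total over mds_max_xattr_pairs_size. A sketch of the accounting,
// with a plain std::map standing in for the mempool-backed xattr map:
static bool xattr_budget_ok(const std::map<std::string, std::string>& xattrs,
                            const std::string& name, size_t value_len,
                            bool replace, size_t limit)
{
  size_t used = 0;
  for (const auto& p : xattrs) {
    if (replace && p.first == name)
      continue;                  // about to be overwritten; don't count it
    used += p.first.size() + p.second.size();
  }
  return used + name.size() + value_len <= limit;
}
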
5880 void Server::handle_client_removexattr(MDRequestRef& mdr)
5881 {
5882 const cref_t<MClientRequest> &req = mdr->client_request;
5883 std::string name(req->get_path2());
5884
5885 if (name.compare(0, 5, "ceph.") == 0) {
5886 // can't use rdlock_path_pin_ref because we need to xlock snaplock/policylock
5887 CInode *cur = try_get_auth_inode(mdr, req->get_filepath().get_ino());
5888 if (!cur)
5889 return;
5890
5891 handle_remove_vxattr(mdr, cur);
5892 return;
5893 }
5894
5895 CInode* cur = rdlock_path_pin_ref(mdr, true);
5896 if (!cur)
5897 return;
5898
5899 if (mdr->snapid != CEPH_NOSNAP) {
5900 respond_to_request(mdr, -EROFS);
5901 return;
5902 }
5903
5904 MutationImpl::LockOpVec lov;
5905 lov.add_xlock(&cur->xattrlock);
5906 if (!mds->locker->acquire_locks(mdr, lov))
5907 return;
5908
5909 auto pxattrs = cur->get_projected_xattrs();
5910 if (pxattrs->count(mempool::mds_co::string(name)) == 0) {
5911 dout(10) << "removexattr '" << name << "' and ENODATA on " << *cur << dendl;
5912 respond_to_request(mdr, -ENODATA);
5913 return;
5914 }
5915
5916 dout(10) << "removexattr '" << name << "' on " << *cur << dendl;
5917
5918 // project update
5919 auto &pi = cur->project_inode(true);
5920 auto &px = *pi.xattrs;
5921 pi.inode.version = cur->pre_dirty();
5922 pi.inode.ctime = mdr->get_op_stamp();
5923 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
5924 pi.inode.rstat.rctime = mdr->get_op_stamp();
5925 pi.inode.change_attr++;
5926 pi.inode.xattr_version++;
5927 px.erase(mempool::mds_co::string(name));
5928
5929 // log + wait
5930 mdr->ls = mdlog->get_current_segment();
5931 EUpdate *le = new EUpdate(mdlog, "removexattr");
5932 mdlog->start_entry(le);
5933 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
5934 mdcache->predirty_journal_parents(mdr, &le->metablob, cur, 0, PREDIRTY_PRIMARY);
5935 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, cur);
5936
5937 journal_and_reply(mdr, cur, 0, le, new C_MDS_inode_update_finish(this, mdr, cur));
5938 }
5939
5940
5941 // =================================================================
5942 // DIRECTORY and NAMESPACE OPS
5943
5944
5945 // ------------------------------------------------
5946
5947 // MKNOD
5948
5949 class C_MDS_mknod_finish : public ServerLogContext {
5950 CDentry *dn;
5951 CInode *newi;
5952 public:
5953 C_MDS_mknod_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ni) :
5954 ServerLogContext(s, r), dn(d), newi(ni) {}
5955 void finish(int r) override {
5956 ceph_assert(r == 0);
5957
5958 // link the inode
5959 dn->pop_projected_linkage();
5960
5961 // be a bit hacky with the inode version here: we decrement it
5962 // just to make mark_dirty() happy. (we didn't bother projecting
5963 // a new version of the inode since it's just been created)
5964 newi->inode.version--;
5965 newi->mark_dirty(newi->inode.version + 1, mdr->ls);
5966 newi->mark_dirty_parent(mdr->ls, true);
5967
5968 // mkdir?
5969 if (newi->inode.is_dir()) {
5970 CDir *dir = newi->get_dirfrag(frag_t());
5971 ceph_assert(dir);
5972 dir->fnode.version--;
5973 dir->mark_dirty(dir->fnode.version + 1, mdr->ls);
5974 dir->mark_new(mdr->ls);
5975 }
5976
5977 mdr->apply();
5978
5979 MDRequestRef null_ref;
5980 get_mds()->mdcache->send_dentry_link(dn, null_ref);
5981
5982 if (newi->inode.is_file()) {
5983 get_mds()->locker->share_inode_max_size(newi);
5984 } else if (newi->inode.is_dir()) {
5985 // We do this now so that the linkages on the new directory are stable.
5986 newi->maybe_ephemeral_dist();
5987 newi->maybe_ephemeral_rand(true);
5988 }
5989
5990 // hit pop
5991 get_mds()->balancer->hit_inode(newi, META_POP_IWR);
5992
5993 // reply
5994 server->respond_to_request(mdr, 0);
5995 }
5996 };
5997
5998
5999 void Server::handle_client_mknod(MDRequestRef& mdr)
6000 {
6001 const cref_t<MClientRequest> &req = mdr->client_request;
6002 client_t client = mdr->get_client();
6003
6004 unsigned mode = req->head.args.mknod.mode;
6005 if ((mode & S_IFMT) == 0)
6006 mode |= S_IFREG;
6007
6008 mdr->disable_lock_cache();
6009 CDentry *dn = rdlock_path_xlock_dentry(mdr, true, false, S_ISREG(mode));
6010 if (!dn)
6011 return;
6012
6013 CDir *dir = dn->get_dir();
6014 CInode *diri = dir->get_inode();
6015 if (!check_access(mdr, diri, MAY_WRITE))
6016 return;
6017 if (!check_fragment_space(mdr, dn->get_dir()))
6018 return;
6019
6020 // set layout
6021 file_layout_t layout;
6022 if (mdr->dir_layout != file_layout_t())
6023 layout = mdr->dir_layout;
6024 else
6025 layout = mdcache->default_file_layout;
6026
6027 CInode *newi = prepare_new_inode(mdr, dn->get_dir(), inodeno_t(req->head.ino), mode, &layout);
6028 ceph_assert(newi);
6029
6030 dn->push_projected_linkage(newi);
6031
6032 newi->inode.rdev = req->head.args.mknod.rdev;
6033 newi->inode.version = dn->pre_dirty();
6034 newi->inode.rstat.rfiles = 1;
6035 if (layout.pool_id != mdcache->default_file_layout.pool_id)
6036 newi->inode.add_old_pool(mdcache->default_file_layout.pool_id);
6037 newi->inode.update_backtrace();
6038
6039 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6040 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6041 ceph_assert(follows >= realm->get_newest_seq());
6042
6043 // if the client created a _regular_ file via MKNOD, it's highly likely they'll
6044 // want to write to it (e.g., if they are reexporting NFS)
6045 if (S_ISREG(newi->inode.mode)) {
6046 // issue a cap on the file
6047 int cmode = CEPH_FILE_MODE_RDWR;
6048 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6049 if (cap) {
6050 cap->set_wanted(0);
6051
6052 // put locks in excl mode
6053 newi->filelock.set_state(LOCK_EXCL);
6054 newi->authlock.set_state(LOCK_EXCL);
6055 newi->xattrlock.set_state(LOCK_EXCL);
6056
6057 dout(15) << " setting a client_range too, since this is a regular file" << dendl;
6058 newi->inode.client_ranges[client].range.first = 0;
6059 newi->inode.client_ranges[client].range.last = newi->inode.layout.stripe_unit;
6060 newi->inode.client_ranges[client].follows = follows;
6061 cap->mark_clientwriteable();
6062 }
6063 }
6064
6065 ceph_assert(dn->first == follows + 1);
6066 newi->first = dn->first;
6067
6068 dout(10) << "mknod mode " << newi->inode.mode << " rdev " << newi->inode.rdev << dendl;
6069
6070 // prepare finisher
6071 mdr->ls = mdlog->get_current_segment();
6072 EUpdate *le = new EUpdate(mdlog, "mknod");
6073 mdlog->start_entry(le);
6074 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6075 journal_allocated_inos(mdr, &le->metablob);
6076
6077 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(),
6078 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6079 le->metablob.add_primary_dentry(dn, newi, true, true, true);
6080
6081 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6082 mds->balancer->maybe_fragment(dn->get_dir(), false);
6083 }
6084
6085
6086
6087 // MKDIR
6088 /* This function takes responsibility for the passed mdr*/
6089 void Server::handle_client_mkdir(MDRequestRef& mdr)
6090 {
6091 const cref_t<MClientRequest> &req = mdr->client_request;
6092
6093 mdr->disable_lock_cache();
6094 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6095 if (!dn)
6096 return;
6097
6098 CDir *dir = dn->get_dir();
6099 CInode *diri = dir->get_inode();
6100
6101 // mkdir check access
6102 if (!check_access(mdr, diri, MAY_WRITE))
6103 return;
6104
6105 if (!check_fragment_space(mdr, dir))
6106 return;
6107
6108 // new inode
6109 unsigned mode = req->head.args.mkdir.mode;
6110 mode &= ~S_IFMT;
6111 mode |= S_IFDIR;
6112 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6113 ceph_assert(newi);
6114
6115 // it's a directory.
6116 dn->push_projected_linkage(newi);
6117
6118 newi->inode.version = dn->pre_dirty();
6119 newi->inode.rstat.rsubdirs = 1;
6120 newi->inode.update_backtrace();
6121
6122 snapid_t follows = mdcache->get_global_snaprealm()->get_newest_seq();
6123 SnapRealm *realm = dn->get_dir()->inode->find_snaprealm();
6124 ceph_assert(follows >= realm->get_newest_seq());
6125
6126 dout(12) << " follows " << follows << dendl;
6127 ceph_assert(dn->first == follows + 1);
6128 newi->first = dn->first;
6129
6130 // ...and that new dir is empty.
6131 CDir *newdir = newi->get_or_open_dirfrag(mdcache, frag_t());
6132 newdir->state_set(CDir::STATE_CREATING);
6133 newdir->mark_complete();
6134 newdir->fnode.version = newdir->pre_dirty();
6135
6136 // prepare finisher
6137 mdr->ls = mdlog->get_current_segment();
6138 EUpdate *le = new EUpdate(mdlog, "mkdir");
6139 mdlog->start_entry(le);
6140 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6141 journal_allocated_inos(mdr, &le->metablob);
6142 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6143 le->metablob.add_primary_dentry(dn, newi, true, true);
6144 le->metablob.add_new_dir(newdir); // dirty AND complete AND new
6145
6146 // issue a cap on the directory
6147 int cmode = CEPH_FILE_MODE_RDWR;
6148 Capability *cap = mds->locker->issue_new_caps(newi, cmode, mdr, realm);
6149 if (cap) {
6150 cap->set_wanted(0);
6151
6152 // put locks in excl mode
6153 newi->filelock.set_state(LOCK_EXCL);
6154 newi->authlock.set_state(LOCK_EXCL);
6155 newi->xattrlock.set_state(LOCK_EXCL);
6156 }
6157
6158 // make sure this inode gets into the journal
6159 le->metablob.add_opened_ino(newi->ino());
6160
6161 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6162
6163 // We hit_dir (via hit_inode) in our finish callback, but by then we might
6164 // have overshot the split size (multiple mkdir in flight), so here is
6165 // an early chance to split the dir if this mkdir makes it oversized.
6166 mds->balancer->maybe_fragment(dir, false);
6167 }
6168
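// Both creation paths above normalize the client-supplied mode before
// building the new inode: mknod defaults a missing file type to a regular
// file, while mkdir overrides whatever type bits were sent. A sketch of
// the two rules (hypothetical helpers, not MDS code):
static unsigned normalize_mknod_mode(unsigned mode)
{
  if ((mode & S_IFMT) == 0)
    mode |= S_IFREG;                   // no type given: assume regular file
  return mode;
}

static unsigned normalize_mkdir_mode(unsigned mode)
{
  return (mode & ~S_IFMT) | S_IFDIR;   // always a directory
}
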
6169
6170 // SYMLINK
6171
6172 void Server::handle_client_symlink(MDRequestRef& mdr)
6173 {
6174 mdr->disable_lock_cache();
6175 CDentry *dn = rdlock_path_xlock_dentry(mdr, true);
6176 if (!dn)
6177 return;
6178
6179 CDir *dir = dn->get_dir();
6180 CInode *diri = dir->get_inode();
6181
6182 if (!check_access(mdr, diri, MAY_WRITE))
6183 return;
6184 if (!check_fragment_space(mdr, dir))
6185 return;
6186
6187 const cref_t<MClientRequest> &req = mdr->client_request;
6188
6189 unsigned mode = S_IFLNK | 0777;
6190 CInode *newi = prepare_new_inode(mdr, dir, inodeno_t(req->head.ino), mode);
6191 ceph_assert(newi);
6192
6193 // it's a symlink
6194 dn->push_projected_linkage(newi);
6195
6196 newi->symlink = req->get_path2();
6197 newi->inode.size = newi->symlink.length();
6198 newi->inode.rstat.rbytes = newi->inode.size;
6199 newi->inode.rstat.rfiles = 1;
6200 newi->inode.version = dn->pre_dirty();
6201 newi->inode.update_backtrace();
6202
6203 newi->first = dn->first;
6204
6205 // prepare finisher
6206 mdr->ls = mdlog->get_current_segment();
6207 EUpdate *le = new EUpdate(mdlog, "symlink");
6208 mdlog->start_entry(le);
6209 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
6210 journal_allocated_inos(mdr, &le->metablob);
6211 mdcache->predirty_journal_parents(mdr, &le->metablob, newi, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
6212 le->metablob.add_primary_dentry(dn, newi, true, true);
6213
6214 journal_and_reply(mdr, newi, dn, le, new C_MDS_mknod_finish(this, mdr, dn, newi));
6215 mds->balancer->maybe_fragment(dir, false);
6216 }
6217
6218
6219
6220
6221
6222 // LINK
6223
6224 void Server::handle_client_link(MDRequestRef& mdr)
6225 {
6226 const cref_t<MClientRequest> &req = mdr->client_request;
6227
6228 dout(7) << "handle_client_link " << req->get_filepath()
6229 << " to " << req->get_filepath2()
6230 << dendl;
6231
6232 mdr->disable_lock_cache();
6233
6234 CDentry *destdn;
6235 CInode *targeti;
6236
6237 if (req->get_filepath2().depth() == 0) {
6238 targeti = mdcache->get_inode(req->get_filepath2().get_ino());
6239 if (!targeti) {
6240 dout(10) << "ESTALE on path2, attempting recovery" << dendl;
6241 mdcache->find_ino_peers(req->get_filepath2().get_ino(), new C_MDS_TryFindInode(this, mdr));
6242 return;
6243 }
6244 mdr->pin(targeti);
6245
6246 if (!(mdr->locking_state & MutationImpl::SNAP2_LOCKED)) {
6247 CDentry *pdn = targeti->get_projected_parent_dn();
6248 if (!pdn) {
6249 dout(7) << "target has no parent dn, failing..." << dendl;
6250 respond_to_request(mdr, -EINVAL);
6251 return;
6252 }
6253 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr, 1))
6254 return;
6255 mdr->locking_state |= MutationImpl::SNAP2_LOCKED;
6256 }
6257
6258 destdn = rdlock_path_xlock_dentry(mdr, false);
6259 if (!destdn)
6260 return;
6261
6262 } else {
6263 auto ret = rdlock_two_paths_xlock_destdn(mdr, false);
6264 destdn = ret.first;
6265 if (!destdn)
6266 return;
6267
6268 if (!destdn->get_projected_linkage()->is_null()) {
6269 respond_to_request(mdr, -EEXIST);
6270 return;
6271 }
6272
6273 targeti = ret.second->get_projected_linkage()->get_inode();
6274 }
6275
6276 if (targeti->is_dir()) {
6277 dout(7) << "target is a dir, failing..." << dendl;
6278 respond_to_request(mdr, -EINVAL);
6279 return;
6280 }
6281
6282 CDir *dir = destdn->get_dir();
6283 dout(7) << "handle_client_link link " << destdn->get_name() << " in " << *dir << dendl;
6284 dout(7) << "target is " << *targeti << dendl;
6285
6286 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6287 MutationImpl::LockOpVec lov;
6288 lov.add_xlock(&targeti->snaplock);
6289 lov.add_xlock(&targeti->linklock);
6290
6291 if (!mds->locker->acquire_locks(mdr, lov))
6292 return;
6293
6294 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6295 }
6296
6297 if (targeti->get_projected_inode()->nlink == 0) {
6298 dout(7) << "target has no link, failing..." << dendl;
6299 respond_to_request(mdr, -ENOENT);
return; // fix: without this we would fall through and link an unlinked inode
6300 }
6301
6302 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6303 if (!check_access(mdr, targeti, MAY_WRITE))
6304 return;
6305
6306 if (!check_access(mdr, dir->get_inode(), MAY_WRITE))
6307 return;
6308
6309 if (!check_fragment_space(mdr, dir))
6310 return;
6311 }
6312
6313 // go!
6314 ceph_assert(g_conf()->mds_kill_link_at != 1);
6315
6316 // local or remote?
6317 if (targeti->is_auth())
6318 _link_local(mdr, destdn, targeti);
6319 else
6320 _link_remote(mdr, true, destdn, targeti);
6321 mds->balancer->maybe_fragment(dir, false);
6322 }
6323
6324
6325 class C_MDS_link_local_finish : public ServerLogContext {
6326 CDentry *dn;
6327 CInode *targeti;
6328 version_t dnpv;
6329 version_t tipv;
6330 bool adjust_realm;
6331 public:
6332 C_MDS_link_local_finish(Server *s, MDRequestRef& r, CDentry *d, CInode *ti,
6333 version_t dnpv_, version_t tipv_, bool ar) :
6334 ServerLogContext(s, r), dn(d), targeti(ti),
6335 dnpv(dnpv_), tipv(tipv_), adjust_realm(ar) { }
6336 void finish(int r) override {
6337 ceph_assert(r == 0);
6338 server->_link_local_finish(mdr, dn, targeti, dnpv, tipv, adjust_realm);
6339 }
6340 };
6341
6342
6343 void Server::_link_local(MDRequestRef& mdr, CDentry *dn, CInode *targeti)
6344 {
6345 dout(10) << "_link_local " << *dn << " to " << *targeti << dendl;
6346
6347 mdr->ls = mdlog->get_current_segment();
6348
6349 // predirty NEW dentry
6350 version_t dnpv = dn->pre_dirty();
6351 version_t tipv = targeti->pre_dirty();
6352
6353 // project inode update
6354 auto &pi = targeti->project_inode();
6355 pi.inode.nlink++;
6356 pi.inode.ctime = mdr->get_op_stamp();
6357 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
6358 pi.inode.rstat.rctime = mdr->get_op_stamp();
6359 pi.inode.change_attr++;
6360 pi.inode.version = tipv;
6361
6362 bool adjust_realm = false;
6363 if (!targeti->is_projected_snaprealm_global()) {
6364 sr_t *newsnap = targeti->project_snaprealm();
6365 targeti->mark_snaprealm_global(newsnap);
6366 targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
6367 adjust_realm = true;
6368 }
6369
6370 // log + wait
6371 EUpdate *le = new EUpdate(mdlog, "link_local");
6372 mdlog->start_entry(le);
6373 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6374 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1); // new dn
6375 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, 0, PREDIRTY_PRIMARY); // targeti
6376 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6377 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, targeti);
6378
6379 // do this after predirty_*, to avoid funky extra dnl arg
6380 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6381
6382 journal_and_reply(mdr, targeti, dn, le,
6383 new C_MDS_link_local_finish(this, mdr, dn, targeti, dnpv, tipv, adjust_realm));
6384 }
6385
6386 void Server::_link_local_finish(MDRequestRef& mdr, CDentry *dn, CInode *targeti,
6387 version_t dnpv, version_t tipv, bool adjust_realm)
6388 {
6389 dout(10) << "_link_local_finish " << *dn << " to " << *targeti << dendl;
6390
6391 // link and unlock the NEW dentry
6392 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6393 if (!dnl->get_inode())
6394 dn->link_remote(dnl, targeti);
6395 dn->mark_dirty(dnpv, mdr->ls);
6396
6397 // target inode
6398 targeti->pop_and_dirty_projected_inode(mdr->ls);
6399
6400 mdr->apply();
6401
6402 MDRequestRef null_ref;
6403 mdcache->send_dentry_link(dn, null_ref);
6404
6405 if (adjust_realm) {
6406 int op = CEPH_SNAP_OP_SPLIT;
6407 mds->mdcache->send_snap_update(targeti, 0, op);
6408 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6409 }
6410
6411 // bump target popularity
6412 mds->balancer->hit_inode(targeti, META_POP_IWR);
6413 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6414
6415 // reply
6416 respond_to_request(mdr, 0);
6417 }
6418
6419
6420 // link / unlink remote
6421
6422 class C_MDS_link_remote_finish : public ServerLogContext {
6423 bool inc;
6424 CDentry *dn;
6425 CInode *targeti;
6426 version_t dpv;
6427 public:
6428 C_MDS_link_remote_finish(Server *s, MDRequestRef& r, bool i, CDentry *d, CInode *ti) :
6429 ServerLogContext(s, r), inc(i), dn(d), targeti(ti),
6430 dpv(d->get_projected_version()) {}
6431 void finish(int r) override {
6432 ceph_assert(r == 0);
6433 server->_link_remote_finish(mdr, inc, dn, targeti, dpv);
6434 }
6435 };
6436
6437 void Server::_link_remote(MDRequestRef& mdr, bool inc, CDentry *dn, CInode *targeti)
6438 {
6439 dout(10) << "_link_remote "
6440 << (inc ? "link ":"unlink ")
6441 << *dn << " to " << *targeti << dendl;
6442
6443 // 1. send LinkPrepare to dest (journal nlink++ prepare)
6444 mds_rank_t linkauth = targeti->authority().first;
6445 if (mdr->more()->witnessed.count(linkauth) == 0) {
6446 if (mds->is_cluster_degraded() &&
6447 !mds->mdsmap->is_clientreplay_or_active_or_stopping(linkauth)) {
6448 dout(10) << " targeti auth mds." << linkauth << " is not active" << dendl;
6449 if (mdr->more()->waiting_on_slave.empty())
6450 mds->wait_for_active_peer(linkauth, new C_MDS_RetryRequest(mdcache, mdr));
6451 return;
6452 }
6453
6454 dout(10) << " targeti auth must prepare nlink++/--" << dendl;
6455 int op;
6456 if (inc)
6457 op = MMDSSlaveRequest::OP_LINKPREP;
6458 else
6459 op = MMDSSlaveRequest::OP_UNLINKPREP;
6460 auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, op);
6461 targeti->set_object_info(req->get_object_info());
6462 req->op_stamp = mdr->get_op_stamp();
6463 if (auto& desti_srnode = mdr->more()->desti_srnode)
6464 encode(*desti_srnode, req->desti_snapbl);
6465 mds->send_message_mds(req, linkauth);
6466
6467 ceph_assert(mdr->more()->waiting_on_slave.count(linkauth) == 0);
6468 mdr->more()->waiting_on_slave.insert(linkauth);
6469 return;
6470 }
6471 dout(10) << " targeti auth has prepared nlink++/--" << dendl;
6472
6473 ceph_assert(g_conf()->mds_kill_link_at != 2);
6474
6475 if (auto& desti_srnode = mdr->more()->desti_srnode) {
6476 delete desti_srnode;
6477 desti_srnode = NULL;
6478 }
6479
6480 mdr->set_mds_stamp(ceph_clock_now());
6481
6482 // add to event
6483 mdr->ls = mdlog->get_current_segment();
6484 EUpdate *le = new EUpdate(mdlog, inc ? "link_remote":"unlink_remote");
6485 mdlog->start_entry(le);
6486 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
6487 if (!mdr->more()->witnessed.empty()) {
6488 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
6489 le->reqid = mdr->reqid;
6490 le->had_slaves = true;
6491 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
6492 }
6493
6494 if (inc) {
6495 dn->pre_dirty();
6496 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, 1);
6497 le->metablob.add_remote_dentry(dn, true, targeti->ino(), targeti->d_type()); // new remote
6498 dn->push_projected_linkage(targeti->ino(), targeti->d_type());
6499 } else {
6500 dn->pre_dirty();
6501 mdcache->predirty_journal_parents(mdr, &le->metablob, targeti, dn->get_dir(), PREDIRTY_DIR, -1);
6502 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
6503 le->metablob.add_null_dentry(dn, true);
6504 dn->push_projected_linkage();
6505 }
6506
6507 journal_and_reply(mdr, (inc ? targeti : nullptr), dn, le,
6508 new C_MDS_link_remote_finish(this, mdr, inc, dn, targeti));
6509 }
6510
6511 void Server::_link_remote_finish(MDRequestRef& mdr, bool inc,
6512 CDentry *dn, CInode *targeti,
6513 version_t dpv)
6514 {
6515 dout(10) << "_link_remote_finish "
6516 << (inc ? "link ":"unlink ")
6517 << *dn << " to " << *targeti << dendl;
6518
6519 ceph_assert(g_conf()->mds_kill_link_at != 3);
6520
6521 if (!mdr->more()->witnessed.empty())
6522 mdcache->logged_master_update(mdr->reqid);
6523
6524 if (inc) {
6525 // link the new dentry
6526 CDentry::linkage_t *dnl = dn->pop_projected_linkage();
6527 if (!dnl->get_inode())
6528 dn->link_remote(dnl, targeti);
6529 dn->mark_dirty(dpv, mdr->ls);
6530 } else {
6531 // unlink main dentry
6532 dn->get_dir()->unlink_inode(dn);
6533 dn->pop_projected_linkage();
6534 dn->mark_dirty(dn->get_projected_version(), mdr->ls); // dirty old dentry
6535 }
6536
6537 mdr->apply();
6538
6539 MDRequestRef null_ref;
6540 if (inc)
6541 mdcache->send_dentry_link(dn, null_ref);
6542 else
6543 mdcache->send_dentry_unlink(dn, NULL, null_ref);
6544
6545 // bump target popularity
6546 mds->balancer->hit_inode(targeti, META_POP_IWR);
6547 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
6548
6549 // reply
6550 respond_to_request(mdr, 0);
6551
6552 if (!inc)
6553 // removing a new dn?
6554 dn->get_dir()->try_remove_unlinked_dn(dn);
6555 }
6556
6557
6558 // remote linking/unlinking
6559
6560 class C_MDS_SlaveLinkPrep : public ServerLogContext {
6561 CInode *targeti;
6562 bool adjust_realm;
6563 public:
6564 C_MDS_SlaveLinkPrep(Server *s, MDRequestRef& r, CInode *t, bool ar) :
6565 ServerLogContext(s, r), targeti(t), adjust_realm(ar) { }
6566 void finish(int r) override {
6567 ceph_assert(r == 0);
6568 server->_logged_slave_link(mdr, targeti, adjust_realm);
6569 }
6570 };
6571
6572 class C_MDS_SlaveLinkCommit : public ServerContext {
6573 MDRequestRef mdr;
6574 CInode *targeti;
6575 public:
6576 C_MDS_SlaveLinkCommit(Server *s, MDRequestRef& r, CInode *t) :
6577 ServerContext(s), mdr(r), targeti(t) { }
6578 void finish(int r) override {
6579 server->_commit_slave_link(mdr, r, targeti);
6580 }
6581 };
6582
6583 void Server::handle_slave_link_prep(MDRequestRef& mdr)
6584 {
6585 dout(10) << "handle_slave_link_prep " << *mdr
6586 << " on " << mdr->slave_request->get_object_info()
6587 << dendl;
6588
6589 ceph_assert(g_conf()->mds_kill_link_at != 4);
6590
6591 CInode *targeti = mdcache->get_inode(mdr->slave_request->get_object_info().ino);
6592 ceph_assert(targeti);
6593 dout(10) << "targeti " << *targeti << dendl;
6594 CDentry *dn = targeti->get_parent_dn();
6595 CDentry::linkage_t *dnl = dn->get_linkage();
6596 ceph_assert(dnl->is_primary());
6597
6598 mdr->set_op_stamp(mdr->slave_request->op_stamp);
6599
6600 mdr->auth_pin(targeti);
6601
6602 //ceph_abort(); // test hack: make sure master can handle a slave that fails to prepare...
6603 ceph_assert(g_conf()->mds_kill_link_at != 5);
6604
6605 // journal it
6606 mdr->ls = mdlog->get_current_segment();
6607 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_prep", mdr->reqid, mdr->slave_to_mds,
6608 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::LINK);
6609 mdlog->start_entry(le);
6610
6611 auto &pi = dnl->get_inode()->project_inode();
6612
6613 // update journaled target inode
6614 bool inc;
6615 bool adjust_realm = false;
6616 bool realm_projected = false;
6617 if (mdr->slave_request->get_op() == MMDSSlaveRequest::OP_LINKPREP) {
6618 inc = true;
6619 pi.inode.nlink++;
6620 if (!targeti->is_projected_snaprealm_global()) {
6621 sr_t *newsnap = targeti->project_snaprealm();
6622 targeti->mark_snaprealm_global(newsnap);
6623 targeti->record_snaprealm_parent_dentry(newsnap, NULL, targeti->get_projected_parent_dn(), true);
6624 adjust_realm = true;
6625 realm_projected = true;
6626 }
6627 } else {
6628 inc = false;
6629 pi.inode.nlink--;
6630 if (targeti->is_projected_snaprealm_global()) {
6631 ceph_assert(mdr->slave_request->desti_snapbl.length());
6632 auto p = mdr->slave_request->desti_snapbl.cbegin();
6633
6634 sr_t *newsnap = targeti->project_snaprealm();
6635 decode(*newsnap, p);
6636
6637 if (pi.inode.nlink == 0)
6638 ceph_assert(!newsnap->is_parent_global());
6639
6640 realm_projected = true;
6641 } else {
6642 ceph_assert(mdr->slave_request->desti_snapbl.length() == 0);
6643 }
6644 }
6645
6646 link_rollback rollback;
6647 rollback.reqid = mdr->reqid;
6648 rollback.ino = targeti->ino();
6649 rollback.old_ctime = targeti->inode.ctime; // we hold the versionlock xlock; no concurrent projections
6650 const fnode_t *pf = targeti->get_parent_dn()->get_dir()->get_projected_fnode();
6651 rollback.old_dir_mtime = pf->fragstat.mtime;
6652 rollback.old_dir_rctime = pf->rstat.rctime;
6653 rollback.was_inc = inc;
6654 if (realm_projected) {
6655 if (targeti->snaprealm) {
6656 encode(true, rollback.snapbl);
6657 targeti->encode_snap_blob(rollback.snapbl);
6658 } else {
6659 encode(false, rollback.snapbl);
6660 }
6661 }
6662 encode(rollback, le->rollback);
6663 mdr->more()->rollback_bl = le->rollback;
6664
6665 pi.inode.ctime = mdr->get_op_stamp();
6666 pi.inode.version = targeti->pre_dirty();
6667
6668 dout(10) << " projected inode " << pi.inode.ino << " v " << pi.inode.version << dendl;
6669
6670 // commit case
6671 mdcache->predirty_journal_parents(mdr, &le->commit, dnl->get_inode(), 0, PREDIRTY_SHALLOW|PREDIRTY_PRIMARY);
6672 mdcache->journal_dirty_inode(mdr.get(), &le->commit, targeti);
6673 mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
6674
6675 // set up commit waiter
6676 mdr->more()->slave_commit = new C_MDS_SlaveLinkCommit(this, mdr, targeti);
6677
6678 mdr->more()->slave_update_journaled = true;
6679 submit_mdlog_entry(le, new C_MDS_SlaveLinkPrep(this, mdr, targeti, adjust_realm),
6680 mdr, __func__);
6681 mdlog->flush();
6682 }
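// Illustrative sketch (not part of the code path): the rollback blob
// journaled above round-trips through the standard encode/decode
// helpers, and do_link_rollback() below consumes it the same way on
// abort. Assuming a populated 'link_rollback rollback':
//
//   bufferlist bl;
//   encode(rollback, bl);      // as done into le->rollback above
//   link_rollback decoded;
//   auto it = bl.cbegin();
//   decode(decoded, it);       // as done in do_link_rollback()
//   ceph_assert(decoded.reqid == rollback.reqid);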
6683
6684 void Server::_logged_slave_link(MDRequestRef& mdr, CInode *targeti, bool adjust_realm)
6685 {
6686 dout(10) << "_logged_slave_link " << *mdr
6687 << " " << *targeti << dendl;
6688
6689 ceph_assert(g_conf()->mds_kill_link_at != 6);
6690
6691 // update the target
6692 targeti->pop_and_dirty_projected_inode(mdr->ls);
6693 mdr->apply();
6694
6695 // hit pop
6696 mds->balancer->hit_inode(targeti, META_POP_IWR);
6697
6698 // done.
6699 mdr->reset_slave_request();
6700
6701 if (adjust_realm) {
6702 int op = CEPH_SNAP_OP_SPLIT;
6703 mds->mdcache->send_snap_update(targeti, 0, op);
6704 mds->mdcache->do_realm_invalidate_and_update_notify(targeti, op);
6705 }
6706
6707 // ack
6708 if (!mdr->aborted) {
6709 auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_LINKPREPACK);
6710 mds->send_message_mds(reply, mdr->slave_to_mds);
6711 } else {
6712 dout(10) << " abort flag set, finishing" << dendl;
6713 mdcache->request_finish(mdr);
6714 }
6715 }
6716
6717
6718 struct C_MDS_CommittedSlave : public ServerLogContext {
6719 C_MDS_CommittedSlave(Server *s, MDRequestRef& m) : ServerLogContext(s, m) {}
6720 void finish(int r) override {
6721 server->_committed_slave(mdr);
6722 }
6723 };
6724
6725 void Server::_commit_slave_link(MDRequestRef& mdr, int r, CInode *targeti)
6726 {
6727 dout(10) << "_commit_slave_link " << *mdr
6728 << " r=" << r
6729 << " " << *targeti << dendl;
6730
6731 ceph_assert(g_conf()->mds_kill_link_at != 7);
6732
6733 if (r == 0) {
6734 // drop our pins, etc.
6735 mdr->cleanup();
6736
6737 // write a commit to the journal
6738 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_commit", mdr->reqid, mdr->slave_to_mds,
6739 ESlaveUpdate::OP_COMMIT, ESlaveUpdate::LINK);
6740 mdlog->start_entry(le);
6741 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
6742 mdlog->flush();
6743 } else {
6744 do_link_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
6745 }
6746 }
6747
6748 void Server::_committed_slave(MDRequestRef& mdr)
6749 {
6750 dout(10) << "_committed_slave " << *mdr << dendl;
6751
6752 ceph_assert(g_conf()->mds_kill_link_at != 8);
6753
6754 bool assert_exist = mdr->more()->slave_update_journaled;
6755 mdcache->finish_uncommitted_slave(mdr->reqid, assert_exist);
6756 auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_COMMITTED);
6757 mds->send_message_mds(req, mdr->slave_to_mds);
6758 mdcache->request_finish(mdr);
6759 }
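// Flow summary (pieced together from the handlers above, for
// orientation): on a successful master commit the slave journals
// ESlaveUpdate::OP_COMMIT, then _committed_slave() sends
// MMDSSlaveRequest::OP_COMMITTED back so the master can retire its
// uncommitted-master record; on failure _commit_slave_link() skips the
// commit and replays the saved rollback blob via do_link_rollback().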
6760
6761 struct C_MDS_LoggedLinkRollback : public ServerLogContext {
6762 MutationRef mut;
6763 map<client_t,ref_t<MClientSnap>> splits;
6764 C_MDS_LoggedLinkRollback(Server *s, MutationRef& m, MDRequestRef& r,
6765 map<client_t,ref_t<MClientSnap>>&& _splits) :
6766 ServerLogContext(s, r), mut(m), splits(std::move(_splits)) {
6767 }
6768 void finish(int r) override {
6769 server->_link_rollback_finish(mut, mdr, splits);
6770 }
6771 };
6772
6773 void Server::do_link_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
6774 {
6775 link_rollback rollback;
6776 auto p = rbl.cbegin();
6777 decode(rollback, p);
6778
6779 dout(10) << "do_link_rollback on " << rollback.reqid
6780 << (rollback.was_inc ? " inc":" dec")
6781 << " ino " << rollback.ino
6782 << dendl;
6783
6784 ceph_assert(g_conf()->mds_kill_link_at != 9);
6785
6786 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
6787 ceph_assert(mdr || mds->is_resolve());
6788
6789 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
6790 mut->ls = mds->mdlog->get_current_segment();
6791
6792 CInode *in = mdcache->get_inode(rollback.ino);
6793 ceph_assert(in);
6794 dout(10) << " target is " << *in << dendl;
6795 ceph_assert(!in->is_projected()); // live slave requests hold the versionlock xlock.
6796
6797 auto &pi = in->project_inode();
6798 pi.inode.version = in->pre_dirty();
6799 mut->add_projected_inode(in);
6800
6801 // parent dir rctime
6802 CDir *parent = in->get_projected_parent_dn()->get_dir();
6803 fnode_t *pf = parent->project_fnode();
6804 mut->add_projected_fnode(parent);
6805 pf->version = parent->pre_dirty();
6806 if (pf->fragstat.mtime == pi.inode.ctime) {
6807 pf->fragstat.mtime = rollback.old_dir_mtime;
6808 if (pf->rstat.rctime == pi.inode.ctime)
6809 pf->rstat.rctime = rollback.old_dir_rctime;
6810 mut->add_updated_lock(&parent->get_inode()->filelock);
6811 mut->add_updated_lock(&parent->get_inode()->nestlock);
6812 }
6813
6814 // inode
6815 pi.inode.ctime = rollback.old_ctime;
6816 if (rollback.was_inc)
6817 pi.inode.nlink--;
6818 else
6819 pi.inode.nlink++;
6820
6821 map<client_t,ref_t<MClientSnap>> splits;
6822 if (rollback.snapbl.length() && in->snaprealm) {
6823 bool hadrealm;
6824 auto p = rollback.snapbl.cbegin();
6825 decode(hadrealm, p);
6826 if (hadrealm) {
6827 if (!mds->is_resolve()) {
6828 sr_t *new_srnode = new sr_t();
6829 decode(*new_srnode, p);
6830 in->project_snaprealm(new_srnode);
6831 } else {
6832 decode(in->snaprealm->srnode, p);
6833 }
6834 } else {
6835 SnapRealm *realm = parent->get_inode()->find_snaprealm();
6836 if (!mds->is_resolve())
6837 mdcache->prepare_realm_merge(in->snaprealm, realm, splits);
6838 in->project_snaprealm(NULL);
6839 }
6840 }
6841
6842 // journal it
6843 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_link_rollback", rollback.reqid, master,
6844 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::LINK);
6845 mdlog->start_entry(le);
6846 le->commit.add_dir_context(parent);
6847 le->commit.add_dir(parent, true);
6848 le->commit.add_primary_dentry(in->get_projected_parent_dn(), 0, true);
6849
6850 submit_mdlog_entry(le, new C_MDS_LoggedLinkRollback(this, mut, mdr, std::move(splits)),
6851 mdr, __func__);
6852 mdlog->flush();
6853 }
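// Sketch only (hypothetical helper, not used by this file): the nlink
// adjustment above is exactly the inverse of the one applied in
// handle_slave_link_prep():
//
//   static inline int nlink_delta(bool was_inc, bool rolling_back) {
//     int d = was_inc ? 1 : -1;   // prep: +1 for link, -1 for unlink
//     return rolling_back ? -d : d;
//   }
//
// so a LINKPREP (+1) rolls back with -1 and an UNLINKPREP (-1) rolls
// back with +1, as recorded in rollback.was_inc.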
6854
6855 void Server::_link_rollback_finish(MutationRef& mut, MDRequestRef& mdr,
6856 map<client_t,ref_t<MClientSnap>>& splits)
6857 {
6858 dout(10) << "_link_rollback_finish" << dendl;
6859
6860 ceph_assert(g_conf()->mds_kill_link_at != 10);
6861
6862 mut->apply();
6863
6864 if (!mds->is_resolve())
6865 mdcache->send_snaps(splits);
6866
6867 if (mdr)
6868 mdcache->request_finish(mdr);
6869
6870 mdcache->finish_rollback(mut->reqid, mdr);
6871
6872 mut->cleanup();
6873 }
6874
6875
6876 void Server::handle_slave_link_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &m)
6877 {
6878 dout(10) << "handle_slave_link_prep_ack " << *mdr
6879 << " " << *m << dendl;
6880 mds_rank_t from = mds_rank_t(m->get_source().num());
6881
6882 ceph_assert(g_conf()->mds_kill_link_at != 11);
6883
6884 // note slave
6885 mdr->more()->slaves.insert(from);
6886
6887 // witnessed!
6888 ceph_assert(mdr->more()->witnessed.count(from) == 0);
6889 mdr->more()->witnessed.insert(from);
6890 ceph_assert(!m->is_not_journaled());
6891 mdr->more()->has_journaled_slaves = true;
6892
6893 // remove from waiting list
6894 ceph_assert(mdr->more()->waiting_on_slave.count(from));
6895 mdr->more()->waiting_on_slave.erase(from);
6896
6897 ceph_assert(mdr->more()->waiting_on_slave.empty());
6898
6899 dispatch_client_request(mdr); // go again!
6900 }
6901
6902
6903
6904
6905
6906 // UNLINK
6907
6908 void Server::handle_client_unlink(MDRequestRef& mdr)
6909 {
6910 const cref_t<MClientRequest> &req = mdr->client_request;
6911 client_t client = mdr->get_client();
6912
6913 // rmdir or unlink?
6914 bool rmdir = (req->get_op() == CEPH_MDS_OP_RMDIR);
6915
6916 if (rmdir)
6917 mdr->disable_lock_cache();
6918 CDentry *dn = rdlock_path_xlock_dentry(mdr, false, true);
6919 if (!dn)
6920 return;
6921
6922 CDentry::linkage_t *dnl = dn->get_linkage(client, mdr);
6923 ceph_assert(!dnl->is_null());
6924 CInode *in = dnl->get_inode();
6925
6926 if (rmdir) {
6927 dout(7) << "handle_client_rmdir on " << *dn << dendl;
6928 } else {
6929 dout(7) << "handle_client_unlink on " << *dn << dendl;
6930 }
6931 dout(7) << "dn links to " << *in << dendl;
6932
6933 // rmdir vs is_dir
6934 if (in->is_dir()) {
6935 if (rmdir) {
6936 // do empty directory checks
6937 if (_dir_is_nonempty_unlocked(mdr, in)) {
6938 respond_to_request(mdr, -ENOTEMPTY);
6939 return;
6940 }
6941 } else {
6942 dout(7) << "handle_client_unlink on dir " << *in << ", returning error" << dendl;
6943 respond_to_request(mdr, -EISDIR);
6944 return;
6945 }
6946 } else {
6947 if (rmdir) {
6948 // unlink
6949 dout(7) << "handle_client_rmdir on non-dir " << *in << ", returning error" << dendl;
6950 respond_to_request(mdr, -ENOTDIR);
6951 return;
6952 }
6953 }
6954
6955 CInode *diri = dn->get_dir()->get_inode();
6956 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
6957 if (!check_access(mdr, diri, MAY_WRITE))
6958 return;
6959 }
6960
6961 // -- create stray dentry? --
6962 CDentry *straydn = NULL;
6963 if (dnl->is_primary()) {
6964 straydn = prepare_stray_dentry(mdr, dnl->get_inode());
6965 if (!straydn)
6966 return;
6967 dout(10) << " straydn is " << *straydn << dendl;
6968 } else if (mdr->straydn) {
6969 mdr->unpin(mdr->straydn);
6970 mdr->straydn = NULL;
6971 }
6972
6973 // lock
6974 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
6975 MutationImpl::LockOpVec lov;
6976
6977 lov.add_xlock(&in->linklock);
6978 lov.add_xlock(&in->snaplock);
6979 if (in->is_dir())
6980 lov.add_rdlock(&in->filelock); // to verify it's empty
6981
6982 if (straydn) {
6983 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
6984 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
6985 lov.add_xlock(&straydn->lock);
6986 }
6987
6988 if (!mds->locker->acquire_locks(mdr, lov))
6989 return;
6990
6991 mdr->locking_state |= MutationImpl::ALL_LOCKED;
6992 }
6993
6994 if (in->is_dir() &&
6995 _dir_is_nonempty(mdr, in)) {
6996 respond_to_request(mdr, -ENOTEMPTY);
6997 return;
6998 }
6999
7000 if (straydn)
7001 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
7002
7003 if (!mdr->more()->desti_srnode) {
7004 if (in->is_projected_snaprealm_global()) {
7005 sr_t *new_srnode = in->prepare_new_srnode(0);
7006 in->record_snaprealm_parent_dentry(new_srnode, NULL, dn, dnl->is_primary());
7007 // dropping the last linkage or dropping the last remote linkage,
7008 // detach the inode from the global snaprealm
7009 auto nlink = in->get_projected_inode()->nlink;
7010 if (nlink == 1 ||
7011 (nlink == 2 && !dnl->is_primary() &&
7012 !in->get_projected_parent_dir()->inode->is_stray()))
7013 in->clear_snaprealm_global(new_srnode);
7014 mdr->more()->desti_srnode = new_srnode;
7015 } else if (dnl->is_primary()) {
7016 // prepare snaprealm blob for slave request
7017 SnapRealm *realm = in->find_snaprealm();
7018 snapid_t follows = realm->get_newest_seq();
7019 if (in->snaprealm || follows + 1 > in->get_oldest_snap()) {
7020 sr_t *new_srnode = in->prepare_new_srnode(follows);
7021 in->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7022 mdr->more()->desti_srnode = new_srnode;
7023 }
7024 }
7025 }
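// Worked example (illustrative): a file with one primary and one remote
// link has nlink == 2. Unlinking the remote link (dnl is not primary)
// leaves only the primary linkage, so the inode may leave the global
// snaprealm -- unless the surviving primary lives in a stray directory
// awaiting reintegration, in which case the global realm is still
// needed. nlink == 1 means this unlink drops the last link outright.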
7026
7027 // yay!
7028 if (in->is_dir() && in->has_subtree_root_dirfrag()) {
7029 // subtree root auths need to be witnesses
7030 set<mds_rank_t> witnesses;
7031 in->list_replicas(witnesses);
7032 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7033
7034 for (set<mds_rank_t>::iterator p = witnesses.begin();
7035 p != witnesses.end();
7036 ++p) {
7037 if (mdr->more()->witnessed.count(*p)) {
7038 dout(10) << " already witnessed by mds." << *p << dendl;
7039 } else if (mdr->more()->waiting_on_slave.count(*p)) {
7040 dout(10) << " already waiting on witness mds." << *p << dendl;
7041 } else {
7042 if (!_rmdir_prepare_witness(mdr, *p, mdr->dn[0], straydn))
7043 return;
7044 }
7045 }
7046 if (!mdr->more()->waiting_on_slave.empty())
7047 return; // we're waiting for a witness.
7048 }
7049
7050 if (!rmdir && dnl->is_primary() && mdr->dn[0].size() == 1)
7051 mds->locker->create_lock_cache(mdr, diri);
7052
7053 // ok!
7054 if (dnl->is_remote() && !dnl->get_inode()->is_auth())
7055 _link_remote(mdr, false, dn, dnl->get_inode());
7056 else
7057 _unlink_local(mdr, dn, straydn);
7058 }
7059
7060 class C_MDS_unlink_local_finish : public ServerLogContext {
7061 CDentry *dn;
7062 CDentry *straydn;
7063 version_t dnpv; // deleted dentry
7064 public:
7065 C_MDS_unlink_local_finish(Server *s, MDRequestRef& r, CDentry *d, CDentry *sd) :
7066 ServerLogContext(s, r), dn(d), straydn(sd),
7067 dnpv(d->get_projected_version()) {}
7068 void finish(int r) override {
7069 ceph_assert(r == 0);
7070 server->_unlink_local_finish(mdr, dn, straydn, dnpv);
7071 }
7072 };
7073
7074 void Server::_unlink_local(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7075 {
7076 dout(10) << "_unlink_local " << *dn << dendl;
7077
7078 CDentry::linkage_t *dnl = dn->get_projected_linkage();
7079 CInode *in = dnl->get_inode();
7080
7081
7082 // ok, let's do it.
7083 mdr->ls = mdlog->get_current_segment();
7084
7085 // prepare log entry
7086 EUpdate *le = new EUpdate(mdlog, "unlink_local");
7087 mdlog->start_entry(le);
7088 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
7089 if (!mdr->more()->witnessed.empty()) {
7090 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
7091 le->reqid = mdr->reqid;
7092 le->had_slaves = true;
7093 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
7094 }
7095
7096 if (straydn) {
7097 ceph_assert(dnl->is_primary());
7098 straydn->push_projected_linkage(in);
7099 }
7100
7101 // the unlinked dentry
7102 dn->pre_dirty();
7103
7104 auto &pi = in->project_inode();
7105 {
7106 std::string t;
7107 dn->make_path_string(t, true);
7108 pi.inode.stray_prior_path = std::move(t);
7109 }
7110 pi.inode.version = in->pre_dirty();
7111 pi.inode.ctime = mdr->get_op_stamp();
7112 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
7113 pi.inode.rstat.rctime = mdr->get_op_stamp();
7114 pi.inode.change_attr++;
7115 pi.inode.nlink--;
7116 if (pi.inode.nlink == 0)
7117 in->state_set(CInode::STATE_ORPHAN);
7118
7119 if (mdr->more()->desti_srnode) {
7120 auto& desti_srnode = mdr->more()->desti_srnode;
7121 in->project_snaprealm(desti_srnode);
7122 desti_srnode = NULL;
7123 }
7124
7125 if (straydn) {
7126 // will manually pop projected inode
7127
7128 // primary link. add stray dentry.
7129 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, -1);
7130 mdcache->predirty_journal_parents(mdr, &le->metablob, in, straydn->get_dir(), PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
7131
7132 pi.inode.update_backtrace();
7133 le->metablob.add_primary_dentry(straydn, in, true, true);
7134 } else {
7135 mdr->add_projected_inode(in);
7136 // remote link. update remote inode.
7137 mdcache->predirty_journal_parents(mdr, &le->metablob, in, dn->get_dir(), PREDIRTY_DIR, -1);
7138 mdcache->predirty_journal_parents(mdr, &le->metablob, in, 0, PREDIRTY_PRIMARY);
7139 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, in);
7140 }
7141
7142 mdcache->journal_cow_dentry(mdr.get(), &le->metablob, dn);
7143 le->metablob.add_null_dentry(dn, true);
7144
7145 if (in->is_dir()) {
7146 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7147 le->metablob.renamed_dirino = in->ino();
7148 }
7149
7150 dn->push_projected_linkage();
7151
7152 if (straydn) {
7153 ceph_assert(in->first <= straydn->first);
7154 in->first = straydn->first;
7155 }
7156
7157 if (in->is_dir()) {
7158 ceph_assert(straydn);
7159 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7160 }
7161
7162 journal_and_reply(mdr, 0, dn, le, new C_MDS_unlink_local_finish(this, mdr, dn, straydn));
7163 }
7164
7165 void Server::_unlink_local_finish(MDRequestRef& mdr,
7166 CDentry *dn, CDentry *straydn,
7167 version_t dnpv)
7168 {
7169 dout(10) << "_unlink_local_finish " << *dn << dendl;
7170
7171 if (!mdr->more()->witnessed.empty())
7172 mdcache->logged_master_update(mdr->reqid);
7173
7174 CInode *strayin = NULL;
7175 bool hadrealm = false;
7176 if (straydn) {
7177 // if there is a newly created snaprealm, we need to split the old snaprealm's
7178 // inodes_with_caps, so pop the snaprealm before the linkage changes.
7179 strayin = dn->get_linkage()->get_inode();
7180 hadrealm = strayin->snaprealm ? true : false;
7181 strayin->early_pop_projected_snaprealm();
7182 }
7183
7184 // unlink main dentry
7185 dn->get_dir()->unlink_inode(dn);
7186 dn->pop_projected_linkage();
7187
7188 // relink as stray? (i.e. was primary link?)
7189 if (straydn) {
7190 dout(20) << " straydn is " << *straydn << dendl;
7191 straydn->pop_projected_linkage();
7192
7193 strayin->pop_and_dirty_projected_inode(mdr->ls);
7194
7195 mdcache->touch_dentry_bottom(straydn);
7196 }
7197
7198 dn->mark_dirty(dnpv, mdr->ls);
7199 mdr->apply();
7200
7201 mdcache->send_dentry_unlink(dn, straydn, mdr);
7202
7203 if (straydn) {
7204 // update subtree map?
7205 if (strayin->is_dir())
7206 mdcache->adjust_subtree_after_rename(strayin, dn->get_dir(), true);
7207
7208 if (strayin->snaprealm && !hadrealm)
7209 mdcache->do_realm_invalidate_and_update_notify(strayin, CEPH_SNAP_OP_SPLIT, false);
7210 }
7211
7212 // bump pop
7213 mds->balancer->hit_dir(dn->get_dir(), META_POP_IWR);
7214
7215 // reply
7216 respond_to_request(mdr, 0);
7217
7218 // removing a new dn?
7219 dn->get_dir()->try_remove_unlinked_dn(dn);
7220
7221 // clean up?
7222 // respond_to_request() drops locks. So stray reintegration can race with us.
7223 if (straydn && !straydn->get_projected_linkage()->is_null()) {
7224 // Tip off the MDCache that this dentry is a stray that
7225 // might be eligible for purge.
7226 mdcache->notify_stray(straydn);
7227 }
7228 }
7229
7230 bool Server::_rmdir_prepare_witness(MDRequestRef& mdr, mds_rank_t who, vector<CDentry*>& trace, CDentry *straydn)
7231 {
7232 if (mds->is_cluster_degraded() &&
7233 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
7234 dout(10) << "_rmdir_prepare_witness mds." << who << " is not active" << dendl;
7235 if (mdr->more()->waiting_on_slave.empty())
7236 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
7237 return false;
7238 }
7239
7240 dout(10) << "_rmdir_prepare_witness mds." << who << dendl;
7241 auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREP);
7242 req->srcdnpath = filepath(trace.front()->get_dir()->ino());
7243 for (auto dn : trace)
7244 req->srcdnpath.push_dentry(dn->get_name());
7245 mdcache->encode_replica_stray(straydn, who, req->straybl);
7246 if (mdr->more()->desti_srnode)
7247 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
7248
7249 req->op_stamp = mdr->get_op_stamp();
7250 mds->send_message_mds(req, who);
7251
7252 ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
7253 mdr->more()->waiting_on_slave.insert(who);
7254 return true;
7255 }
7256
7257 struct C_MDS_SlaveRmdirPrep : public ServerLogContext {
7258 CDentry *dn, *straydn;
7259 C_MDS_SlaveRmdirPrep(Server *s, MDRequestRef& r, CDentry *d, CDentry *st)
7260 : ServerLogContext(s, r), dn(d), straydn(st) {}
7261 void finish(int r) override {
7262 server->_logged_slave_rmdir(mdr, dn, straydn);
7263 }
7264 };
7265
7266 struct C_MDS_SlaveRmdirCommit : public ServerContext {
7267 MDRequestRef mdr;
7268 CDentry *straydn;
7269 C_MDS_SlaveRmdirCommit(Server *s, MDRequestRef& r, CDentry *sd)
7270 : ServerContext(s), mdr(r), straydn(sd) { }
7271 void finish(int r) override {
7272 server->_commit_slave_rmdir(mdr, r, straydn);
7273 }
7274 };
7275
7276 void Server::handle_slave_rmdir_prep(MDRequestRef& mdr)
7277 {
7278 dout(10) << "handle_slave_rmdir_prep " << *mdr
7279 << " " << mdr->slave_request->srcdnpath
7280 << " to " << mdr->slave_request->destdnpath
7281 << dendl;
7282
7283 vector<CDentry*> trace;
7284 filepath srcpath(mdr->slave_request->srcdnpath);
7285 dout(10) << " src " << srcpath << dendl;
7286 CInode *in;
7287 CF_MDS_MDRContextFactory cf(mdcache, mdr, false);
7288 int r = mdcache->path_traverse(mdr, cf, srcpath,
7289 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
7290 &trace, &in);
7291 if (r > 0) return;
7292 if (r == -ESTALE) {
7293 mdcache->find_ino_peers(srcpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
7294 mdr->slave_to_mds, true);
7295 return;
7296 }
7297 ceph_assert(r == 0);
7298 CDentry *dn = trace.back();
7299 dout(10) << " dn " << *dn << dendl;
7300 mdr->pin(dn);
7301
7302 ceph_assert(mdr->straydn);
7303 CDentry *straydn = mdr->straydn;
7304 dout(10) << " straydn " << *straydn << dendl;
7305
7306 mdr->set_op_stamp(mdr->slave_request->op_stamp);
7307
7308 rmdir_rollback rollback;
7309 rollback.reqid = mdr->reqid;
7310 rollback.src_dir = dn->get_dir()->dirfrag();
7311 rollback.src_dname = dn->get_name();
7312 rollback.dest_dir = straydn->get_dir()->dirfrag();
7313 rollback.dest_dname = straydn->get_name();
7314 if (mdr->slave_request->desti_snapbl.length()) {
7315 if (in->snaprealm) {
7316 encode(true, rollback.snapbl);
7317 in->encode_snap_blob(rollback.snapbl);
7318 } else {
7319 encode(false, rollback.snapbl);
7320 }
7321 }
7322 encode(rollback, mdr->more()->rollback_bl);
7323 // FIXME: rollback snaprealm
7324 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
7325
7326 // set up commit waiter
7327 mdr->more()->slave_commit = new C_MDS_SlaveRmdirCommit(this, mdr, straydn);
7328
7329 straydn->push_projected_linkage(in);
7330 dn->push_projected_linkage();
7331
7332 ceph_assert(straydn->first >= in->first);
7333 in->first = straydn->first;
7334
7335 if (!in->has_subtree_root_dirfrag(mds->get_nodeid())) {
7336 dout(10) << " no auth subtree in " << *in << ", skipping journal" << dendl;
7337 _logged_slave_rmdir(mdr, dn, straydn);
7338 return;
7339 }
7340
7341 mdr->ls = mdlog->get_current_segment();
7342 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir", mdr->reqid, mdr->slave_to_mds,
7343 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RMDIR);
7344 mdlog->start_entry(le);
7345 le->rollback = mdr->more()->rollback_bl;
7346
7347 le->commit.add_dir_context(straydn->get_dir());
7348 le->commit.add_primary_dentry(straydn, in, true);
7349 // slave: no need to journal original dentry
7350
7351 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7352 le->commit.renamed_dirino = in->ino();
7353
7354 mdcache->project_subtree_rename(in, dn->get_dir(), straydn->get_dir());
7355 mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
7356
7357 mdr->more()->slave_update_journaled = true;
7358 submit_mdlog_entry(le, new C_MDS_SlaveRmdirPrep(this, mdr, dn, straydn),
7359 mdr, __func__);
7360 mdlog->flush();
7361 }
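// Note on the control flow above: when the victim directory holds no
// auth subtree on this rank, the prep skips journaling entirely and
// calls _logged_slave_rmdir() directly; the ack is then marked
// not-journaled (see below), telling the master that this slave has no
// ESlaveUpdate to commit or roll back.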
7362
7363 void Server::_logged_slave_rmdir(MDRequestRef& mdr, CDentry *dn, CDentry *straydn)
7364 {
7365 dout(10) << "_logged_slave_rmdir " << *mdr << " on " << *dn << dendl;
7366 CInode *in = dn->get_linkage()->get_inode();
7367
7368 bool new_realm;
7369 if (mdr->slave_request->desti_snapbl.length()) {
7370 new_realm = !in->snaprealm;
7371 in->decode_snap_blob(mdr->slave_request->desti_snapbl);
7372 ceph_assert(in->snaprealm);
7373 ceph_assert(in->snaprealm->have_past_parents_open());
7374 } else {
7375 new_realm = false;
7376 }
7377
7378 // update our cache now, so we are consistent with what is in the journal
7379 // when we journal a subtree map
7380 dn->get_dir()->unlink_inode(dn);
7381 straydn->pop_projected_linkage();
7382 dn->pop_projected_linkage();
7383
7384 mdcache->adjust_subtree_after_rename(in, dn->get_dir(), mdr->more()->slave_update_journaled);
7385
7386 if (new_realm)
7387 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, false);
7388
7389 // done.
7390 mdr->reset_slave_request();
7391 mdr->straydn = 0;
7392
7393 if (!mdr->aborted) {
7394 auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RMDIRPREPACK);
7395 if (!mdr->more()->slave_update_journaled)
7396 reply->mark_not_journaled();
7397 mds->send_message_mds(reply, mdr->slave_to_mds);
7398 } else {
7399 dout(10) << " abort flag set, finishing" << dendl;
7400 mdcache->request_finish(mdr);
7401 }
7402 }
7403
7404 void Server::handle_slave_rmdir_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
7405 {
7406 dout(10) << "handle_slave_rmdir_prep_ack " << *mdr
7407 << " " << *ack << dendl;
7408
7409 mds_rank_t from = mds_rank_t(ack->get_source().num());
7410
7411 mdr->more()->slaves.insert(from);
7412 mdr->more()->witnessed.insert(from);
7413 if (!ack->is_not_journaled())
7414 mdr->more()->has_journaled_slaves = true;
7415
7416 // remove from waiting list
7417 ceph_assert(mdr->more()->waiting_on_slave.count(from));
7418 mdr->more()->waiting_on_slave.erase(from);
7419
7420 if (mdr->more()->waiting_on_slave.empty())
7421 dispatch_client_request(mdr); // go again!
7422 else
7423 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
7424 }
7425
7426 void Server::_commit_slave_rmdir(MDRequestRef& mdr, int r, CDentry *straydn)
7427 {
7428 dout(10) << "_commit_slave_rmdir " << *mdr << " r=" << r << dendl;
7429
7430 if (r == 0) {
7431 if (mdr->more()->slave_update_journaled) {
7432 CInode *strayin = straydn->get_projected_linkage()->get_inode();
7433 if (strayin && !strayin->snaprealm)
7434 mdcache->clear_dirty_bits_for_stray(strayin);
7435 }
7436
7437 mdr->cleanup();
7438
7439 if (mdr->more()->slave_update_journaled) {
7440 // write a commit to the journal
7441 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_commit", mdr->reqid,
7442 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
7443 ESlaveUpdate::RMDIR);
7444 mdlog->start_entry(le);
7445 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
7446 mdlog->flush();
7447 } else {
7448 _committed_slave(mdr);
7449 }
7450 } else {
7451 // abort
7452 do_rmdir_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr);
7453 }
7454 }
7455
7456 struct C_MDS_LoggedRmdirRollback : public ServerLogContext {
7457 metareqid_t reqid;
7458 CDentry *dn;
7459 CDentry *straydn;
7460 C_MDS_LoggedRmdirRollback(Server *s, MDRequestRef& m, metareqid_t mr, CDentry *d, CDentry *st)
7461 : ServerLogContext(s, m), reqid(mr), dn(d), straydn(st) {}
7462 void finish(int r) override {
7463 server->_rmdir_rollback_finish(mdr, reqid, dn, straydn);
7464 }
7465 };
7466
7467 void Server::do_rmdir_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr)
7468 {
7469 // unlike the other rollback methods, the rmdir rollback is only
7470 // needed to record the subtree changes in the journal for inode
7471 // replicas who are auth for empty dirfrags. no actual changes to
7472 // the file system are taking place here, so there is no Mutation.
7473
7474 rmdir_rollback rollback;
7475 auto p = rbl.cbegin();
7476 decode(rollback, p);
7477
7478 dout(10) << "do_rmdir_rollback on " << rollback.reqid << dendl;
7479 mdcache->add_rollback(rollback.reqid, master); // need to finish this update before resolve finishes
7480 ceph_assert(mdr || mds->is_resolve());
7481
7482 CDir *dir = mdcache->get_dirfrag(rollback.src_dir);
7483 if (!dir)
7484 dir = mdcache->get_dirfrag(rollback.src_dir.ino, rollback.src_dname);
7485 ceph_assert(dir);
7486 CDentry *dn = dir->lookup(rollback.src_dname);
7487 ceph_assert(dn);
7488 dout(10) << " dn " << *dn << dendl;
7489 CDir *straydir = mdcache->get_dirfrag(rollback.dest_dir);
7490 ceph_assert(straydir);
7491 CDentry *straydn = straydir->lookup(rollback.dest_dname);
7492 ceph_assert(straydn);
7493 dout(10) << " straydn " << *straydn << dendl;
7494 CInode *in = straydn->get_linkage()->get_inode();
7495
7496 dn->push_projected_linkage(in);
7497 straydn->push_projected_linkage();
7498
7499 if (rollback.snapbl.length() && in->snaprealm) {
7500 bool hadrealm;
7501 auto p = rollback.snapbl.cbegin();
7502 decode(hadrealm, p);
7503 if (hadrealm) {
7504 decode(in->snaprealm->srnode, p);
7505 } else {
7506 in->snaprealm->merge_to(dir->get_inode()->find_snaprealm());
7507 }
7508 }
7509
7510 if (mdr && !mdr->more()->slave_update_journaled) {
7511 ceph_assert(!in->has_subtree_root_dirfrag(mds->get_nodeid()));
7512
7513 _rmdir_rollback_finish(mdr, rollback.reqid, dn, straydn);
7514 return;
7515 }
7516
7517
7518 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rmdir_rollback", rollback.reqid, master,
7519 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RMDIR);
7520 mdlog->start_entry(le);
7521
7522 le->commit.add_dir_context(dn->get_dir());
7523 le->commit.add_primary_dentry(dn, in, true);
7524 // slave: no need to journal straydn
7525
7526 dout(10) << " noting renamed (unlinked) dir ino " << in->ino() << " in metablob" << dendl;
7527 le->commit.renamed_dirino = in->ino();
7528
7529 mdcache->project_subtree_rename(in, straydn->get_dir(), dn->get_dir());
7530
7531 submit_mdlog_entry(le,
7532 new C_MDS_LoggedRmdirRollback(this, mdr,rollback.reqid,
7533 dn, straydn),
7534 mdr, __func__);
7535 mdlog->flush();
7536 }
7537
7538 void Server::_rmdir_rollback_finish(MDRequestRef& mdr, metareqid_t reqid, CDentry *dn, CDentry *straydn)
7539 {
7540 dout(10) << "_rmdir_rollback_finish " << reqid << dendl;
7541
7542 straydn->get_dir()->unlink_inode(straydn);
7543 dn->pop_projected_linkage();
7544 straydn->pop_projected_linkage();
7545
7546 CInode *in = dn->get_linkage()->get_inode();
7547 mdcache->adjust_subtree_after_rename(in, straydn->get_dir(),
7548 !mdr || mdr->more()->slave_update_journaled);
7549
7550 if (mds->is_resolve()) {
7551 CDir *root = mdcache->get_subtree_root(straydn->get_dir());
7552 mdcache->try_trim_non_auth_subtree(root);
7553 }
7554
7555 if (mdr)
7556 mdcache->request_finish(mdr);
7557
7558 mdcache->finish_rollback(reqid, mdr);
7559 }
7560
7561
7562 /** _dir_is_nonempty[_unlocked]
7563 *
7564 * check if a directory is non-empty (i.e., whether we can rmdir it).
7565 *
7566 * the unlocked variant is a fastpath check; we can't really be
7567 * sure until we rdlock the filelock.
7568 */
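// Typical call pattern (sketch; mirrors handle_client_unlink above):
// the unlocked check runs before any locks are taken, then the
// authoritative check repeats it once the filelock is rdlocked:
//
//   if (_dir_is_nonempty_unlocked(mdr, in)) {  // fast path, may miss
//     respond_to_request(mdr, -ENOTEMPTY);
//     return;
//   }
//   // ... acquire_locks(), taking a rdlock on in->filelock ...
//   if (in->is_dir() && _dir_is_nonempty(mdr, in)) {
//     respond_to_request(mdr, -ENOTEMPTY);     // authoritative
//     return;
//   }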
7569 bool Server::_dir_is_nonempty_unlocked(MDRequestRef& mdr, CInode *in)
7570 {
7571 dout(10) << "dir_is_nonempty_unlocked " << *in << dendl;
7572 ceph_assert(in->is_auth());
7573
7574 if (in->filelock.is_cached())
7575 return false; // there can be pending async create/unlink. don't know.
7576 if (in->snaprealm && in->snaprealm->srnode.snaps.size())
7577 return true; // in a snapshot!
7578
7579 auto&& ls = in->get_dirfrags();
7580 for (const auto& dir : ls) {
7581 // is the frag obviously non-empty?
7582 if (dir->is_auth()) {
7583 if (dir->get_projected_fnode()->fragstat.size()) {
7584 dout(10) << "dir_is_nonempty_unlocked dirstat has "
7585 << dir->get_projected_fnode()->fragstat.size() << " items " << *dir << dendl;
7586 return true;
7587 }
7588 }
7589 }
7590
7591 return false;
7592 }
7593
7594 bool Server::_dir_is_nonempty(MDRequestRef& mdr, CInode *in)
7595 {
7596 dout(10) << "dir_is_nonempty " << *in << dendl;
7597 ceph_assert(in->is_auth());
7598 ceph_assert(in->filelock.can_read(mdr->get_client()));
7599
7600 frag_info_t dirstat;
7601 version_t dirstat_version = in->get_projected_inode()->dirstat.version;
7602
7603 auto&& ls = in->get_dirfrags();
7604 for (const auto& dir : ls) {
7605 const fnode_t *pf = dir->get_projected_fnode();
7606 if (pf->fragstat.size()) {
7607 dout(10) << "dir_is_nonempty dirstat has "
7608 << pf->fragstat.size() << " items " << *dir << dendl;
7609 return true;
7610 }
7611
7612 if (pf->accounted_fragstat.version == dirstat_version)
7613 dirstat.add(pf->accounted_fragstat);
7614 else
7615 dirstat.add(pf->fragstat);
7616 }
7617
7618 return dirstat.size() != in->get_projected_inode()->dirstat.size();
7619 }
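// Worked example (hypothetical numbers; pf0/pf1 are illustrative): with
// two frags whose stats have not all converged to dirstat.version, the
// loop above effectively computes
//
//   frag_info_t sum;
//   sum.add(pf0->accounted_fragstat);  // version matches: use accounted
//   sum.add(pf1->fragstat);            // version lags: use live stat
//   bool nonempty = sum.size() != in->get_projected_inode()->dirstat.size();
//
// any mismatch means entries may still exist somewhere, so the rmdir is
// conservatively rejected with -ENOTEMPTY.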
7620
7621
7622 // ======================================================
7623
7624
7625 class C_MDS_rename_finish : public ServerLogContext {
7626 CDentry *srcdn;
7627 CDentry *destdn;
7628 CDentry *straydn;
7629 public:
7630 C_MDS_rename_finish(Server *s, MDRequestRef& r,
7631 CDentry *sdn, CDentry *ddn, CDentry *stdn) :
7632 ServerLogContext(s, r),
7633 srcdn(sdn), destdn(ddn), straydn(stdn) { }
7634 void finish(int r) override {
7635 ceph_assert(r == 0);
7636 server->_rename_finish(mdr, srcdn, destdn, straydn);
7637 }
7638 };
7639
7640
7641 /** handle_client_rename
7642 *
7643 * rename master is the destdn auth. this is because cached inodes
7644 * must remain connected. thus, any replica of srci, must also
7645 * replicate destdn, and possibly straydn, so that srci (and
7646 * destdn->inode) remain connected during the rename.
7647 *
7648 * to do this, we freeze srci, then master (destdn auth) verifies that
7649 * all other nodes have also replicated destdn and straydn. note that
7650 * destdn replicas need not also replicate srci. this only works when
7651 * destdn is master.
7652 *
7653 * This function takes responsibility for the passed mdr.
7654 */
7655 void Server::handle_client_rename(MDRequestRef& mdr)
7656 {
7657 const cref_t<MClientRequest> &req = mdr->client_request;
7658 dout(7) << "handle_client_rename " << *req << dendl;
7659
7660 filepath destpath = req->get_filepath();
7661 filepath srcpath = req->get_filepath2();
7662 if (srcpath.is_last_dot_or_dotdot() || destpath.is_last_dot_or_dotdot()) {
7663 respond_to_request(mdr, -EBUSY);
7664 return;
7665 }
7666
7667 auto [destdn, srcdn] = rdlock_two_paths_xlock_destdn(mdr, true);
7668 if (!destdn)
7669 return;
7670
7671 dout(10) << " destdn " << *destdn << dendl;
7672 CDir *destdir = destdn->get_dir();
7673 ceph_assert(destdir->is_auth());
7674 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
7675
7676 dout(10) << " srcdn " << *srcdn << dendl;
7677 CDir *srcdir = srcdn->get_dir();
7678 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
7679 CInode *srci = srcdnl->get_inode();
7680 dout(10) << " srci " << *srci << dendl;
7681
7682 // -- some sanity checks --
7683 if (destdn == srcdn) {
7684 dout(7) << "rename src=dest, noop" << dendl;
7685 respond_to_request(mdr, 0);
7686 return;
7687 }
7688
7689 // dest a child of src?
7690 // e.g. mv /usr /usr/foo
7691 if (srci->is_dir() && srci->is_projected_ancestor_of(destdir->get_inode())) {
7692 dout(7) << "cannot rename item to be a child of itself" << dendl;
7693 respond_to_request(mdr, -EINVAL);
7694 return;
7695 }
7696
7697 // is this a stray migration, reintegration or merge? (sanity checks!)
7698 if (mdr->reqid.name.is_mds() &&
7699 !(MDS_INO_IS_STRAY(srcpath.get_ino()) &&
7700 MDS_INO_IS_STRAY(destpath.get_ino())) &&
7701 !(destdnl->is_remote() &&
7702 destdnl->get_remote_ino() == srci->ino())) {
7703 respond_to_request(mdr, -EINVAL); // actually, this won't reply, but whatev.
7704 return;
7705 }
7706
7707 CInode *oldin = 0;
7708 if (!destdnl->is_null()) {
7709 //dout(10) << "dest dn exists " << *destdn << dendl;
7710 oldin = mdcache->get_dentry_inode(destdn, mdr, true);
7711 if (!oldin) return;
7712 dout(10) << " oldin " << *oldin << dendl;
7713
7714 // non-empty dir? do trivial fast unlocked check, do another check later with read locks
7715 if (oldin->is_dir() && _dir_is_nonempty_unlocked(mdr, oldin)) {
7716 respond_to_request(mdr, -ENOTEMPTY);
7717 return;
7718 }
7719
7720 // mv /some/thing /to/some/existing_other_thing
7721 if (oldin->is_dir() && !srci->is_dir()) {
7722 respond_to_request(mdr, -EISDIR);
7723 return;
7724 }
7725 if (!oldin->is_dir() && srci->is_dir()) {
7726 respond_to_request(mdr, -ENOTDIR);
7727 return;
7728 }
7729 if (srci == oldin && !srcdir->inode->is_stray()) {
7730 respond_to_request(mdr, 0); // no-op. POSIX makes no sense.
7731 return;
7732 }
7733 }
7734
7735 vector<CDentry*>& srctrace = mdr->dn[1];
7736 vector<CDentry*>& desttrace = mdr->dn[0];
7737
7738 // src+dest traces _must_ share a common ancestor for locking to prevent orphans
7739 if (destpath.get_ino() != srcpath.get_ino() &&
7740 !(req->get_source().is_mds() &&
7741 MDS_INO_IS_STRAY(srcpath.get_ino()))) { // <-- mds 'rename' out of stray dir is ok!
7742 CInode *srcbase = srctrace[0]->get_dir()->get_inode();
7743 CInode *destbase = desttrace[0]->get_dir()->get_inode();
7744 // ok, extend srctrace toward root until it is an ancestor of desttrace.
7745 while (srcbase != destbase &&
7746 !srcbase->is_projected_ancestor_of(destbase)) {
7747 CDentry *pdn = srcbase->get_projected_parent_dn();
7748 srctrace.insert(srctrace.begin(), pdn);
7749 dout(10) << "rename prepending srctrace with " << *pdn << dendl;
7750 srcbase = pdn->get_dir()->get_inode();
7751 }
7752
7753 // then, extend destpath until it shares the same parent inode as srcpath.
7754 while (destbase != srcbase) {
7755 CDentry *pdn = destbase->get_projected_parent_dn();
7756 desttrace.insert(desttrace.begin(), pdn);
7757 dout(10) << "rename prepending desttrace with " << *pdn << dendl;
7758 destbase = pdn->get_dir()->get_inode();
7759 }
7760 dout(10) << "rename src and dest traces now share common ancestor " << *destbase << dendl;
7761 }
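// Illustrative example (hypothetical layout): if the client addressed
// srcpath relative to /a/b and destpath relative to /a/c, srcbase
// starts at /a/b and destbase at /a/c. The first loop prepends b's
// parent dentry so srcbase becomes /a, an ancestor of /a/c; the second
// loop prepends c's dentry so destbase becomes /a as well. Both traces
// now pass through the common ancestor, so the locks taken along them
// cover every dentry between the two endpoints.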
7762
7763
7764 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
7765 if (linkmerge)
7766 dout(10) << " this is a link merge" << dendl;
7767
7768 // -- create stray dentry? --
7769 CDentry *straydn = NULL;
7770 if (destdnl->is_primary() && !linkmerge) {
7771 straydn = prepare_stray_dentry(mdr, destdnl->get_inode());
7772 if (!straydn)
7773 return;
7774 dout(10) << " straydn is " << *straydn << dendl;
7775 } else if (mdr->straydn) {
7776 mdr->unpin(mdr->straydn);
7777 mdr->straydn = NULL;
7778 }
7779
7780
7781 // -- locks --
7782 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
7783 MutationImpl::LockOpVec lov;
7784
7785 // we need to update srci's ctime. xlock its least contended lock to do that...
7786 lov.add_xlock(&srci->linklock);
7787 lov.add_xlock(&srci->snaplock);
7788
7789 if (oldin) {
7790 // xlock oldin (for nlink--)
7791 lov.add_xlock(&oldin->linklock);
7792 lov.add_xlock(&oldin->snaplock);
7793 if (oldin->is_dir()) {
7794 ceph_assert(srci->is_dir());
7795 lov.add_rdlock(&oldin->filelock); // to verify it's empty
7796
7797 // adjust locking order?
7798 int cmp = mdr->compare_paths();
7799 if (cmp < 0 || (cmp == 0 && oldin->ino() < srci->ino()))
7800 std::reverse(lov.begin(), lov.end());
7801 } else {
7802 ceph_assert(!srci->is_dir());
7803 // adjust locking order;
7804 if (srci->ino() > oldin->ino())
7805 std::reverse(lov.begin(), lov.end());
7806 }
7807 }
7808
7809 // straydn?
7810 if (straydn) {
7811 lov.add_wrlock(&straydn->get_dir()->inode->filelock);
7812 lov.add_wrlock(&straydn->get_dir()->inode->nestlock);
7813 lov.add_xlock(&straydn->lock);
7814 }
7815
7816 CInode *auth_pin_freeze = !srcdn->is_auth() && srcdnl->is_primary() ? srci : nullptr;
7817 if (!mds->locker->acquire_locks(mdr, lov, auth_pin_freeze))
7818 return;
7819
7820 mdr->locking_state |= MutationImpl::ALL_LOCKED;
7821 }
7822
7823 if (linkmerge)
7824 ceph_assert(srcdir->inode->is_stray() && srcdnl->is_primary() && destdnl->is_remote());
7825
7826 if ((!mdr->has_more() || mdr->more()->witnessed.empty())) {
7827 if (!check_access(mdr, srcdir->get_inode(), MAY_WRITE))
7828 return;
7829
7830 if (!check_access(mdr, destdn->get_dir()->get_inode(), MAY_WRITE))
7831 return;
7832
7833 if (!check_fragment_space(mdr, destdn->get_dir()))
7834 return;
7835
7836 if (!check_access(mdr, srci, MAY_WRITE))
7837 return;
7838 }
7839
7840 // with read lock, really verify oldin is empty
7841 if (oldin &&
7842 oldin->is_dir() &&
7843 _dir_is_nonempty(mdr, oldin)) {
7844 respond_to_request(mdr, -ENOTEMPTY);
7845 return;
7846 }
7847
7848 /* project_snaprealm_past_parent() will do this job
7849 *
7850 // moving between snaprealms?
7851 if (srcdnl->is_primary() && srci->is_multiversion() && !srci->snaprealm) {
7852 SnapRealm *srcrealm = srci->find_snaprealm();
7853 SnapRealm *destrealm = destdn->get_dir()->inode->find_snaprealm();
7854 if (srcrealm != destrealm &&
7855 (srcrealm->get_newest_seq() + 1 > srcdn->first ||
7856 destrealm->get_newest_seq() + 1 > srcdn->first)) {
7857 dout(10) << " renaming between snaprealms, creating snaprealm for " << *srci << dendl;
7858 mdcache->snaprealm_create(mdr, srci);
7859 return;
7860 }
7861 }
7862 */
7863
7864 ceph_assert(g_conf()->mds_kill_rename_at != 1);
7865
7866 // -- open all srcdn inode frags, if any --
7867 // we need these open so that auth can properly delegate from inode to dirfrags
7868 // after the inode is _ours_.
7869 if (srcdnl->is_primary() &&
7870 !srcdn->is_auth() &&
7871 srci->is_dir()) {
7872 dout(10) << "srci is remote dir, setting stickydirs and opening all frags" << dendl;
7873 mdr->set_stickydirs(srci);
7874
7875 frag_vec_t leaves;
7876 srci->dirfragtree.get_leaves(leaves);
7877 for (const auto& leaf : leaves) {
7878 CDir *dir = srci->get_dirfrag(leaf);
7879 if (!dir) {
7880 dout(10) << " opening " << leaf << " under " << *srci << dendl;
7881 mdcache->open_remote_dirfrag(srci, leaf, new C_MDS_RetryRequest(mdcache, mdr));
7882 return;
7883 }
7884 }
7885 }
7886
7887 // -- prepare snaprealm ---
7888
7889 if (linkmerge) {
7890 if (!mdr->more()->srci_srnode &&
7891 srci->get_projected_inode()->nlink == 1 &&
7892 srci->is_projected_snaprealm_global()) {
7893 sr_t *new_srnode = srci->prepare_new_srnode(0);
7894 srci->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, false);
7895
7896 srci->clear_snaprealm_global(new_srnode);
7897 mdr->more()->srci_srnode = new_srnode;
7898 }
7899 } else {
7900 if (oldin && !mdr->more()->desti_srnode) {
7901 if (oldin->is_projected_snaprealm_global()) {
7902 sr_t *new_srnode = oldin->prepare_new_srnode(0);
7903 oldin->record_snaprealm_parent_dentry(new_srnode, NULL, destdn, destdnl->is_primary());
7904 // dropping the last linkage or dropping the last remote linkage,
7905 // detach the inode from the global snaprealm
7906 auto nlink = oldin->get_projected_inode()->nlink;
7907 if (nlink == 1 ||
7908 (nlink == 2 && !destdnl->is_primary() &&
7909 !oldin->get_projected_parent_dir()->inode->is_stray()))
7910 oldin->clear_snaprealm_global(new_srnode);
7911 mdr->more()->desti_srnode = new_srnode;
7912 } else if (destdnl->is_primary()) {
7913 SnapRealm *dest_realm = destdir->inode->find_snaprealm();
7914 snapid_t follows = dest_realm->get_newest_seq();
7915 if (oldin->snaprealm || follows + 1 > oldin->get_oldest_snap()) {
7916 sr_t *new_srnode = oldin->prepare_new_srnode(follows);
7917 oldin->record_snaprealm_past_parent(new_srnode, straydn->get_dir()->inode->find_snaprealm());
7918 mdr->more()->desti_srnode = new_srnode;
7919 }
7920 }
7921 }
7922 if (!mdr->more()->srci_srnode) {
7923 SnapRealm *dest_realm = destdir->inode->find_snaprealm();
7924 if (srci->is_projected_snaprealm_global()) {
7925 sr_t *new_srnode = srci->prepare_new_srnode(0);
7926 srci->record_snaprealm_parent_dentry(new_srnode, dest_realm, srcdn, srcdnl->is_primary());
7927 mdr->more()->srci_srnode = new_srnode;
7928 } else if (srcdnl->is_primary()) {
7929 SnapRealm *src_realm = srcdir->inode->find_snaprealm();
7930 snapid_t follows = src_realm->get_newest_seq();
7931 if (src_realm != dest_realm &&
7932 (srci->snaprealm || follows + 1 > srci->get_oldest_snap())) {
7933 sr_t *new_srnode = srci->prepare_new_srnode(follows);
7934 srci->record_snaprealm_past_parent(new_srnode, dest_realm);
7935 mdr->more()->srci_srnode = new_srnode;
7936 }
7937 }
7938 }
7939 }
7940
7941 // -- prepare witnesses --
7942
7943 /*
7944 * NOTE: we use _all_ replicas as witnesses.
7945 * this probably isn't totally necessary (esp for file renames),
7946 * but if/when we change that, we have to make sure rejoin is
7947 * sufficiently robust to handle strong rejoins from survivors
7948 * with totally wrong dentry->inode linkage.
7949 * (currently, it can ignore rename effects, because the resolve
7950 * stage will sort them out.)
7951 */
7952 set<mds_rank_t> witnesses = mdr->more()->extra_witnesses;
7953 if (srcdn->is_auth())
7954 srcdn->list_replicas(witnesses);
7955 else
7956 witnesses.insert(srcdn->authority().first);
7957 if (srcdnl->is_remote() && !srci->is_auth())
7958 witnesses.insert(srci->authority().first);
7959 destdn->list_replicas(witnesses);
7960 if (destdnl->is_remote() && !oldin->is_auth())
7961 witnesses.insert(oldin->authority().first);
7962 dout(10) << " witnesses " << witnesses << ", have " << mdr->more()->witnessed << dendl;
7963
7964 if (!witnesses.empty()) {
7965 // Replicas can't see projected dentry linkages and will get confused.
7966 // We have taken snaplocks on ancestor inodes. Later rename/rmdir requests
7967 // can't project these inodes' linkages.
7968 bool need_flush = false;
7969 for (auto& dn : srctrace) {
7970 if (dn->is_projected()) {
7971 need_flush = true;
7972 break;
7973 }
7974 }
7975 if (!need_flush) {
7976 CDentry *dn = destdn;
7977 do {
7978 if (dn->is_projected()) {
7979 need_flush = true;
7980 break;
7981 }
7982 CInode *diri = dn->get_dir()->get_inode();
7983 dn = diri->get_projected_parent_dn();
7984 } while (dn);
7985 }
7986 if (need_flush) {
7987 mdlog->wait_for_safe(
7988 new MDSInternalContextWrapper(mds,
7989 new C_MDS_RetryRequest(mdcache, mdr)));
7990 mdlog->flush();
7991 return;
7992 }
7993 }
7994
7995 // do srcdn auth last
7996 mds_rank_t last = MDS_RANK_NONE;
7997 if (!srcdn->is_auth()) {
7998 last = srcdn->authority().first;
7999 mdr->more()->srcdn_auth_mds = last;
8000 // ask auth of srci to mark srci as ambiguous auth if more than two MDSs
8001 // are involved in the rename operation.
8002 if (srcdnl->is_primary() && !mdr->more()->is_ambiguous_auth) {
8003 dout(10) << " preparing ambiguous auth for srci" << dendl;
8004 ceph_assert(mdr->more()->is_remote_frozen_authpin);
8005 ceph_assert(mdr->more()->rename_inode == srci);
8006 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8007 return;
8008 }
8009 }
8010
8011 for (set<mds_rank_t>::iterator p = witnesses.begin();
8012 p != witnesses.end();
8013 ++p) {
8014 if (*p == last) continue; // do it last!
8015 if (mdr->more()->witnessed.count(*p)) {
8016 dout(10) << " already witnessed by mds." << *p << dendl;
8017 } else if (mdr->more()->waiting_on_slave.count(*p)) {
8018 dout(10) << " already waiting on witness mds." << *p << dendl;
8019 } else {
8020 if (!_rename_prepare_witness(mdr, *p, witnesses, srctrace, desttrace, straydn))
8021 return;
8022 }
8023 }
8024 if (!mdr->more()->waiting_on_slave.empty())
8025 return; // we're waiting for a witness.
8026
8027 if (last != MDS_RANK_NONE && mdr->more()->witnessed.count(last) == 0) {
8028 dout(10) << " preparing last witness (srcdn auth)" << dendl;
8029 ceph_assert(mdr->more()->waiting_on_slave.count(last) == 0);
8030 _rename_prepare_witness(mdr, last, witnesses, srctrace, desttrace, straydn);
8031 return;
8032 }
8033
8034 // test hack: bail after slave does prepare, so we can verify it's _live_ rollback.
8035 if (!mdr->more()->slaves.empty() && !srci->is_dir())
8036 ceph_assert(g_conf()->mds_kill_rename_at != 3);
8037 if (!mdr->more()->slaves.empty() && srci->is_dir())
8038 ceph_assert(g_conf()->mds_kill_rename_at != 4);
8039
8040 // -- declare now --
8041 mdr->set_mds_stamp(ceph_clock_now());
8042
8043 // -- prepare journal entry --
8044 mdr->ls = mdlog->get_current_segment();
8045 EUpdate *le = new EUpdate(mdlog, "rename");
8046 mdlog->start_entry(le);
8047 le->metablob.add_client_req(mdr->reqid, mdr->client_request->get_oldest_client_tid());
8048 if (!mdr->more()->witnessed.empty()) {
8049 dout(20) << " noting uncommitted_slaves " << mdr->more()->witnessed << dendl;
8050
8051 le->reqid = mdr->reqid;
8052 le->had_slaves = true;
8053
8054 mdcache->add_uncommitted_master(mdr->reqid, mdr->ls, mdr->more()->witnessed);
8055 // no need to send a frozen auth pin to the recovering auth MDS of srci
8056 mdr->more()->is_remote_frozen_authpin = false;
8057 }
8058
8059 _rename_prepare(mdr, &le->metablob, &le->client_map, srcdn, destdn, straydn);
8060 if (le->client_map.length())
8061 le->cmapv = mds->sessionmap.get_projected();
8062
8063 // -- commit locally --
8064 C_MDS_rename_finish *fin = new C_MDS_rename_finish(this, mdr, srcdn, destdn, straydn);
8065
8066 journal_and_reply(mdr, srci, destdn, le, fin);
8067 mds->balancer->maybe_fragment(destdn->get_dir(), false);
8068 }
8069
8070
8071 void Server::_rename_finish(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8072 {
8073 dout(10) << "_rename_finish " << *mdr << dendl;
8074
8075 if (!mdr->more()->witnessed.empty())
8076 mdcache->logged_master_update(mdr->reqid);
8077
8078 // apply
8079 _rename_apply(mdr, srcdn, destdn, straydn);
8080
8081 mdcache->send_dentry_link(destdn, mdr);
8082
8083 CDentry::linkage_t *destdnl = destdn->get_linkage();
8084 CInode *in = destdnl->get_inode();
8085 bool need_eval = mdr->more()->cap_imports.count(in);
8086
8087 // test hack: test slave commit
8088 if (!mdr->more()->slaves.empty() && !in->is_dir())
8089 ceph_assert(g_conf()->mds_kill_rename_at != 5);
8090 if (!mdr->more()->slaves.empty() && in->is_dir())
8091 ceph_assert(g_conf()->mds_kill_rename_at != 6);
8092
8093 // bump popularity
8094 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
8095 if (destdnl->is_remote() && in->is_auth())
8096 mds->balancer->hit_inode(in, META_POP_IWR);
8097
8098 // did we import srci? if so, explicitly ack that import before we unlock and reply.
8099
8100 ceph_assert(g_conf()->mds_kill_rename_at != 7);
8101
8102 // reply
8103 respond_to_request(mdr, 0);
8104
8105 if (need_eval)
8106 mds->locker->eval(in, CEPH_CAP_LOCKS, true);
8107
8108 // clean up?
8109 // respond_to_request() drops locks. So stray reintegration can race with us.
8110 if (straydn && !straydn->get_projected_linkage()->is_null()) {
8111 mdcache->notify_stray(straydn);
8112 }
8113 }
8114
8115
8116
8117 // helpers
8118
8119 bool Server::_rename_prepare_witness(MDRequestRef& mdr, mds_rank_t who, set<mds_rank_t> &witnesse,
8120 vector<CDentry*>& srctrace, vector<CDentry*>& dsttrace, CDentry *straydn)
8121 {
8122 if (mds->is_cluster_degraded() &&
8123 !mds->mdsmap->is_clientreplay_or_active_or_stopping(who)) {
8124 dout(10) << "_rename_prepare_witness mds." << who << " is not active" << dendl;
8125 if (mdr->more()->waiting_on_slave.empty())
8126 mds->wait_for_active_peer(who, new C_MDS_RetryRequest(mdcache, mdr));
8127 return false;
8128 }
8129
8130 dout(10) << "_rename_prepare_witness mds." << who << dendl;
8131 auto req = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREP);
8132
8133 req->srcdnpath = filepath(srctrace.front()->get_dir()->ino());
8134 for (auto dn : srctrace)
8135 req->srcdnpath.push_dentry(dn->get_name());
8136 req->destdnpath = filepath(dsttrace.front()->get_dir()->ino());
8137 for (auto dn : dsttrace)
8138 req->destdnpath.push_dentry(dn->get_name());
8139 if (straydn)
8140 mdcache->encode_replica_stray(straydn, who, req->straybl);
8141
8142 if (mdr->more()->srci_srnode)
8143 encode(*mdr->more()->srci_srnode, req->srci_snapbl);
8144 if (mdr->more()->desti_srnode)
8145 encode(*mdr->more()->desti_srnode, req->desti_snapbl);
8146
8147 req->srcdn_auth = mdr->more()->srcdn_auth_mds;
8148
8149 // srcdn auth will verify our current witness list is sufficient
8150 req->witnesses = witnesse;
8151
8152 req->op_stamp = mdr->get_op_stamp();
8153 mds->send_message_mds(req, who);
8154
8155 ceph_assert(mdr->more()->waiting_on_slave.count(who) == 0);
8156 mdr->more()->waiting_on_slave.insert(who);
8157 return true;
8158 }
8159
8160 version_t Server::_rename_prepare_import(MDRequestRef& mdr, CDentry *srcdn, bufferlist *client_map_bl)
8161 {
8162 version_t oldpv = mdr->more()->inode_import_v;
8163
8164 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8165
8166 /* import node */
8167 auto blp = mdr->more()->inode_import.cbegin();
8168
8169 // imported caps
8170 map<client_t,entity_inst_t> client_map;
8171 map<client_t, client_metadata_t> client_metadata_map;
8172 decode(client_map, blp);
8173 decode(client_metadata_map, blp);
8174 prepare_force_open_sessions(client_map, client_metadata_map,
8175 mdr->more()->imported_session_map);
8176 encode(client_map, *client_map_bl, mds->mdsmap->get_up_features());
8177 encode(client_metadata_map, *client_map_bl);
8178
8179 list<ScatterLock*> updated_scatterlocks;
8180 mdcache->migrator->decode_import_inode(srcdn, blp, srcdn->authority().first, mdr->ls,
8181 mdr->more()->cap_imports, updated_scatterlocks);
8182
8183 // hack: force back to !auth and clean, temporarily
8184 srcdnl->get_inode()->state_clear(CInode::STATE_AUTH);
8185 srcdnl->get_inode()->mark_clean();
8186
8187 return oldpv;
8188 }
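// Decode-order reminder (sketch): bufferlist decoding must mirror the
// exporter's encode order exactly. The import blob above is consumed
// strictly as
//
//   decode(client_map, blp);            // 1. client -> entity_inst_t
//   decode(client_metadata_map, blp);   // 2. client -> metadata
//   // 3. inode body, caps and scatterlocks, via
//   //    Migrator::decode_import_inode(...)
//
// reordering any of these steps would misparse the rest of the blob.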
8189
8190 bool Server::_need_force_journal(CInode *diri, bool empty)
8191 {
8192 auto&& dirs = diri->get_dirfrags();
8193
8194 bool force_journal = false;
8195 if (empty) {
8196 for (const auto& dir : dirs) {
8197 if (dir->is_subtree_root() && dir->get_dir_auth().first == mds->get_nodeid()) {
8198 dout(10) << " frag " << dir->get_frag() << " is auth subtree dirfrag, will force journal" << dendl;
8199 force_journal = true;
8200 break;
8201 } else
8202 dout(20) << " frag " << dir->get_frag() << " is not auth subtree dirfrag" << dendl;
8203 }
8204 } else {
8205 // see if any children of our frags are auth subtrees.
8206 std::vector<CDir*> subtrees;
8207 mdcache->get_subtrees(subtrees);
8208 dout(10) << " subtrees " << subtrees << " frags " << dirs << dendl;
8209 for (const auto& dir : dirs) {
8210 for (const auto& subtree : subtrees) {
8211 if (dir->contains(subtree)) {
8212 if (subtree->get_dir_auth().first == mds->get_nodeid()) {
8213 dout(10) << " frag " << dir->get_frag() << " contains (maybe) auth subtree, will force journal "
8214 << *subtree << dendl;
8215 force_journal = true;
8216 break;
8217 } else
8218 dout(20) << " frag " << dir->get_frag() << " contains but isn't auth for " << *subtree << dendl;
8219 } else
8220 dout(20) << " frag " << dir->get_frag() << " does not contain " << *subtree << dendl;
8221 }
8222 if (force_journal)
8223 break;
8224 }
8225 }
8226 return force_journal;
8227 }
8228
8229 void Server::_rename_prepare(MDRequestRef& mdr,
8230 EMetaBlob *metablob, bufferlist *client_map_bl,
8231 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8232 {
8233 dout(10) << "_rename_prepare " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8234 if (straydn)
8235 dout(10) << " straydn " << *straydn << dendl;
8236
8237 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8238 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8239 CInode *srci = srcdnl->get_inode();
8240 CInode *oldin = destdnl->get_inode();
8241
8242 // primary+remote link merge?
8243 bool linkmerge = (srci == oldin);
8244 if (linkmerge)
8245 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8246 bool silent = srcdn->get_dir()->inode->is_stray();
8247
8248 bool force_journal_dest = false;
8249 if (srci->is_dir() && !destdn->is_auth()) {
8250 if (srci->is_auth()) {
8251 // if we are auth for srci and exporting it, force journal because journal replay needs
8252 // the source inode to create auth subtrees.
8253 dout(10) << " we are exporting srci, will force journal destdn" << dendl;
8254 force_journal_dest = true;
8255 } else
8256 force_journal_dest = _need_force_journal(srci, false);
8257 }
8258
8259 bool force_journal_stray = false;
8260 if (oldin && oldin->is_dir() && straydn && !straydn->is_auth())
8261 force_journal_stray = _need_force_journal(oldin, true);
8262
8263 if (linkmerge)
8264 dout(10) << " merging remote and primary links to the same inode" << dendl;
8265 if (silent)
8266 dout(10) << " reintegrating stray; will avoid changing nlink or dir mtime" << dendl;
8267 if (force_journal_dest)
8268 dout(10) << " forcing journal destdn because we (will) have auth subtrees nested beneath it" << dendl;
8269 if (force_journal_stray)
8270 dout(10) << " forcing journal straydn because we (will) have auth subtrees nested beneath it" << dendl;
8271
8272 if (srci->is_dir() && (destdn->is_auth() || force_journal_dest)) {
8273 dout(10) << " noting renamed dir ino " << srci->ino() << " in metablob" << dendl;
8274 metablob->renamed_dirino = srci->ino();
8275 } else if (oldin && oldin->is_dir() && force_journal_stray) {
8276 dout(10) << " noting rename target dir " << oldin->ino() << " in metablob" << dendl;
8277 metablob->renamed_dirino = oldin->ino();
8278 }
8279
8280 // prepare
8281 CInode::mempool_inode *spi = 0; // renamed inode
8282 CInode::mempool_inode *tpi = 0; // target/overwritten inode
8283
8284 // target inode
8285 if (!linkmerge) {
8286 if (destdnl->is_primary()) {
8287 ceph_assert(straydn); // moving to straydn.
8288 // link--, and move.
8289 if (destdn->is_auth()) {
8290 auto &pi = oldin->project_inode(); // project_snaprealm
8291 pi.inode.version = straydn->pre_dirty(pi.inode.version);
8292 pi.inode.update_backtrace();
8293 tpi = &pi.inode;
8294 }
8295 straydn->push_projected_linkage(oldin);
8296 } else if (destdnl->is_remote()) {
8297 // nlink-- targeti
8298 if (oldin->is_auth()) {
8299 auto &pi = oldin->project_inode();
8300 pi.inode.version = oldin->pre_dirty();
8301 tpi = &pi.inode;
8302 }
8303 }
8304 }
8305
8306 // dest
8307 if (srcdnl->is_remote()) {
8308 if (!linkmerge) {
8309 // destdn
8310 if (destdn->is_auth())
8311 mdr->more()->pvmap[destdn] = destdn->pre_dirty();
8312 destdn->push_projected_linkage(srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8313 // srci
8314 if (srci->is_auth()) {
8315 auto &pi = srci->project_inode();
8316 pi.inode.version = srci->pre_dirty();
8317 spi = &pi.inode;
8318 }
8319 } else {
8320 dout(10) << " will merge remote onto primary link" << dendl;
8321 if (destdn->is_auth()) {
8322 auto &pi = oldin->project_inode();
8323 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldin->inode.version);
8324 spi = &pi.inode;
8325 }
8326 }
8327 } else { // primary
8328 if (destdn->is_auth()) {
8329 version_t oldpv;
8330 if (srcdn->is_auth())
8331 oldpv = srci->get_projected_version();
8332 else {
8333 oldpv = _rename_prepare_import(mdr, srcdn, client_map_bl);
8334
8335 // note which dirfrags have child subtrees in the journal
8336 // event, so that we can open those (as bounds) during replay.
8337 if (srci->is_dir()) {
8338 auto&& ls = srci->get_dirfrags();
8339 for (const auto& dir : ls) {
8340 if (!dir->is_auth())
8341 metablob->renamed_dir_frags.push_back(dir->get_frag());
8342 }
8343 dout(10) << " noting renamed dir open frags " << metablob->renamed_dir_frags << dendl;
8344 }
8345 }
8346 auto &pi = srci->project_inode(); // project snaprealm if srcdnl->is_primary
8347 // & srcdnl->snaprealm
8348 pi.inode.version = mdr->more()->pvmap[destdn] = destdn->pre_dirty(oldpv);
8349 pi.inode.update_backtrace();
8350 spi = &pi.inode;
8351 }
8352 destdn->push_projected_linkage(srci);
8353 }
8354
8355 // src
8356 if (srcdn->is_auth())
8357 mdr->more()->pvmap[srcdn] = srcdn->pre_dirty();
8358 srcdn->push_projected_linkage(); // push null linkage
8359
8360 if (!silent) {
8361 if (spi) {
8362 spi->ctime = mdr->get_op_stamp();
8363 if (mdr->get_op_stamp() > spi->rstat.rctime)
8364 spi->rstat.rctime = mdr->get_op_stamp();
8365 spi->change_attr++;
8366 if (linkmerge)
8367 spi->nlink--;
8368 }
8369 if (tpi) {
8370 tpi->ctime = mdr->get_op_stamp();
8371 if (mdr->get_op_stamp() > tpi->rstat.rctime)
8372 tpi->rstat.rctime = mdr->get_op_stamp();
8373 tpi->change_attr++;
8374 {
8375 std::string t;
8376 destdn->make_path_string(t, true);
8377 tpi->stray_prior_path = std::move(t);
8378 }
8379 tpi->nlink--;
8380 if (tpi->nlink == 0)
8381 oldin->state_set(CInode::STATE_ORPHAN);
8382 }
8383 }
8384
8385 // prepare nesting, mtime updates
8386 int predirty_dir = silent ? 0:PREDIRTY_DIR;
8387
8388 // guarantee the stray dir is processed first during journal replay: unlink the old inode,
8389 // then link the source inode to destdn
8390 if (destdnl->is_primary()) {
8391 ceph_assert(straydn);
8392 if (straydn->is_auth()) {
8393 metablob->add_dir_context(straydn->get_dir());
8394 metablob->add_dir(straydn->get_dir(), true);
8395 }
8396 }
8397
8398 // sub off target
8399 if (destdn->is_auth() && !destdnl->is_null()) {
8400 mdcache->predirty_journal_parents(mdr, metablob, oldin, destdn->get_dir(),
8401 (destdnl->is_primary() ? PREDIRTY_PRIMARY:0)|predirty_dir, -1);
8402 if (destdnl->is_primary()) {
8403 ceph_assert(straydn);
8404 mdcache->predirty_journal_parents(mdr, metablob, oldin, straydn->get_dir(),
8405 PREDIRTY_PRIMARY|PREDIRTY_DIR, 1);
8406 }
8407 }
8408
8409 // move srcdn
8410 int predirty_primary = (srcdnl->is_primary() && srcdn->get_dir() != destdn->get_dir()) ? PREDIRTY_PRIMARY:0;
8411 int flags = predirty_dir | predirty_primary;
8412 if (srcdn->is_auth())
8413 mdcache->predirty_journal_parents(mdr, metablob, srci, srcdn->get_dir(), PREDIRTY_SHALLOW|flags, -1);
8414 if (destdn->is_auth())
8415 mdcache->predirty_journal_parents(mdr, metablob, srci, destdn->get_dir(), flags, 1);
8416
8417 // add it all to the metablob
8418 // target inode
8419 if (!linkmerge) {
8420 if (destdnl->is_primary()) {
8421 ceph_assert(straydn);
8422 if (destdn->is_auth()) {
8423 // project snaprealm, too
8424 if (auto& desti_srnode = mdr->more()->desti_srnode) {
8425 oldin->project_snaprealm(desti_srnode);
8426 if (tpi->nlink == 0)
8427 ceph_assert(!desti_srnode->is_parent_global());
8428 desti_srnode = NULL;
8429 }
8430 straydn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8431 metablob->add_primary_dentry(straydn, oldin, true, true);
8432 } else if (force_journal_stray) {
8433 dout(10) << " forced journaling straydn " << *straydn << dendl;
8434 metablob->add_dir_context(straydn->get_dir());
8435 metablob->add_primary_dentry(straydn, oldin, true);
8436 }
8437 } else if (destdnl->is_remote()) {
8438 if (oldin->is_auth()) {
8439 sr_t *new_srnode = NULL;
8440 if (mdr->slave_request) {
8441 if (mdr->slave_request->desti_snapbl.length() > 0) {
8442 new_srnode = new sr_t();
8443 auto p = mdr->slave_request->desti_snapbl.cbegin();
8444 decode(*new_srnode, p);
8445 }
8446 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8447 new_srnode = desti_srnode;
8448 desti_srnode = NULL;
8449 }
8450 if (new_srnode) {
8451 oldin->project_snaprealm(new_srnode);
8452 if (tpi->nlink == 0)
8453 ceph_assert(!new_srnode->is_parent_global());
8454 }
8455 // auth for targeti
8456 metablob->add_dir_context(oldin->get_projected_parent_dir());
8457 mdcache->journal_cow_dentry(mdr.get(), metablob, oldin->get_projected_parent_dn(),
8458 CEPH_NOSNAP, 0, destdnl);
8459 metablob->add_primary_dentry(oldin->get_projected_parent_dn(), oldin, true);
8460 }
8461 }
8462 }
8463
8464 // dest
8465 if (srcdnl->is_remote()) {
8466 ceph_assert(!linkmerge);
8467 if (destdn->is_auth() && !destdnl->is_null())
8468 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8469 else
8470 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8471
8472 if (destdn->is_auth())
8473 metablob->add_remote_dentry(destdn, true, srcdnl->get_remote_ino(), srcdnl->get_remote_d_type());
8474
8475 if (srci->is_auth()) { // it's remote
8476 if (mdr->slave_request) {
8477 if (mdr->slave_request->srci_snapbl.length() > 0) {
8478 sr_t *new_srnode = new sr_t();
8479 auto p = mdr->slave_request->srci_snapbl.cbegin();
8480 decode(*new_srnode, p);
8481 srci->project_snaprealm(new_srnode);
8482 }
8483 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8484 srci->project_snaprealm(srci_srnode);
8485 srci_srnode = NULL;
8486 }
8487
8488 CDentry *srci_pdn = srci->get_projected_parent_dn();
8489 metablob->add_dir_context(srci_pdn->get_dir());
8490 mdcache->journal_cow_dentry(mdr.get(), metablob, srci_pdn, CEPH_NOSNAP, 0, srcdnl);
8491 metablob->add_primary_dentry(srci_pdn, srci, true);
8492 }
8493 } else if (srcdnl->is_primary()) {
8494 // project snap parent update?
8495 if (destdn->is_auth()) {
8496 if (auto& srci_srnode = mdr->more()->srci_srnode) {
8497 srci->project_snaprealm(srci_srnode);
8498 srci_srnode = NULL;
8499 }
8500 }
8501
8502 if (destdn->is_auth() && !destdnl->is_null())
8503 mdcache->journal_cow_dentry(mdr.get(), metablob, destdn, CEPH_NOSNAP, 0, destdnl);
8504
8505 destdn->first = mdcache->get_global_snaprealm()->get_newest_seq() + 1;
8506
8507 if (destdn->is_auth())
8508 metablob->add_primary_dentry(destdn, srci, true, true);
8509 else if (force_journal_dest) {
8510 dout(10) << " forced journaling destdn " << *destdn << dendl;
8511 metablob->add_dir_context(destdn->get_dir());
8512 metablob->add_primary_dentry(destdn, srci, true);
8513 if (srcdn->is_auth() && srci->is_dir()) {
8514 // journal new subtrees root dirfrags
8515 auto&& ls = srci->get_dirfrags();
8516 for (const auto& dir : ls) {
8517 if (dir->is_auth())
8518 metablob->add_dir(dir, true);
8519 }
8520 }
8521 }
8522 }
8523
8524 // src
8525 if (srcdn->is_auth()) {
8526 dout(10) << " journaling srcdn " << *srcdn << dendl;
8527 mdcache->journal_cow_dentry(mdr.get(), metablob, srcdn, CEPH_NOSNAP, 0, srcdnl);
8528 // also journal the inode in case we need to do a slave rename rollback. It is OK to add
8529 // both primary and NULL dentries: during journal replay, the null dentry is
8530 // processed after the primary dentry.
8531 if (srcdnl->is_primary() && !srci->is_dir() && !destdn->is_auth())
8532 metablob->add_primary_dentry(srcdn, srci, true);
8533 metablob->add_null_dentry(srcdn, true);
8534 } else
8535 dout(10) << " NOT journaling srcdn " << *srcdn << dendl;
8536
8537 // make renamed inode first track the dn
8538 if (srcdnl->is_primary() && destdn->is_auth()) {
8539 ceph_assert(srci->first <= destdn->first);
8540 srci->first = destdn->first;
8541 }
8542 // make stray inode first track the straydn
8543 if (straydn && straydn->is_auth()) {
8544 ceph_assert(oldin->first <= straydn->first);
8545 oldin->first = straydn->first;
8546 }
8547
8548 if (oldin && oldin->is_dir()) {
8549 ceph_assert(straydn);
8550 mdcache->project_subtree_rename(oldin, destdn->get_dir(), straydn->get_dir());
8551 }
8552 if (srci->is_dir())
8553 mdcache->project_subtree_rename(srci, srcdn->get_dir(), destdn->get_dir());
8554
8555 }
8556
8557
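// make the projected rename state live, in the same order journal replay
// will see it: pop projected snaprealms before any linkage change (so an
// old realm can split its inodes_with_caps), unlink the overwritten target
// into the stray dir, unlink srcdn and relink destdn, adopt imported caps
// and xlocks if the renamed inode migrated to this MDS, then adjust the
// subtree map for any renamed directories.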
8558 void Server::_rename_apply(MDRequestRef& mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn)
8559 {
8560 dout(10) << "_rename_apply " << *mdr << " " << *srcdn << " " << *destdn << dendl;
8561 dout(10) << " pvs " << mdr->more()->pvmap << dendl;
8562
8563 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
8564 CDentry::linkage_t *destdnl = destdn->get_linkage();
8565
8566 CInode *oldin = destdnl->get_inode();
8567
8568 // primary+remote link merge?
8569 bool linkmerge = (srcdnl->get_inode() == oldin);
8570 if (linkmerge)
8571 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8572
8573 bool new_in_snaprealm = false;
8574 bool new_oldin_snaprealm = false;
8575
8576 // target inode
8577 if (!linkmerge) {
8578 if (destdnl->is_primary()) {
8579 ceph_assert(straydn);
8580 dout(10) << "straydn is " << *straydn << dendl;
8581
8582 // if there is a newly created snaprealm, we need to split the old snaprealm's
8583 // inodes_with_caps, so pop the snaprealm before the linkage changes.
8584 if (destdn->is_auth()) {
8585 bool hadrealm = (oldin->snaprealm ? true : false);
8586 oldin->early_pop_projected_snaprealm();
8587 new_oldin_snaprealm = (oldin->snaprealm && !hadrealm);
8588 } else {
8589 ceph_assert(mdr->slave_request);
8590 if (mdr->slave_request->desti_snapbl.length()) {
8591 new_oldin_snaprealm = !oldin->snaprealm;
8592 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8593 ceph_assert(oldin->snaprealm);
8594 ceph_assert(oldin->snaprealm->have_past_parents_open());
8595 }
8596 }
8597
8598 destdn->get_dir()->unlink_inode(destdn, false);
8599
8600 straydn->pop_projected_linkage();
8601 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8602 ceph_assert(!straydn->is_projected()); // no other projected
8603
8604 // nlink-- targeti
8605 if (destdn->is_auth())
8606 oldin->pop_and_dirty_projected_inode(mdr->ls);
8607
8608 mdcache->touch_dentry_bottom(straydn); // drop dn as quickly as possible.
8609 } else if (destdnl->is_remote()) {
8610 destdn->get_dir()->unlink_inode(destdn, false);
8611 if (oldin->is_auth()) {
8612 oldin->pop_and_dirty_projected_inode(mdr->ls);
8613 } else if (mdr->slave_request) {
8614 if (mdr->slave_request->desti_snapbl.length() > 0) {
8615 ceph_assert(oldin->snaprealm);
8616 oldin->decode_snap_blob(mdr->slave_request->desti_snapbl);
8617 }
8618 } else if (auto& desti_srnode = mdr->more()->desti_srnode) {
8619 delete desti_srnode;
8620 desti_srnode = NULL;
8621 }
8622 }
8623 }
8624
8625 // unlink src before we relink it at dest
8626 CInode *in = srcdnl->get_inode();
8627 ceph_assert(in);
8628
8629 bool srcdn_was_remote = srcdnl->is_remote();
8630 if (!srcdn_was_remote) {
8631 // if there is a newly created snaprealm, we need to split the old snaprealm's
8632 // inodes_with_caps, so pop the snaprealm before the linkage changes.
8633 if (destdn->is_auth()) {
8634 bool hadrealm = (in->snaprealm ? true : false);
8635 in->early_pop_projected_snaprealm();
8636 new_in_snaprealm = (in->snaprealm && !hadrealm);
8637 } else {
8638 ceph_assert(mdr->slave_request);
8639 if (mdr->slave_request->srci_snapbl.length()) {
8640 new_in_snaprealm = !in->snaprealm;
8641 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8642 ceph_assert(in->snaprealm);
8643 ceph_assert(in->snaprealm->have_past_parents_open());
8644 }
8645 }
8646 }
8647
8648 srcdn->get_dir()->unlink_inode(srcdn);
8649
8650 // dest
8651 if (srcdn_was_remote) {
8652 if (!linkmerge) {
8653 // destdn
8654 destdnl = destdn->pop_projected_linkage();
8655 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8656 ceph_assert(!destdn->is_projected()); // no other projected
8657
8658 destdn->link_remote(destdnl, in);
8659 if (destdn->is_auth())
8660 destdn->mark_dirty(mdr->more()->pvmap[destdn], mdr->ls);
8661 // in
8662 if (in->is_auth()) {
8663 in->pop_and_dirty_projected_inode(mdr->ls);
8664 } else if (mdr->slave_request) {
8665 if (mdr->slave_request->srci_snapbl.length() > 0) {
8666 ceph_assert(in->snaprealm);
8667 in->decode_snap_blob(mdr->slave_request->srci_snapbl);
8668 }
8669 } else if (auto& srci_srnode = mdr->more()->srci_srnode) {
8670 delete srci_srnode;
8671 srci_srnode = NULL;
8672 }
8673 } else {
8674 dout(10) << "merging remote onto primary link" << dendl;
8675 oldin->pop_and_dirty_projected_inode(mdr->ls);
8676 }
8677 } else { // primary
8678 if (linkmerge) {
8679 dout(10) << "merging primary onto remote link" << dendl;
8680 destdn->get_dir()->unlink_inode(destdn, false);
8681 }
8682 destdnl = destdn->pop_projected_linkage();
8683 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8684 ceph_assert(!destdn->is_projected()); // no other projected
8685
8686 // srcdn inode import?
8687 if (!srcdn->is_auth() && destdn->is_auth()) {
8688 ceph_assert(mdr->more()->inode_import.length() > 0);
8689
8690 map<client_t,Capability::Import> imported_caps;
8691
8692 // finish cap imports
8693 finish_force_open_sessions(mdr->more()->imported_session_map);
8694 if (mdr->more()->cap_imports.count(destdnl->get_inode())) {
8695 mdcache->migrator->finish_import_inode_caps(destdnl->get_inode(),
8696 mdr->more()->srcdn_auth_mds, true,
8697 mdr->more()->imported_session_map,
8698 mdr->more()->cap_imports[destdnl->get_inode()],
8699 imported_caps);
8700 }
8701
8702 mdr->more()->inode_import.clear();
8703 encode(imported_caps, mdr->more()->inode_import);
8704
8705 /* hack: add an auth pin for each xlock we hold. These were
8706 * remote xlocks previously but now they're local, and
8707 * we're going to try to unpin them when we xlock_finish. */
8708
8709 for (auto i = mdr->locks.lower_bound(&destdnl->get_inode()->versionlock);
8710 i != mdr->locks.end();
8711 ++i) {
8712 SimpleLock *lock = i->lock;
8713 if (lock->get_parent() != destdnl->get_inode())
8714 break;
8715 if (i->is_xlock() && !lock->is_locallock())
8716 mds->locker->xlock_import(lock);
8717 }
8718
8719 // hack: fix auth bit
8720 in->state_set(CInode::STATE_AUTH);
8721
8722 mdr->clear_ambiguous_auth();
8723 }
8724
8725 if (destdn->is_auth())
8726 in->pop_and_dirty_projected_inode(mdr->ls);
8727 }
8728
8729 // src
8730 if (srcdn->is_auth())
8731 srcdn->mark_dirty(mdr->more()->pvmap[srcdn], mdr->ls);
8732 srcdn->pop_projected_linkage();
8733 if (mdr->is_slave() && !mdr->more()->slave_update_journaled)
8734 ceph_assert(!srcdn->is_projected()); // no other projected
8735
8736 // apply remaining projected inodes (nested)
8737 mdr->apply();
8738
8739 // update subtree map?
8740 if (destdnl->is_primary() && in->is_dir())
8741 mdcache->adjust_subtree_after_rename(in, srcdn->get_dir(), true);
8742
8743 if (straydn && oldin->is_dir())
8744 mdcache->adjust_subtree_after_rename(oldin, destdn->get_dir(), true);
8745
8746 if (new_oldin_snaprealm)
8747 mdcache->do_realm_invalidate_and_update_notify(oldin, CEPH_SNAP_OP_SPLIT, false);
8748 if (new_in_snaprealm)
8749 mdcache->do_realm_invalidate_and_update_notify(in, CEPH_SNAP_OP_SPLIT, true);
8750
8751 // removing a new dn?
8752 if (srcdn->is_auth())
8753 srcdn->get_dir()->try_remove_unlinked_dn(srcdn);
8754 }
8755
8756
8757
8758 // ------------
8759 // SLAVE
8760
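// slave-side rename. the master drives the operation; each witness slave
// journals an ESlaveUpdate::OP_PREPARE carrying enough rollback state to
// undo the rename (or replies immediately if it has nothing to journal),
// acks with OP_RENAMEPREPACK, and later either commits (OP_COMMIT) or
// rolls back (OP_ROLLBACK) depending on the master's outcome.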
8761 class C_MDS_SlaveRenamePrep : public ServerLogContext {
8762 CDentry *srcdn, *destdn, *straydn;
8763 public:
8764 C_MDS_SlaveRenamePrep(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8765 ServerLogContext(s, m), srcdn(sr), destdn(de), straydn(st) {}
8766 void finish(int r) override {
8767 server->_logged_slave_rename(mdr, srcdn, destdn, straydn);
8768 }
8769 };
8770
8771 class C_MDS_SlaveRenameCommit : public ServerContext {
8772 MDRequestRef mdr;
8773 CDentry *srcdn, *destdn, *straydn;
8774 public:
8775 C_MDS_SlaveRenameCommit(Server *s, MDRequestRef& m, CDentry *sr, CDentry *de, CDentry *st) :
8776 ServerContext(s), mdr(m), srcdn(sr), destdn(de), straydn(st) {}
8777 void finish(int r) override {
8778 server->_commit_slave_rename(mdr, r, srcdn, destdn, straydn);
8779 }
8780 };
8781
8782 class C_MDS_SlaveRenameSessionsFlushed : public ServerContext {
8783 MDRequestRef mdr;
8784 public:
8785 C_MDS_SlaveRenameSessionsFlushed(Server *s, MDRequestRef& r) :
8786 ServerContext(s), mdr(r) {}
8787 void finish(int r) override {
8788 server->_slave_rename_sessions_flushed(mdr);
8789 }
8790 };
8791
8792 void Server::handle_slave_rename_prep(MDRequestRef& mdr)
8793 {
8794 dout(10) << "handle_slave_rename_prep " << *mdr
8795 << " " << mdr->slave_request->srcdnpath
8796 << " to " << mdr->slave_request->destdnpath
8797 << dendl;
8798
8799 if (mdr->slave_request->is_interrupted()) {
8800 dout(10) << " slave request interrupted, sending noop reply" << dendl;
8801 auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8802 reply->mark_interrupted();
8803 mds->send_message_mds(reply, mdr->slave_to_mds);
8804 mdr->reset_slave_request();
8805 return;
8806 }
8807
8808 // discover destdn
8809 filepath destpath(mdr->slave_request->destdnpath);
8810 dout(10) << " dest " << destpath << dendl;
8811 vector<CDentry*> trace;
8812 CF_MDS_MDRContextFactory cf(mdcache, mdr, false);
8813 int r = mdcache->path_traverse(mdr, cf, destpath,
8814 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED | MDS_TRAVERSE_WANT_DENTRY,
8815 &trace);
8816 if (r > 0) return;
8817 if (r == -ESTALE) {
8818 mdcache->find_ino_peers(destpath.get_ino(), new C_MDS_RetryRequest(mdcache, mdr),
8819 mdr->slave_to_mds, true);
8820 return;
8821 }
8822 ceph_assert(r == 0); // we shouldn't get an error here!
8823
8824 CDentry *destdn = trace.back();
8825 CDentry::linkage_t *destdnl = destdn->get_projected_linkage();
8826 dout(10) << " destdn " << *destdn << dendl;
8827 mdr->pin(destdn);
8828
8829 // discover srcdn
8830 filepath srcpath(mdr->slave_request->srcdnpath);
8831 dout(10) << " src " << srcpath << dendl;
8832 CInode *srci = nullptr;
8833 r = mdcache->path_traverse(mdr, cf, srcpath,
8834 MDS_TRAVERSE_DISCOVER | MDS_TRAVERSE_PATH_LOCKED,
8835 &trace, &srci);
8836 if (r > 0) return;
8837 ceph_assert(r == 0);
8838
8839 CDentry *srcdn = trace.back();
8840 CDentry::linkage_t *srcdnl = srcdn->get_projected_linkage();
8841 dout(10) << " srcdn " << *srcdn << dendl;
8842 mdr->pin(srcdn);
8843 mdr->pin(srci);
8844
8845 // stray?
8846 bool linkmerge = srcdnl->get_inode() == destdnl->get_inode();
8847 if (linkmerge)
8848 ceph_assert(srcdnl->is_primary() && destdnl->is_remote());
8849 CDentry *straydn = mdr->straydn;
8850 if (destdnl->is_primary() && !linkmerge)
8851 ceph_assert(straydn);
8852
8853 mdr->set_op_stamp(mdr->slave_request->op_stamp);
8854 mdr->more()->srcdn_auth_mds = srcdn->authority().first;
8855
8856 // set up commit waiter (early, to clean up any freezing etc we do)
8857 if (!mdr->more()->slave_commit)
8858 mdr->more()->slave_commit = new C_MDS_SlaveRenameCommit(this, mdr, srcdn, destdn, straydn);
8859
8860 // am i srcdn auth?
8861 if (srcdn->is_auth()) {
8862 set<mds_rank_t> srcdnrep;
8863 srcdn->list_replicas(srcdnrep);
8864
8865 bool reply_witness = false;
8866 if (srcdnl->is_primary() && !srcdnl->get_inode()->state_test(CInode::STATE_AMBIGUOUSAUTH)) {
8867 // freeze?
8868 // we need this to
8869 // - avoid conflicting lock state changes
8870 // - avoid concurrent updates to the inode
8871 // (this could also be accomplished with the versionlock)
8872 int allowance = 3; // 1 for the mdr auth_pin, 1 for the link lock, 1 for the snap lock
8873 dout(10) << " freezing srci " << *srcdnl->get_inode() << " with allowance " << allowance << dendl;
8874 bool frozen_inode = srcdnl->get_inode()->freeze_inode(allowance);
8875
8876 // unfreeze auth pin after freezing the inode to avoid queueing waiters
8877 if (srcdnl->get_inode()->is_frozen_auth_pin())
8878 mdr->unfreeze_auth_pin();
8879
8880 if (!frozen_inode) {
8881 srcdnl->get_inode()->add_waiter(CInode::WAIT_FROZEN, new C_MDS_RetryRequest(mdcache, mdr));
8882 return;
8883 }
8884
8885 /*
8886 * set ambiguous auth for srci
8887 * NOTE: we don't worry about ambiguous cache expire as we do
8888 * with subtree migrations because all slaves will pin
8889 * srcdn->get_inode() for duration of this rename.
8890 */
8891 mdr->set_ambiguous_auth(srcdnl->get_inode());
8892
8893 // just mark the source inode as ambiguous auth if more than two MDSes are involved.
8894 // the master will send another OP_RENAMEPREP slave request later.
8895 if (mdr->slave_request->witnesses.size() > 1) {
8896 dout(10) << " set srci ambiguous auth; providing srcdn replica list" << dendl;
8897 reply_witness = true;
8898 }
8899
8900 // make sure bystanders have received all lock related messages
8901 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
8902 if (*p == mdr->slave_to_mds ||
8903 (mds->is_cluster_degraded() &&
8904 !mds->mdsmap->is_clientreplay_or_active_or_stopping(*p)))
8905 continue;
8906 auto notify = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMENOTIFY);
8907 mds->send_message_mds(notify, *p);
8908 mdr->more()->waiting_on_slave.insert(*p);
8909 }
8910
8911 // make sure clients have received all cap related messages
8912 set<client_t> export_client_set;
8913 mdcache->migrator->get_export_client_set(srcdnl->get_inode(), export_client_set);
8914
8915 MDSGatherBuilder gather(g_ceph_context);
8916 flush_client_sessions(export_client_set, gather);
8917 if (gather.has_subs()) {
8918 mdr->more()->waiting_on_slave.insert(MDS_RANK_NONE);
8919 gather.set_finisher(new C_MDS_SlaveRenameSessionsFlushed(this, mdr));
8920 gather.activate();
8921 }
8922 }
8923
8924 // is witness list sufficient?
8925 for (set<mds_rank_t>::iterator p = srcdnrep.begin(); p != srcdnrep.end(); ++p) {
8926 if (*p == mdr->slave_to_mds ||
8927 mdr->slave_request->witnesses.count(*p)) continue;
8928 dout(10) << " witness list insufficient; providing srcdn replica list" << dendl;
8929 reply_witness = true;
8930 break;
8931 }
8932
8933 if (reply_witness) {
8934 ceph_assert(!srcdnrep.empty());
8935 auto reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
8936 reply->witnesses.swap(srcdnrep);
8937 mds->send_message_mds(reply, mdr->slave_to_mds);
8938 mdr->reset_slave_request();
8939 return;
8940 }
8941 dout(10) << " witness list sufficient: includes all srcdn replicas" << dendl;
8942 if (!mdr->more()->waiting_on_slave.empty()) {
8943 dout(10) << " still waiting for rename notify acks from "
8944 << mdr->more()->waiting_on_slave << dendl;
8945 return;
8946 }
8947 } else if (srcdnl->is_primary() && srcdn->authority() != destdn->authority()) {
8948 // set ambiguous auth for srci on witnesses
8949 mdr->set_ambiguous_auth(srcdnl->get_inode());
8950 }
8951
8952 // encode everything we'd need to roll this back... basically, just the original state.
8953 rename_rollback rollback;
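// the rollback blob captures the pre-rename state of every name involved.
// roughly (see rename_rollback's definition for the authoritative layout):
// reqid; a record each for orig_src, orig_dest and the stray (dirfrag,
// old dirfrag mtime/rctime, dname, primary ino or remote_ino+remote_d_type,
// old ctime); the rename's ctime; and optional srci/desti snaprealm blobs,
// each prefixed by a bool as encoded below.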
8954
8955 rollback.reqid = mdr->reqid;
8956
8957 rollback.orig_src.dirfrag = srcdn->get_dir()->dirfrag();
8958 rollback.orig_src.dirfrag_old_mtime = srcdn->get_dir()->get_projected_fnode()->fragstat.mtime;
8959 rollback.orig_src.dirfrag_old_rctime = srcdn->get_dir()->get_projected_fnode()->rstat.rctime;
8960 rollback.orig_src.dname = srcdn->get_name();
8961 if (srcdnl->is_primary())
8962 rollback.orig_src.ino = srcdnl->get_inode()->ino();
8963 else {
8964 ceph_assert(srcdnl->is_remote());
8965 rollback.orig_src.remote_ino = srcdnl->get_remote_ino();
8966 rollback.orig_src.remote_d_type = srcdnl->get_remote_d_type();
8967 }
8968
8969 rollback.orig_dest.dirfrag = destdn->get_dir()->dirfrag();
8970 rollback.orig_dest.dirfrag_old_mtime = destdn->get_dir()->get_projected_fnode()->fragstat.mtime;
8971 rollback.orig_dest.dirfrag_old_rctime = destdn->get_dir()->get_projected_fnode()->rstat.rctime;
8972 rollback.orig_dest.dname = destdn->get_name();
8973 if (destdnl->is_primary())
8974 rollback.orig_dest.ino = destdnl->get_inode()->ino();
8975 else if (destdnl->is_remote()) {
8976 rollback.orig_dest.remote_ino = destdnl->get_remote_ino();
8977 rollback.orig_dest.remote_d_type = destdnl->get_remote_d_type();
8978 }
8979
8980 if (straydn) {
8981 rollback.stray.dirfrag = straydn->get_dir()->dirfrag();
8982 rollback.stray.dirfrag_old_mtime = straydn->get_dir()->get_projected_fnode()->fragstat.mtime;
8983 rollback.stray.dirfrag_old_rctime = straydn->get_dir()->get_projected_fnode()->rstat.rctime;
8984 rollback.stray.dname = straydn->get_name();
8985 }
8986 if (mdr->slave_request->desti_snapbl.length()) {
8987 CInode *oldin = destdnl->get_inode();
8988 if (oldin->snaprealm) {
8989 encode(true, rollback.desti_snapbl);
8990 oldin->encode_snap_blob(rollback.desti_snapbl);
8991 } else {
8992 encode(false, rollback.desti_snapbl);
8993 }
8994 }
8995 if (mdr->slave_request->srci_snapbl.length()) {
8996 if (srci->snaprealm) {
8997 encode(true, rollback.srci_snapbl);
8998 srci->encode_snap_blob(rollback.srci_snapbl);
8999 } else {
9000 encode(false, rollback.srci_snapbl);
9001 }
9002 }
9003 encode(rollback, mdr->more()->rollback_bl);
9004 // FIXME: rollback snaprealm
9005 dout(20) << " rollback is " << mdr->more()->rollback_bl.length() << " bytes" << dendl;
9006
9007 // journal.
9008 mdr->ls = mdlog->get_current_segment();
9009 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_prep", mdr->reqid, mdr->slave_to_mds,
9010 ESlaveUpdate::OP_PREPARE, ESlaveUpdate::RENAME);
9011 mdlog->start_entry(le);
9012 le->rollback = mdr->more()->rollback_bl;
9013
9014 bufferlist blah; // inode import data... obviously not used if we're the slave
9015 _rename_prepare(mdr, &le->commit, &blah, srcdn, destdn, straydn);
9016
9017 if (le->commit.empty()) {
9018 dout(10) << " empty metablob, skipping journal" << dendl;
9019 mdlog->cancel_entry(le);
9020 mdr->ls = NULL;
9021 _logged_slave_rename(mdr, srcdn, destdn, straydn);
9022 } else {
9023 mdcache->add_uncommitted_slave(mdr->reqid, mdr->ls, mdr->slave_to_mds);
9024 mdr->more()->slave_update_journaled = true;
9025 submit_mdlog_entry(le, new C_MDS_SlaveRenamePrep(this, mdr, srcdn, destdn, straydn),
9026 mdr, __func__);
9027 mdlog->flush();
9028 }
9029 }
9030
9031 void Server::_logged_slave_rename(MDRequestRef& mdr,
9032 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9033 {
9034 dout(10) << "_logged_slave_rename " << *mdr << dendl;
9035
9036 // prepare ack
9037 ref_t<MMDSSlaveRequest> reply;
9038 if (!mdr->aborted) {
9039 reply = make_message<MMDSSlaveRequest>(mdr->reqid, mdr->attempt, MMDSSlaveRequest::OP_RENAMEPREPACK);
9040 if (!mdr->more()->slave_update_journaled)
9041 reply->mark_not_journaled();
9042 }
9043
9044 CDentry::linkage_t *srcdnl = srcdn->get_linkage();
9045 //CDentry::linkage_t *straydnl = straydn ? straydn->get_linkage() : 0;
9046
9047 // export srci?
9048 if (srcdn->is_auth() && srcdnl->is_primary()) {
9049 // set export bounds for CInode::encode_export()
9050 if (reply) {
9051 std::vector<CDir*> bounds;
9052 if (srcdnl->get_inode()->is_dir()) {
9053 srcdnl->get_inode()->get_dirfrags(bounds);
9054 for (const auto& bound : bounds) {
9055 bound->state_set(CDir::STATE_EXPORTBOUND);
9056 }
9057 }
9058
9059 map<client_t,entity_inst_t> exported_client_map;
9060 map<client_t, client_metadata_t> exported_client_metadata_map;
9061 bufferlist inodebl;
9062 mdcache->migrator->encode_export_inode(srcdnl->get_inode(), inodebl,
9063 exported_client_map,
9064 exported_client_metadata_map);
9065
9066 for (const auto& bound : bounds) {
9067 bound->state_clear(CDir::STATE_EXPORTBOUND);
9068 }
9069
9070 encode(exported_client_map, reply->inode_export, mds->mdsmap->get_up_features());
9071 encode(exported_client_metadata_map, reply->inode_export);
9072 reply->inode_export.claim_append(inodebl);
9073 reply->inode_export_v = srcdnl->get_inode()->inode.version;
9074 }
9075
9076 // remove mdr auth pin
9077 mdr->auth_unpin(srcdnl->get_inode());
9078 mdr->more()->is_inode_exporter = true;
9079
9080 if (srcdnl->get_inode()->is_dirty())
9081 srcdnl->get_inode()->mark_clean();
9082
9083 dout(10) << " exported srci " << *srcdnl->get_inode() << dendl;
9084 }
9085
9086 // apply
9087 _rename_apply(mdr, srcdn, destdn, straydn);
9088
9089 CDentry::linkage_t *destdnl = destdn->get_linkage();
9090
9091 // bump popularity
9092 mds->balancer->hit_dir(srcdn->get_dir(), META_POP_IWR);
9093 if (destdnl->get_inode() && destdnl->get_inode()->is_auth())
9094 mds->balancer->hit_inode(destdnl->get_inode(), META_POP_IWR);
9095
9096 // done.
9097 mdr->reset_slave_request();
9098 mdr->straydn = 0;
9099
9100 if (reply) {
9101 mds->send_message_mds(reply, mdr->slave_to_mds);
9102 } else {
9103 ceph_assert(mdr->aborted);
9104 dout(10) << " abort flag set, finishing" << dendl;
9105 mdcache->request_finish(mdr);
9106 }
9107 }
9108
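// called once the master's outcome is known. on success (r == 0) finish
// the inode export: hand exported xlocks and caps to the new auth,
// unfreeze, and journal OP_COMMIT if a prepare was journaled. on failure,
// abort the export if we had started one and replay the rollback blob via
// do_rename_rollback().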
9109 void Server::_commit_slave_rename(MDRequestRef& mdr, int r,
9110 CDentry *srcdn, CDentry *destdn, CDentry *straydn)
9111 {
9112 dout(10) << "_commit_slave_rename " << *mdr << " r=" << r << dendl;
9113
9114 CInode *in = destdn->get_linkage()->get_inode();
9115
9116 inodeno_t migrated_stray;
9117 if (srcdn->is_auth() && srcdn->get_dir()->inode->is_stray())
9118 migrated_stray = in->ino();
9119
9120 MDSContext::vec finished;
9121 if (r == 0) {
9122 // unfreeze+singleauth inode
9123 // hmm, do i really need to delay this?
9124 if (mdr->more()->is_inode_exporter) {
9125 // drop our pins
9126 // we exported, clear out any xlocks that we moved to another MDS
9127
9128 for (auto i = mdr->locks.lower_bound(&in->versionlock);
9129 i != mdr->locks.end(); ) {
9130 SimpleLock *lock = i->lock;
9131 if (lock->get_parent() != in)
9132 break;
9133 // we only care about xlocks on the exported inode
9134 if (i->is_xlock() && !lock->is_locallock())
9135 mds->locker->xlock_export(i++, mdr.get());
9136 else
9137 ++i;
9138 }
9139
9140 map<client_t,Capability::Import> peer_imported;
9141 auto bp = mdr->more()->inode_import.cbegin();
9142 decode(peer_imported, bp);
9143
9144 dout(10) << " finishing inode export on " << *in << dendl;
9145 mdcache->migrator->finish_export_inode(in, mdr->slave_to_mds, peer_imported, finished);
9146 mds->queue_waiters(finished); // this includes SINGLEAUTH waiters.
9147
9148 // unfreeze
9149 ceph_assert(in->is_frozen_inode());
9150 in->unfreeze_inode(finished);
9151 }
9152
9153 // singleauth
9154 if (mdr->more()->is_ambiguous_auth) {
9155 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9156 mdr->more()->is_ambiguous_auth = false;
9157 }
9158
9159 if (straydn && mdr->more()->slave_update_journaled) {
9160 CInode *strayin = straydn->get_projected_linkage()->get_inode();
9161 if (strayin && !strayin->snaprealm)
9162 mdcache->clear_dirty_bits_for_stray(strayin);
9163 }
9164
9165 mds->queue_waiters(finished);
9166 mdr->cleanup();
9167
9168 if (mdr->more()->slave_update_journaled) {
9169 // write a commit to the journal
9170 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_commit", mdr->reqid,
9171 mdr->slave_to_mds, ESlaveUpdate::OP_COMMIT,
9172 ESlaveUpdate::RENAME);
9173 mdlog->start_entry(le);
9174 submit_mdlog_entry(le, new C_MDS_CommittedSlave(this, mdr), mdr, __func__);
9175 mdlog->flush();
9176 } else {
9177 _committed_slave(mdr);
9178 }
9179 } else {
9180
9181 // abort
9182 // rollback_bl may be empty if we froze the inode but had to provide an expanded
9183 // witness list from the master, and they failed before we tried prep again.
9184 if (mdr->more()->rollback_bl.length()) {
9185 if (mdr->more()->is_inode_exporter) {
9186 dout(10) << " reversing inode export of " << *in << dendl;
9187 in->abort_export();
9188 }
9189 if (mdcache->is_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds)) {
9190 mdcache->remove_ambiguous_slave_update(mdr->reqid, mdr->slave_to_mds);
9191 // rollback but preserve the slave request
9192 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, false);
9193 mdr->more()->rollback_bl.clear();
9194 } else
9195 do_rename_rollback(mdr->more()->rollback_bl, mdr->slave_to_mds, mdr, true);
9196 } else {
9197 dout(10) << " rollback_bl empty, not rollback back rename (master failed after getting extra witnesses?)" << dendl;
9198 // singleauth
9199 if (mdr->more()->is_ambiguous_auth) {
9200 if (srcdn->is_auth())
9201 mdr->more()->rename_inode->unfreeze_inode(finished);
9202
9203 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9204 mdr->more()->is_ambiguous_auth = false;
9205 }
9206 mds->queue_waiters(finished);
9207 mdcache->request_finish(mdr);
9208 }
9209 }
9210
9211 if (migrated_stray && mds->is_stopping())
9212 mdcache->shutdown_export_stray_finish(migrated_stray);
9213 }
9214
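// rollback helper: reverse the fragstat/rstat deltas a rename applied to
// one dirfrag (linkunlink is +1 when re-adding a name, -1 when removing
// one) and restore the old mtime/rctime if they still carry the rename's
// ctime.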
9215 void _rollback_repair_dir(MutationRef& mut, CDir *dir, rename_rollback::drec &r, utime_t ctime,
9216 bool isdir, int linkunlink, nest_info_t &rstat)
9217 {
9218 fnode_t *pf;
9219 pf = dir->project_fnode();
9220 mut->add_projected_fnode(dir);
9221 pf->version = dir->pre_dirty();
9222
9223 if (isdir) {
9224 pf->fragstat.nsubdirs += linkunlink;
9225 } else {
9226 pf->fragstat.nfiles += linkunlink;
9227 }
9228 if (r.ino) {
9229 pf->rstat.rbytes += linkunlink * rstat.rbytes;
9230 pf->rstat.rfiles += linkunlink * rstat.rfiles;
9231 pf->rstat.rsubdirs += linkunlink * rstat.rsubdirs;
9232 pf->rstat.rsnaps += linkunlink * rstat.rsnaps;
9233 }
9234 if (pf->fragstat.mtime == ctime) {
9235 pf->fragstat.mtime = r.dirfrag_old_mtime;
9236 if (pf->rstat.rctime == ctime)
9237 pf->rstat.rctime = r.dirfrag_old_rctime;
9238 }
9239 mut->add_updated_lock(&dir->get_inode()->filelock);
9240 mut->add_updated_lock(&dir->get_inode()->nestlock);
9241 }
9242
9243 struct C_MDS_LoggedRenameRollback : public ServerLogContext {
9244 MutationRef mut;
9245 CDentry *srcdn;
9246 version_t srcdnpv;
9247 CDentry *destdn;
9248 CDentry *straydn;
9249 map<client_t,ref_t<MClientSnap>> splits[2];
9250 bool finish_mdr;
9251 C_MDS_LoggedRenameRollback(Server *s, MutationRef& m, MDRequestRef& r,
9252 CDentry *sd, version_t pv, CDentry *dd, CDentry *st,
9253 map<client_t,ref_t<MClientSnap>> _splits[2], bool f) :
9254 ServerLogContext(s, r), mut(m), srcdn(sd), srcdnpv(pv), destdn(dd),
9255 straydn(st), finish_mdr(f) {
9256 splits[0].swap(_splits[0]);
9257 splits[1].swap(_splits[1]);
9258 }
9259 void finish(int r) override {
9260 server->_rename_rollback_finish(mut, mdr, srcdn, srcdnpv,
9261 destdn, straydn, splits, finish_mdr);
9262 }
9263 };
9264
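// decode the rollback blob and restore the original linkages, dir stats
// and snaprealms for whatever is still cached. any of srcdn, destdn,
// straydn, in or target may be gone already (e.g. trimmed); each is
// repaired independently. the rollback is journaled as an
// ESlaveUpdate::OP_ROLLBACK unless nothing was journaled for the prepare.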
9265 void Server::do_rename_rollback(bufferlist &rbl, mds_rank_t master, MDRequestRef& mdr,
9266 bool finish_mdr)
9267 {
9268 rename_rollback rollback;
9269 auto p = rbl.cbegin();
9270 decode(rollback, p);
9271
9272 dout(10) << "do_rename_rollback on " << rollback.reqid << dendl;
9273 // need to finish this update before sending resolve to claim the subtree
9274 mdcache->add_rollback(rollback.reqid, master);
9275
9276 MutationRef mut(new MutationImpl(nullptr, utime_t(), rollback.reqid));
9277 mut->ls = mds->mdlog->get_current_segment();
9278
9279 CDentry *srcdn = NULL;
9280 CDir *srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag);
9281 if (!srcdir)
9282 srcdir = mdcache->get_dirfrag(rollback.orig_src.dirfrag.ino, rollback.orig_src.dname);
9283 if (srcdir) {
9284 dout(10) << " srcdir " << *srcdir << dendl;
9285 srcdn = srcdir->lookup(rollback.orig_src.dname);
9286 if (srcdn) {
9287 dout(10) << " srcdn " << *srcdn << dendl;
9288 ceph_assert(srcdn->get_linkage()->is_null());
9289 } else
9290 dout(10) << " srcdn not found" << dendl;
9291 } else
9292 dout(10) << " srcdir not found" << dendl;
9293
9294 CDentry *destdn = NULL;
9295 CDir *destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag);
9296 if (!destdir)
9297 destdir = mdcache->get_dirfrag(rollback.orig_dest.dirfrag.ino, rollback.orig_dest.dname);
9298 if (destdir) {
9299 dout(10) << " destdir " << *destdir << dendl;
9300 destdn = destdir->lookup(rollback.orig_dest.dname);
9301 if (destdn)
9302 dout(10) << " destdn " << *destdn << dendl;
9303 else
9304 dout(10) << " destdn not found" << dendl;
9305 } else
9306 dout(10) << " destdir not found" << dendl;
9307
9308 CInode *in = NULL;
9309 if (rollback.orig_src.ino) {
9310 in = mdcache->get_inode(rollback.orig_src.ino);
9311 if (in && in->is_dir())
9312 ceph_assert(srcdn && destdn);
9313 } else
9314 in = mdcache->get_inode(rollback.orig_src.remote_ino);
9315
9316 CDir *straydir = NULL;
9317 CDentry *straydn = NULL;
9318 if (rollback.stray.dirfrag.ino) {
9319 straydir = mdcache->get_dirfrag(rollback.stray.dirfrag);
9320 if (straydir) {
9321 dout(10) << "straydir " << *straydir << dendl;
9322 straydn = straydir->lookup(rollback.stray.dname);
9323 if (straydn) {
9324 dout(10) << " straydn " << *straydn << dendl;
9325 ceph_assert(straydn->get_linkage()->is_primary());
9326 } else
9327 dout(10) << " straydn not found" << dendl;
9328 } else
9329 dout(10) << "straydir not found" << dendl;
9330 }
9331
9332 CInode *target = NULL;
9333 if (rollback.orig_dest.ino) {
9334 target = mdcache->get_inode(rollback.orig_dest.ino);
9335 if (target)
9336 ceph_assert(destdn && straydn);
9337 } else if (rollback.orig_dest.remote_ino)
9338 target = mdcache->get_inode(rollback.orig_dest.remote_ino);
9339
9340 // can't use is_auth() in the resolve stage
9341 mds_rank_t whoami = mds->get_nodeid();
9342 // slave
9343 ceph_assert(!destdn || destdn->authority().first != whoami);
9344 ceph_assert(!straydn || straydn->authority().first != whoami);
9345
9346 bool force_journal_src = false;
9347 bool force_journal_dest = false;
9348 if (in && in->is_dir() && srcdn->authority().first != whoami)
9349 force_journal_src = _need_force_journal(in, false);
9350 if (in && target && target->is_dir())
9351 force_journal_dest = _need_force_journal(in, true);
9352
9353 version_t srcdnpv = 0;
9354 // repair src
9355 if (srcdn) {
9356 if (srcdn->authority().first == whoami)
9357 srcdnpv = srcdn->pre_dirty();
9358 if (rollback.orig_src.ino) {
9359 ceph_assert(in);
9360 srcdn->push_projected_linkage(in);
9361 } else
9362 srcdn->push_projected_linkage(rollback.orig_src.remote_ino,
9363 rollback.orig_src.remote_d_type);
9364 }
9365
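// snap notifications for realm merges undone below: splits[0] covers the
// renamed inode, splits[1] the overwritten target. both are sent from
// _rename_rollback_finish() unless we are still in the resolve stage.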
9366 map<client_t,ref_t<MClientSnap>> splits[2];
9367
9368 CInode::mempool_inode *pip = nullptr;
9369 if (in) {
9370 bool projected;
9371 if (in->get_projected_parent_dn()->authority().first == whoami) {
9372 auto &pi = in->project_inode();
9373 pip = &pi.inode;
9374 mut->add_projected_inode(in);
9375 pip->version = in->pre_dirty();
9376 projected = true;
9377 } else {
9378 pip = in->get_projected_inode();
9379 projected = false;
9380 }
9381 if (pip->ctime == rollback.ctime)
9382 pip->ctime = rollback.orig_src.old_ctime;
9383
9384 if (rollback.srci_snapbl.length() && in->snaprealm) {
9385 bool hadrealm;
9386 auto p = rollback.srci_snapbl.cbegin();
9387 decode(hadrealm, p);
9388 if (hadrealm) {
9389 if (projected && !mds->is_resolve()) {
9390 sr_t *new_srnode = new sr_t();
9391 decode(*new_srnode, p);
9392 in->project_snaprealm(new_srnode);
9393 } else
9394 decode(in->snaprealm->srnode, p);
9395 } else {
9396 SnapRealm *realm;
9397 if (rollback.orig_src.ino) {
9398 ceph_assert(srcdir);
9399 realm = srcdir->get_inode()->find_snaprealm();
9400 } else {
9401 realm = in->snaprealm->parent;
9402 }
9403 if (!mds->is_resolve())
9404 mdcache->prepare_realm_merge(in->snaprealm, realm, splits[0]);
9405 if (projected)
9406 in->project_snaprealm(NULL);
9407 else
9408 in->snaprealm->merge_to(realm);
9409 }
9410 }
9411 }
9412
9413 if (srcdn && srcdn->authority().first == whoami) {
9414 nest_info_t blah;
9415 _rollback_repair_dir(mut, srcdir, rollback.orig_src, rollback.ctime,
9416 in ? in->is_dir() : false, 1, pip ? pip->accounted_rstat : blah);
9417 }
9418
9419 // repair dest
9420 if (destdn) {
9421 if (rollback.orig_dest.ino && target) {
9422 destdn->push_projected_linkage(target);
9423 } else if (rollback.orig_dest.remote_ino) {
9424 destdn->push_projected_linkage(rollback.orig_dest.remote_ino,
9425 rollback.orig_dest.remote_d_type);
9426 } else {
9427 // the dentry will be trimmed soon, it's ok to have wrong linkage
9428 if (rollback.orig_dest.ino)
9429 ceph_assert(mds->is_resolve());
9430 destdn->push_projected_linkage();
9431 }
9432 }
9433
9434 if (straydn)
9435 straydn->push_projected_linkage();
9436
9437 if (target) {
9438 bool projected;
9439 CInode::mempool_inode *ti = nullptr;
9440 if (target->get_projected_parent_dn()->authority().first == whoami) {
9441 auto &pi = target->project_inode();
9442 ti = &pi.inode;
9443 mut->add_projected_inode(target);
9444 ti->version = target->pre_dirty();
9445 projected = true;
9446 } else {
9447 ti = target->get_projected_inode();
9448 projected = false;
9449 }
9450 if (ti->ctime == rollback.ctime)
9451 ti->ctime = rollback.orig_dest.old_ctime;
9452 if (MDS_INO_IS_STRAY(rollback.orig_src.dirfrag.ino)) {
9453 if (MDS_INO_IS_STRAY(rollback.orig_dest.dirfrag.ino))
9454 ceph_assert(!rollback.orig_dest.ino && !rollback.orig_dest.remote_ino);
9455 else
9456 ceph_assert(rollback.orig_dest.remote_ino &&
9457 rollback.orig_dest.remote_ino == rollback.orig_src.ino);
9458 } else
9459 ti->nlink++;
9460
9461 if (rollback.desti_snapbl.length() && target->snaprealm) {
9462 bool hadrealm;
9463 auto p = rollback.desti_snapbl.cbegin();
9464 decode(hadrealm, p);
9465 if (hadrealm) {
9466 if (projected && !mds->is_resolve()) {
9467 sr_t *new_srnode = new sr_t();
9468 decode(*new_srnode, p);
9469 target->project_snaprealm(new_srnode);
9470 } else
9471 decode(target->snaprealm->srnode, p);
9472 } else {
9473 SnapRealm *realm;
9474 if (rollback.orig_dest.ino) {
9475 ceph_assert(destdir);
9476 realm = destdir->get_inode()->find_snaprealm();
9477 } else {
9478 realm = target->snaprealm->parent;
9479 }
9480 if (!mds->is_resolve())
9481 mdcache->prepare_realm_merge(target->snaprealm, realm, splits[1]);
9482 if (projected)
9483 target->project_snaprealm(NULL);
9484 else
9485 target->snaprealm->merge_to(realm);
9486 }
9487 }
9488 }
9489
9490 if (srcdn)
9491 dout(0) << " srcdn back to " << *srcdn << dendl;
9492 if (in)
9493 dout(0) << " srci back to " << *in << dendl;
9494 if (destdn)
9495 dout(0) << " destdn back to " << *destdn << dendl;
9496 if (target)
9497 dout(0) << " desti back to " << *target << dendl;
9498
9499 // journal it
9500 ESlaveUpdate *le = new ESlaveUpdate(mdlog, "slave_rename_rollback", rollback.reqid, master,
9501 ESlaveUpdate::OP_ROLLBACK, ESlaveUpdate::RENAME);
9502 mdlog->start_entry(le);
9503
9504 if (srcdn && (srcdn->authority().first == whoami || force_journal_src)) {
9505 le->commit.add_dir_context(srcdir);
9506 if (rollback.orig_src.ino)
9507 le->commit.add_primary_dentry(srcdn, 0, true);
9508 else
9509 le->commit.add_remote_dentry(srcdn, true);
9510 }
9511
9512 if (!rollback.orig_src.ino && // remote linkage
9513 in && in->authority().first == whoami) {
9514 le->commit.add_dir_context(in->get_projected_parent_dir());
9515 le->commit.add_primary_dentry(in->get_projected_parent_dn(), in, true);
9516 }
9517
9518 if (force_journal_dest) {
9519 ceph_assert(rollback.orig_dest.ino);
9520 le->commit.add_dir_context(destdir);
9521 le->commit.add_primary_dentry(destdn, 0, true);
9522 }
9523
9524 // slave: no need to journal straydn
9525
9526 if (target && target != in && target->authority().first == whoami) {
9527 ceph_assert(rollback.orig_dest.remote_ino);
9528 le->commit.add_dir_context(target->get_projected_parent_dir());
9529 le->commit.add_primary_dentry(target->get_projected_parent_dn(), target, true);
9530 }
9531
9532 if (in && in->is_dir() && (srcdn->authority().first == whoami || force_journal_src)) {
9533 dout(10) << " noting renamed dir ino " << in->ino() << " in metablob" << dendl;
9534 le->commit.renamed_dirino = in->ino();
9535 if (srcdn->authority().first == whoami) {
9536 auto&& ls = in->get_dirfrags();
9537 for (const auto& dir : ls) {
9538 if (!dir->is_auth())
9539 le->commit.renamed_dir_frags.push_back(dir->get_frag());
9540 }
9541 dout(10) << " noting renamed dir open frags " << le->commit.renamed_dir_frags << dendl;
9542 }
9543 } else if (force_journal_dest) {
9544 dout(10) << " noting rename target ino " << target->ino() << " in metablob" << dendl;
9545 le->commit.renamed_dirino = target->ino();
9546 }
9547
9548 if (target && target->is_dir()) {
9549 ceph_assert(destdn);
9550 mdcache->project_subtree_rename(target, straydir, destdir);
9551 }
9552
9553 if (in && in->is_dir()) {
9554 ceph_assert(srcdn);
9555 mdcache->project_subtree_rename(in, destdir, srcdir);
9556 }
9557
9558 if (mdr && !mdr->more()->slave_update_journaled) {
9559 ceph_assert(le->commit.empty());
9560 mdlog->cancel_entry(le);
9561 mut->ls = NULL;
9562 _rename_rollback_finish(mut, mdr, srcdn, srcdnpv, destdn, straydn, splits, finish_mdr);
9563 } else {
9564 ceph_assert(!le->commit.empty());
9565 if (mdr)
9566 mdr->more()->slave_update_journaled = false;
9567 MDSLogContextBase *fin = new C_MDS_LoggedRenameRollback(this, mut, mdr,
9568 srcdn, srcdnpv, destdn, straydn,
9569 splits, finish_mdr);
9570 submit_mdlog_entry(le, fin, mdr, __func__);
9571 mdlog->flush();
9572 }
9573 }
9574
9575 void Server::_rename_rollback_finish(MutationRef& mut, MDRequestRef& mdr, CDentry *srcdn,
9576 version_t srcdnpv, CDentry *destdn, CDentry *straydn,
9577 map<client_t,ref_t<MClientSnap>> splits[2], bool finish_mdr)
9578 {
9579 dout(10) << "_rename_rollback_finish " << mut->reqid << dendl;
9580
9581 if (straydn) {
9582 straydn->get_dir()->unlink_inode(straydn);
9583 straydn->pop_projected_linkage();
9584 }
9585 if (destdn) {
9586 destdn->get_dir()->unlink_inode(destdn);
9587 destdn->pop_projected_linkage();
9588 }
9589 if (srcdn) {
9590 srcdn->pop_projected_linkage();
9591 if (srcdn->authority().first == mds->get_nodeid()) {
9592 srcdn->mark_dirty(srcdnpv, mut->ls);
9593 if (srcdn->get_linkage()->is_primary())
9594 srcdn->get_linkage()->get_inode()->state_set(CInode::STATE_AUTH);
9595 }
9596 }
9597
9598 mut->apply();
9599
9600 if (srcdn && srcdn->get_linkage()->is_primary()) {
9601 CInode *in = srcdn->get_linkage()->get_inode();
9602 if (in && in->is_dir()) {
9603 ceph_assert(destdn);
9604 mdcache->adjust_subtree_after_rename(in, destdn->get_dir(), true);
9605 }
9606 }
9607
9608 if (destdn) {
9609 CInode *oldin = destdn->get_linkage()->get_inode();
9610 // update subtree map?
9611 if (oldin && oldin->is_dir()) {
9612 ceph_assert(straydn);
9613 mdcache->adjust_subtree_after_rename(oldin, straydn->get_dir(), true);
9614 }
9615 }
9616
9617 if (mds->is_resolve()) {
9618 CDir *root = NULL;
9619 if (straydn)
9620 root = mdcache->get_subtree_root(straydn->get_dir());
9621 else if (destdn)
9622 root = mdcache->get_subtree_root(destdn->get_dir());
9623 if (root)
9624 mdcache->try_trim_non_auth_subtree(root);
9625 } else {
9626 mdcache->send_snaps(splits[1]);
9627 mdcache->send_snaps(splits[0]);
9628 }
9629
9630 if (mdr) {
9631 MDSContext::vec finished;
9632 if (mdr->more()->is_ambiguous_auth) {
9633 if (srcdn->is_auth())
9634 mdr->more()->rename_inode->unfreeze_inode(finished);
9635
9636 mdr->more()->rename_inode->clear_ambiguous_auth(finished);
9637 mdr->more()->is_ambiguous_auth = false;
9638 }
9639 mds->queue_waiters(finished);
9640 if (finish_mdr || mdr->aborted)
9641 mdcache->request_finish(mdr);
9642 else
9643 mdr->more()->slave_rolling_back = false;
9644 }
9645
9646 mdcache->finish_rollback(mut->reqid, mdr);
9647
9648 mut->cleanup();
9649 }
9650
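// master-side handling of OP_RENAMEPREPACK. an empty witness list means
// the slave prepared (and possibly journaled) and is now a witness; a
// non-empty list means the slave is srcdn auth and wants the listed srcdn
// replicas as witnesses first, so the master restarts with an expanded
// witness set. interrupted acks are noops.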
9651 void Server::handle_slave_rename_prep_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
9652 {
9653 dout(10) << "handle_slave_rename_prep_ack " << *mdr
9654 << " witnessed by " << ack->get_source()
9655 << " " << *ack << dendl;
9656 mds_rank_t from = mds_rank_t(ack->get_source().num());
9657
9658 // note slave
9659 mdr->more()->slaves.insert(from);
9660 if (mdr->more()->srcdn_auth_mds == from &&
9661 mdr->more()->is_remote_frozen_authpin &&
9662 !mdr->more()->is_ambiguous_auth) {
9663 mdr->set_ambiguous_auth(mdr->more()->rename_inode);
9664 }
9665
9666 // witnessed? or add extra witnesses?
9667 ceph_assert(mdr->more()->witnessed.count(from) == 0);
9668 if (ack->is_interrupted()) {
9669 dout(10) << " slave request interrupted, noop" << dendl;
9670 } else if (ack->witnesses.empty()) {
9671 mdr->more()->witnessed.insert(from);
9672 if (!ack->is_not_journaled())
9673 mdr->more()->has_journaled_slaves = true;
9674 } else {
9675 dout(10) << " extra witnesses (srcdn replicas) are " << ack->witnesses << dendl;
9676 mdr->more()->extra_witnesses = ack->witnesses;
9677 mdr->more()->extra_witnesses.erase(mds->get_nodeid()); // not me!
9678 }
9679
9680 // srci import?
9681 if (ack->inode_export.length()) {
9682 dout(10) << " got srci import" << dendl;
9683 mdr->more()->inode_import.share(ack->inode_export);
9684 mdr->more()->inode_import_v = ack->inode_export_v;
9685 }
9686
9687 // remove from waiting list
9688 ceph_assert(mdr->more()->waiting_on_slave.count(from));
9689 mdr->more()->waiting_on_slave.erase(from);
9690
9691 if (mdr->more()->waiting_on_slave.empty())
9692 dispatch_client_request(mdr); // go again!
9693 else
9694 dout(10) << "still waiting on slaves " << mdr->more()->waiting_on_slave << dendl;
9695 }
9696
9697 void Server::handle_slave_rename_notify_ack(MDRequestRef& mdr, const cref_t<MMDSSlaveRequest> &ack)
9698 {
9699 dout(10) << "handle_slave_rename_notify_ack " << *mdr << " from mds."
9700 << ack->get_source() << dendl;
9701 ceph_assert(mdr->is_slave());
9702 mds_rank_t from = mds_rank_t(ack->get_source().num());
9703
9704 if (mdr->more()->waiting_on_slave.count(from)) {
9705 mdr->more()->waiting_on_slave.erase(from);
9706
9707 if (mdr->more()->waiting_on_slave.empty()) {
9708 if (mdr->slave_request)
9709 dispatch_slave_request(mdr);
9710 } else
9711 dout(10) << " still waiting for rename notify acks from "
9712 << mdr->more()->waiting_on_slave << dendl;
9713 }
9714 }
9715
9716 void Server::_slave_rename_sessions_flushed(MDRequestRef& mdr)
9717 {
9718 dout(10) << "_slave_rename_sessions_flushed " << *mdr << dendl;
9719
9720 if (mdr->more()->waiting_on_slave.count(MDS_RANK_NONE)) {
9721 mdr->more()->waiting_on_slave.erase(MDS_RANK_NONE);
9722
9723 if (mdr->more()->waiting_on_slave.empty()) {
9724 if (mdr->slave_request)
9725 dispatch_slave_request(mdr);
9726 } else
9727 dout(10) << " still waiting for rename notify acks from "
9728 << mdr->more()->waiting_on_slave << dendl;
9729 }
9730 }
9731
9732 // snaps
9733 /* This function takes responsibility for the passed mdr*/
9734 void Server::handle_client_lssnap(MDRequestRef& mdr)
9735 {
9736 const cref_t<MClientRequest> &req = mdr->client_request;
9737
9738 // traverse to path
9739 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
9740 if (!diri)
9741 return;
9742
9743 if (!diri->is_dir()) {
9744 respond_to_request(mdr, -ENOTDIR);
9745 return;
9746 }
9747 dout(10) << "lssnap on " << *diri << dendl;
9748
9749 // lock snap
9750 if (!mds->locker->try_rdlock_snap_layout(diri, mdr))
9751 return;
9752
9753 if (!check_access(mdr, diri, MAY_READ))
9754 return;
9755
9756 SnapRealm *realm = diri->find_snaprealm();
9757 map<snapid_t,const SnapInfo*> infomap;
9758 realm->get_snap_info(infomap, diri->get_oldest_snap());
9759
9760 unsigned max_entries = req->head.args.readdir.max_entries;
9761 if (!max_entries)
9762 max_entries = infomap.size();
9763 int max_bytes = req->head.args.readdir.max_bytes;
9764 if (!max_bytes)
9765 // make sure at least one item can be encoded
9766 max_bytes = (512 << 10) + g_conf()->mds_max_xattr_pairs_size;
9767
9768 __u64 last_snapid = 0;
9769 string offset_str = req->get_path2();
9770 if (!offset_str.empty())
9771 last_snapid = realm->resolve_snapname(offset_str, diri->ino());
9772
9773 //Empty DirStat
9774 bufferlist dirbl;
9775 static DirStat empty;
9776 CDir::encode_dirstat(dirbl, mdr->session->info, empty);
9777
9778 max_bytes -= dirbl.length() - sizeof(__u32) + sizeof(__u8) * 2;
9779
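// reply framing mirrors readdir (a sketch, not authoritative):
//   [DirStat][num][flags][(snap_name, LeaseStat, InodeStat) x num]
// dirbl already holds the (empty) DirStat; entries accumulate in dnbl
// below and are appended after num and flags.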
9780 __u32 num = 0;
9781 bufferlist dnbl;
9782 auto p = infomap.upper_bound(last_snapid);
9783 for (; p != infomap.end() && num < max_entries; ++p) {
9784 dout(10) << p->first << " -> " << *p->second << dendl;
9785
9786 // actual
9787 string snap_name;
9788 if (p->second->ino == diri->ino())
9789 snap_name = p->second->name;
9790 else
9791 snap_name = p->second->get_long_name();
9792
9793 unsigned start_len = dnbl.length();
9794 if (int(start_len + snap_name.length() + sizeof(__u32) + sizeof(LeaseStat)) > max_bytes)
9795 break;
9796
9797 encode(snap_name, dnbl);
9798 //infinite lease
9799 LeaseStat e(CEPH_LEASE_VALID, -1, 0);
9800 mds->locker->encode_lease(dnbl, mdr->session->info, e);
9801 dout(20) << "encode_infinite_lease" << dendl;
9802
9803 int r = diri->encode_inodestat(dnbl, mdr->session, realm, p->first, max_bytes - (int)dnbl.length());
9804 if (r < 0) {
9805 bufferlist keep;
9806 keep.substr_of(dnbl, 0, start_len);
9807 dnbl.swap(keep);
9808 break;
9809 }
9810 ++num;
9811 }
9812
9813 encode(num, dirbl);
9814 __u16 flags = 0;
9815 if (p == infomap.end()) {
9816 flags = CEPH_READDIR_FRAG_END;
9817 if (last_snapid == 0)
9818 flags |= CEPH_READDIR_FRAG_COMPLETE;
9819 }
9820 encode(flags, dirbl);
9821 dirbl.claim_append(dnbl);
9822
9823 mdr->reply_extra_bl = dirbl;
9824 mdr->tracei = diri;
9825 respond_to_request(mdr, 0);
9826 }
9827
9828
9829 // MKSNAP
9830
9831 struct C_MDS_mksnap_finish : public ServerLogContext {
9832 CInode *diri;
9833 SnapInfo info;
9834 C_MDS_mksnap_finish(Server *s, MDRequestRef& r, CInode *di, SnapInfo &i) :
9835 ServerLogContext(s, r), diri(di), info(i) {}
9836 void finish(int r) override {
9837 server->_mksnap_finish(mdr, diri, info);
9838 }
9839 };
9840
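// mksnap is a two-phase exchange with the snaptable: prepare_create()
// reserves a snapid and a table transaction id (stid); once those are
// cached locally we project the inode and its snaprealm, journal an
// EUpdate carrying the TABLE_SNAP transaction, and let _mksnap_finish()
// apply the projection and (per the snapclient protocol) commit the
// reserved transaction.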
9841 /* This function takes responsibility for the passed mdr*/
9842 void Server::handle_client_mksnap(MDRequestRef& mdr)
9843 {
9844 const cref_t<MClientRequest> &req = mdr->client_request;
9845 // make sure we have as new a map as the client
9846 if (req->get_mdsmap_epoch() > mds->mdsmap->get_epoch()) {
9847 mds->wait_for_mdsmap(req->get_mdsmap_epoch(), new C_MDS_RetryRequest(mdcache, mdr));
9848 return;
9849 }
9850 if (!mds->mdsmap->allows_snaps()) {
9851 // you can't make snapshots until you set an option right now
9852 respond_to_request(mdr, -EPERM);
9853 return;
9854 }
9855
9856 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
9857 if (!diri)
9858 return;
9859
9860 // dir only
9861 if (!diri->is_dir()) {
9862 respond_to_request(mdr, -ENOTDIR);
9863 return;
9864 }
9865 if (diri->is_system() && !diri->is_root()) {
9866 // no snaps in system dirs (root is ok)
9867 respond_to_request(mdr, -EPERM);
9868 return;
9869 }
9870
9871 std::string_view snapname = req->get_filepath().last_dentry();
9872
9873 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
9874 dout(20) << "mksnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
9875 respond_to_request(mdr, -EPERM);
9876 return;
9877 }
9878
9879 dout(10) << "mksnap " << snapname << " on " << *diri << dendl;
9880
9881 // lock snap
9882 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
9883 MutationImpl::LockOpVec lov;
9884 lov.add_xlock(&diri->snaplock);
9885 if (!mds->locker->acquire_locks(mdr, lov))
9886 return;
9887
9888 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
9889 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
9890 return;
9891 }
9892 mdr->locking_state |= MutationImpl::ALL_LOCKED;
9893 }
9894
9895 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
9896 return;
9897
9898 // check if we can create any more snapshots
9899 // refuse once the directory is already at or beyond the limit (mds_max_snaps_per_dir)
9900 if (diri->snaprealm &&
9901 diri->snaprealm->get_snaps().size() >= max_snaps_per_dir) {
9902 respond_to_request(mdr, -EMLINK);
9903 return;
9904 }
9905
9906 // make sure name is unique
9907 if (diri->snaprealm &&
9908 diri->snaprealm->exists(snapname)) {
9909 respond_to_request(mdr, -EEXIST);
9910 return;
9911 }
9912 if (snapname.length() == 0 ||
9913 snapname[0] == '_') {
9914 respond_to_request(mdr, -EINVAL);
9915 return;
9916 }
9917
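// two-phase update: the first pass asks the snap table to prepare an
// (stid, snapid) pair and re-drives this request when it is ready;
// later passes fall through with stid already set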
9918 // allocate a snapid
9919 if (!mdr->more()->stid) {
9920 // prepare an stid
9921 mds->snapclient->prepare_create(diri->ino(), snapname,
9922 mdr->get_mds_stamp(),
9923 &mdr->more()->stid, &mdr->more()->snapidbl,
9924 new C_MDS_RetryRequest(mdcache, mdr));
9925 return;
9926 }
9927
9928 version_t stid = mdr->more()->stid;
9929 snapid_t snapid;
9930 auto p = mdr->more()->snapidbl.cbegin();
9931 decode(snapid, p);
9932 dout(10) << " stid " << stid << " snapid " << snapid << dendl;
9933
9934 ceph_assert(mds->snapclient->get_cached_version() >= stid);
9935
9936 // journal
9937 SnapInfo info;
9938 info.ino = diri->ino();
9939 info.snapid = snapid;
9940 info.name = snapname;
9941 info.stamp = mdr->get_op_stamp();
9942
9943 auto &pi = diri->project_inode(false, true);
9944 pi.inode.ctime = info.stamp;
9945 if (info.stamp > pi.inode.rstat.rctime)
9946 pi.inode.rstat.rctime = info.stamp;
9947 pi.inode.rstat.rsnaps++;
9948 pi.inode.version = diri->pre_dirty();
9949
9950 // project the snaprealm
9951 auto &newsnap = *pi.snapnode;
9952 newsnap.created = snapid;
9953 auto em = newsnap.snaps.emplace(std::piecewise_construct, std::forward_as_tuple(snapid), std::forward_as_tuple(info));
9954 if (!em.second)
9955 em.first->second = info;
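// bumping the realm seq is what prompts clients to refresh their snap
// contexts (see do_realm_invalidate_and_update_notify in the finish path)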
9956 newsnap.seq = snapid;
9957 newsnap.last_created = snapid;
9958
9959 // journal the inode changes
9960 mdr->ls = mdlog->get_current_segment();
9961 EUpdate *le = new EUpdate(mdlog, "mksnap");
9962 mdlog->start_entry(le);
9963
9964 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
9965 le->metablob.add_table_transaction(TABLE_SNAP, stid);
9966 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
9967 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
9968
9969 // journal the snaprealm changes
9970 submit_mdlog_entry(le, new C_MDS_mksnap_finish(this, mdr, diri, info),
9971 mdr, __func__);
9972 mdlog->flush();
9973 }
9974
9975 void Server::_mksnap_finish(MDRequestRef& mdr, CInode *diri, SnapInfo &info)
9976 {
9977 dout(10) << "_mksnap_finish " << *mdr << " " << info << dendl;
9978
9979 int op = (diri->snaprealm? CEPH_SNAP_OP_CREATE : CEPH_SNAP_OP_SPLIT);
9980
9981 diri->pop_and_dirty_projected_inode(mdr->ls);
9982 mdr->apply();
9983
9984 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
9985
9986 // the new snap is now visible in the realm
9987 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
9988
9989 // notify other mds
9990 mdcache->send_snap_update(diri, mdr->more()->stid, op);
9991
9992 mdcache->do_realm_invalidate_and_update_notify(diri, op);
9993
9994 // yay
9995 mdr->in[0] = diri;
9996 mdr->snapid = info.snapid;
9997 mdr->tracei = diri;
9998 respond_to_request(mdr, 0);
9999 }
10000
10001
10002 // RMSNAP
10003
10004 struct C_MDS_rmsnap_finish : public ServerLogContext {
10005 CInode *diri;
10006 snapid_t snapid;
10007 C_MDS_rmsnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10008 ServerLogContext(s, r), diri(di), snapid(sn) {}
10009 void finish(int r) override {
10010 server->_rmsnap_finish(mdr, diri, snapid);
10011 }
10012 };
10013
10014 /* This function takes responsibility for the passed mdr. */
10015 void Server::handle_client_rmsnap(MDRequestRef& mdr)
10016 {
10017 const cref_t<MClientRequest> &req = mdr->client_request;
10018
10019 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10020 if (!diri)
10021 return;
10022
10023 if (!diri->is_dir()) {
10024 respond_to_request(mdr, -ENOTDIR);
10025 return;
10026 }
10027
10028 std::string_view snapname = req->get_filepath().last_dentry();
10029
10030 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid || mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10031 dout(20) << "rmsnap " << snapname << " on " << *diri << " denied to uid " << mdr->client_request->get_caller_uid() << dendl;
10032 respond_to_request(mdr, -EPERM);
10033 return;
10034 }
10035
10036 dout(10) << "rmsnap " << snapname << " on " << *diri << dendl;
10037
10038 // validate the name and make sure the snap exists
10039 if (snapname.length() == 0 || snapname[0] == '_') {
10040 respond_to_request(mdr, -EINVAL); // can't prune a parent snap, currently.
10041 return;
10042 }
10043 if (!diri->snaprealm || !diri->snaprealm->exists(snapname)) {
10044 respond_to_request(mdr, -ENOENT);
10045 return;
10046 }
10047 snapid_t snapid = diri->snaprealm->resolve_snapname(snapname, diri->ino());
10048 dout(10) << " snapname " << snapname << " is " << snapid << dendl;
10049
10050 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10051 MutationImpl::LockOpVec lov;
10052 lov.add_xlock(&diri->snaplock);
10053 if (!mds->locker->acquire_locks(mdr, lov))
10054 return;
10055 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10056 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10057 return;
10058 }
10059 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10060 }
10061
10062 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10063 return;
10064
10065 // prepare
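// same two-phase pattern as mksnap; for a destroy the prepared payload
// carries the new table seq rather than a snapid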
10066 if (!mdr->more()->stid) {
10067 mds->snapclient->prepare_destroy(diri->ino(), snapid,
10068 &mdr->more()->stid, &mdr->more()->snapidbl,
10069 new C_MDS_RetryRequest(mdcache, mdr));
10070 return;
10071 }
10072 version_t stid = mdr->more()->stid;
10073 auto p = mdr->more()->snapidbl.cbegin();
10074 snapid_t seq;
10075 decode(seq, p);
10076 dout(10) << " stid is " << stid << ", seq is " << seq << dendl;
10077
10078 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10079
10080 // journal
10081 auto &pi = diri->project_inode(false, true);
10082 pi.inode.version = diri->pre_dirty();
10083 pi.inode.ctime = mdr->get_op_stamp();
10084 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
10085 pi.inode.rstat.rctime = mdr->get_op_stamp();
10086 pi.inode.rstat.rsnaps--;
10087
10088 mdr->ls = mdlog->get_current_segment();
10089 EUpdate *le = new EUpdate(mdlog, "rmsnap");
10090 mdlog->start_entry(le);
10091
10092 // project the snaprealm
10093 auto &newnode = *pi.snapnode;
10094 newnode.snaps.erase(snapid);
10095 newnode.seq = seq;
10096 newnode.last_destroyed = seq;
10097
10098 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10099 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10100 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10101 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10102
10103 submit_mdlog_entry(le, new C_MDS_rmsnap_finish(this, mdr, diri, snapid),
10104 mdr, __func__);
10105 mdlog->flush();
10106 }
10107
10108 void Server::_rmsnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
10109 {
10110 dout(10) << "_rmsnap_finish " << *mdr << " " << snapid << dendl;
10111 version_t stid = mdr->more()->stid;
10112 auto p = mdr->more()->snapidbl.cbegin();
10113 snapid_t seq;
10114 decode(seq, p);
10115
10116 diri->pop_and_dirty_projected_inode(mdr->ls);
10117 mdr->apply();
10118
10119 mds->snapclient->commit(stid, mdr->ls);
10120
10121 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10122
10123 // notify other mds
10124 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_DESTROY);
10125
10126 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_DESTROY);
10127
10128 // yay
10129 mdr->in[0] = diri;
10130 respond_to_request(mdr, 0);
10131
10132 // purge snapshot data
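// (only when past parents are open, i.e. the realm's snap set is known
// to be complete, so stale data can be identified safely)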
10133 if (diri->snaprealm->have_past_parents_open())
10134 diri->purge_stale_snap_data(diri->snaprealm->get_snaps());
10135 }
10136
10137 struct C_MDS_renamesnap_finish : public ServerLogContext {
10138 CInode *diri;
10139 snapid_t snapid;
10140 C_MDS_renamesnap_finish(Server *s, MDRequestRef& r, CInode *di, snapid_t sn) :
10141 ServerLogContext(s, r), diri(di), snapid(sn) {}
10142 void finish(int r) override {
10143 server->_renamesnap_finish(mdr, diri, snapid);
10144 }
10145 };
10146
10147 /* This function takes responsibility for the passed mdr. */
10148 void Server::handle_client_renamesnap(MDRequestRef& mdr)
10149 {
10150 const cref_t<MClientRequest> &req = mdr->client_request;
10151 if (req->get_filepath().get_ino() != req->get_filepath2().get_ino()) {
10152 respond_to_request(mdr, -EINVAL);
10153 return;
10154 }
10155
10156 CInode *diri = try_get_auth_inode(mdr, req->get_filepath().get_ino());
10157 if (!diri)
10158 return;
10159
10160 if (!diri->is_dir()) { // dir only
10161 respond_to_request(mdr, -ENOTDIR);
10162 return;
10163 }
10164
10165 if (mdr->client_request->get_caller_uid() < g_conf()->mds_snap_min_uid ||
10166 mdr->client_request->get_caller_uid() > g_conf()->mds_snap_max_uid) {
10167 respond_to_request(mdr, -EPERM);
10168 return;
10169 }
10170
10171 std::string_view dstname = req->get_filepath().last_dentry();
10172 std::string_view srcname = req->get_filepath2().last_dentry();
10173 dout(10) << "renamesnap " << srcname << "->" << dstname << " on " << *diri << dendl;
10174
10175 if (srcname.length() == 0 || srcname[0] == '_') {
10176 respond_to_request(mdr, -EINVAL); // can't rename a parent snap.
10177 return;
10178 }
10179 if (!diri->snaprealm || !diri->snaprealm->exists(srcname)) {
10180 respond_to_request(mdr, -ENOENT);
10181 return;
10182 }
10183 if (dstname.length() == 0 || dstname[0] == '_') {
10184 respond_to_request(mdr, -EINVAL);
10185 return;
10186 }
10187 if (diri->snaprealm->exists(dstname)) {
10188 respond_to_request(mdr, -EEXIST);
10189 return;
10190 }
10191
10192 snapid_t snapid = diri->snaprealm->resolve_snapname(srcname, diri->ino());
10193 dout(10) << " snapname " << srcname << " is " << snapid << dendl;
10194
10195 // lock snap
10196 if (!(mdr->locking_state & MutationImpl::ALL_LOCKED)) {
10197 MutationImpl::LockOpVec lov;
10198 lov.add_xlock(&diri->snaplock);
10199 if (!mds->locker->acquire_locks(mdr, lov))
10200 return;
10201 if (CDentry *pdn = diri->get_projected_parent_dn(); pdn) {
10202 if (!mds->locker->try_rdlock_snap_layout(pdn->get_dir()->get_inode(), mdr))
10203 return;
10204 }
10205 mdr->locking_state |= MutationImpl::ALL_LOCKED;
10206 }
10207
10208 if (!check_access(mdr, diri, MAY_WRITE|MAY_SNAPSHOT))
10209 return;
10210
10211 // prepare
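// rename needs only a new table version: prepare_update returns an stid
// and no snapid payload, since the snapid itself is unchanged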
10212 if (!mdr->more()->stid) {
10213 mds->snapclient->prepare_update(diri->ino(), snapid, dstname, utime_t(),
10214 &mdr->more()->stid,
10215 new C_MDS_RetryRequest(mdcache, mdr));
10216 return;
10217 }
10218
10219 version_t stid = mdr->more()->stid;
10220 dout(10) << " stid is " << stid << dendl;
10221
10222 ceph_assert(mds->snapclient->get_cached_version() >= stid);
10223
10224 // journal
10225 auto &pi = diri->project_inode(false, true);
10226 pi.inode.ctime = mdr->get_op_stamp();
10227 if (mdr->get_op_stamp() > pi.inode.rstat.rctime)
10228 pi.inode.rstat.rctime = mdr->get_op_stamp();
10229 pi.inode.version = diri->pre_dirty();
10230
10231 // project the snaprealm
10232 auto &newsnap = *pi.snapnode;
10233 auto it = newsnap.snaps.find(snapid);
10234 ceph_assert(it != newsnap.snaps.end());
10235 it->second.name = dstname;
10236
10237 // journal the inode changes
10238 mdr->ls = mdlog->get_current_segment();
10239 EUpdate *le = new EUpdate(mdlog, "renamesnap");
10240 mdlog->start_entry(le);
10241
10242 le->metablob.add_client_req(req->get_reqid(), req->get_oldest_client_tid());
10243 le->metablob.add_table_transaction(TABLE_SNAP, stid);
10244 mdcache->predirty_journal_parents(mdr, &le->metablob, diri, 0, PREDIRTY_PRIMARY, false);
10245 mdcache->journal_dirty_inode(mdr.get(), &le->metablob, diri);
10246
10247 // journal the snaprealm changes
10248 submit_mdlog_entry(le, new C_MDS_renamesnap_finish(this, mdr, diri, snapid),
10249 mdr, __func__);
10250 mdlog->flush();
10251 }
10252
10253 void Server::_renamesnap_finish(MDRequestRef& mdr, CInode *diri, snapid_t snapid)
10254 {
10255 dout(10) << "_renamesnap_finish " << *mdr << " " << snapid << dendl;
10256
10257 diri->pop_and_dirty_projected_inode(mdr->ls);
10258 mdr->apply();
10259
10260 mds->snapclient->commit(mdr->more()->stid, mdr->ls);
10261
10262 dout(10) << "snaprealm now " << *diri->snaprealm << dendl;
10263
10264 // notify other mds
10265 mdcache->send_snap_update(diri, mdr->more()->stid, CEPH_SNAP_OP_UPDATE);
10266
10267 mdcache->do_realm_invalidate_and_update_notify(diri, CEPH_SNAP_OP_UPDATE);
10268
10269 // yay
10270 mdr->in[0] = diri;
10271 mdr->tracei = diri;
10272 mdr->snapid = snapid;
10273 respond_to_request(mdr, 0);
10274 }
10275
10276 /**
10277 * Return true if server is in state RECONNECT and this
10278 * client has not yet reconnected.
10279 */
10280 bool Server::waiting_for_reconnect(client_t c) const
10281 {
10282 return client_reconnect_gather.count(c) > 0;
10283 }
10284
10285 void Server::dump_reconnect_status(Formatter *f) const
10286 {
10287 f->open_object_section("reconnect_status");
10288 f->dump_stream("client_reconnect_gather") << client_reconnect_gather;
10289 f->close_section();
10290 }