]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MDSMonitor.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / mon / MDSMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <regex>
16 #include <sstream>
17 #include <boost/utility.hpp>
18
19 #include "MDSMonitor.h"
20 #include "FSCommands.h"
21 #include "Monitor.h"
22 #include "MonitorDBStore.h"
23 #include "OSDMonitor.h"
24
25 #include "common/strtol.h"
26 #include "common/perf_counters.h"
27 #include "common/config.h"
28 #include "common/cmdparse.h"
29 #include "messages/MMDSMap.h"
30 #include "messages/MFSMap.h"
31 #include "messages/MFSMapUser.h"
32 #include "messages/MMDSLoadTargets.h"
33 #include "messages/MMonCommand.h"
34 #include "messages/MGenericMessage.h"
35
36 #include "include/ceph_assert.h"
37 #include "include/str_list.h"
38 #include "include/stringify.h"
39 #include "mds/mdstypes.h"
40 #include "Session.h"
41
42 using namespace TOPNSPC::common;
43
44 using std::dec;
45 using std::hex;
46 using std::list;
47 using std::map;
48 using std::make_pair;
49 using std::ostream;
50 using std::ostringstream;
51 using std::pair;
52 using std::set;
53 using std::string;
54 using std::string_view;
55 using std::stringstream;
56 using std::to_string;
57 using std::vector;
58
59 using ceph::bufferlist;
60 using ceph::decode;
61 using ceph::encode;
62 using ceph::ErasureCodeInterfaceRef;
63 using ceph::ErasureCodeProfile;
64 using ceph::Formatter;
65 using ceph::JSONFormatter;
66 using ceph::make_message;
67 using ceph::mono_clock;
68 using ceph::mono_time;
69
70 #define dout_subsys ceph_subsys_mon
71 #undef dout_prefix
72 #define dout_prefix _prefix(_dout, mon, get_fsmap())
73 static ostream& _prefix(std::ostream *_dout, Monitor &mon, const FSMap& fsmap) {
74 return *_dout << "mon." << mon.name << "@" << mon.rank
75 << "(" << mon.get_state_name()
76 << ").mds e" << fsmap.get_epoch() << " ";
77 }
78
// Store prefixes owned by this service in addition to its service name:
// both are registered in get_store_prefixes(), and MDS_HEALTH_PREFIX is the
// prefix under which encode_pending() puts/erases per-daemon health blobs.
static const std::string MDS_METADATA_PREFIX = "mds_metadata";
static const std::string MDS_HEALTH_PREFIX = "mds_health";
81
82
83 /*
84 * Specialized implementation of cmd_getval to allow us to parse
85 * out strongly-typedef'd types
86 */
87 namespace TOPNSPC::common {
88 template<> bool cmd_getval(const cmdmap_t& cmdmap,
89 std::string_view k, mds_gid_t &val)
90 {
91 return cmd_getval(cmdmap, k, (int64_t&)val);
92 }
93
94 template<> bool cmd_getval(const cmdmap_t& cmdmap,
95 std::string_view k, mds_rank_t &val)
96 {
97 return cmd_getval(cmdmap, k, (int64_t&)val);
98 }
99
100 template<> bool cmd_getval(const cmdmap_t& cmdmap,
101 std::string_view k, MDSMap::DaemonState &val)
102 {
103 return cmd_getval(cmdmap, k, (int64_t&)val);
104 }
105 }
106 // my methods
107
// Log the FSMap `m` at debug level `dblV` (a compile-time template argument).
// The map is printed straight onto *_dout between the opening dout() and the
// closing dendl, so these three statements have to stay in this order.
108 template <int dblV>
109 void MDSMonitor::print_map(const FSMap& m)
110 {
111 dout(dblV) << "print_map\n";
112 m.print(*_dout);
113 *_dout << dendl;
114 }
115
116 // service methods
// Service hook for creating initial state. This implementation creates
// nothing; it only records the call at debug level 10.
117 void MDSMonitor::create_initial()
118 {
119 dout(10) << "create_initial" << dendl;
120 }
121
122 void MDSMonitor::get_store_prefixes(std::set<string>& s) const
123 {
124 s.insert(service_name);
125 s.insert(MDS_METADATA_PREFIX);
126 s.insert(MDS_HEALTH_PREFIX);
127 }
128
129 void MDSMonitor::update_from_paxos(bool *need_bootstrap)
130 {
131 version_t version = get_last_committed();
132 if (version == get_fsmap().epoch)
133 return;
134
135 dout(10) << __func__ << " version " << version
136 << ", my e " << get_fsmap().epoch << dendl;
137 ceph_assert(version > get_fsmap().epoch);
138
139 load_health();
140
141 // read and decode
142 bufferlist fsmap_bl;
143 fsmap_bl.clear();
144 int err = get_version(version, fsmap_bl);
145 ceph_assert(err == 0);
146
147 ceph_assert(fsmap_bl.length() > 0);
148 dout(10) << __func__ << " got " << version << dendl;
149 try {
150 PaxosFSMap::decode(fsmap_bl);
151 } catch (const ceph::buffer::malformed_input& e) {
152 derr << "unable to decode FSMap: " << e.what() << dendl;
153 throw;
154 }
155
156 // new map
157 dout(0) << "new map" << dendl;
158 print_map<0>(get_fsmap());
159 if (!g_conf()->mon_mds_skip_sanity) {
160 get_fsmap().sanity();
161 }
162
163 check_subs();
164 }
165
166 void MDSMonitor::init()
167 {
168 (void)load_metadata(pending_metadata);
169 }
170
171 void MDSMonitor::create_pending()
172 {
173 auto &fsmap = PaxosFSMap::create_pending();
174
175 if (mon.osdmon()->is_readable()) {
176 const auto &osdmap = mon.osdmon()->osdmap;
177 fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
178 }
179
180 dout(10) << "create_pending e" << fsmap.epoch << dendl;
181 }
182
183 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
184 {
185 auto &pending = get_pending_fsmap_writeable();
186 auto &epoch = pending.epoch;
187
188 dout(10) << "encode_pending e" << epoch << dendl;
189
190 // print map iff 'debug mon = 30' or higher
191 print_map<30>(pending);
192 if (!g_conf()->mon_mds_skip_sanity) {
193 pending.sanity(true);
194 }
195
196 // Set 'modified' on maps modified this epoch
197 for (auto &p : pending.filesystems) {
198 if (p.second->mds_map.epoch == epoch) {
199 p.second->mds_map.modified = ceph_clock_now();
200 }
201 }
202
203 // apply to paxos
204 ceph_assert(get_last_committed() + 1 == pending.epoch);
205 bufferlist pending_bl;
206 pending.encode(pending_bl, mon.get_quorum_con_features());
207
208 /* put everything in the transaction */
209 put_version(t, pending.epoch, pending_bl);
210 put_last_committed(t, pending.epoch);
211
212 // Encode MDSHealth data
213 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
214 i != pending_daemon_health.end(); ++i) {
215 bufferlist bl;
216 i->second.encode(bl);
217 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
218 }
219
220 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
221 i != pending_daemon_health_rm.end(); ++i) {
222 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
223 }
224 pending_daemon_health_rm.clear();
225 remove_from_metadata(pending, t);
226
227 // health
228 health_check_map_t new_checks;
229 const auto &info_map = pending.get_mds_info();
230 for (const auto &i : info_map) {
231 const auto &gid = i.first;
232 const auto &info = i.second;
233 if (pending_daemon_health_rm.count(gid)) {
234 continue;
235 }
236 MDSHealth health;
237 auto p = pending_daemon_health.find(gid);
238 if (p != pending_daemon_health.end()) {
239 health = p->second;
240 } else {
241 bufferlist bl;
242 mon.store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
243 if (!bl.length()) {
244 derr << "Missing health data for MDS " << gid << dendl;
245 continue;
246 }
247 auto bl_i = bl.cbegin();
248 health.decode(bl_i);
249 }
250 for (const auto &metric : health.metrics) {
251 const auto rank = info.rank;
252 health_check_t *check = &new_checks.get_or_add(
253 mds_metric_name(metric.type),
254 metric.sev,
255 mds_metric_summary(metric.type),
256 1);
257 ostringstream ss;
258 ss << "mds." << info.name << "(mds." << rank << "): " << metric.message;
259 bool first = true;
260 for (auto &p : metric.metadata) {
261 if (first) {
262 ss << " ";
263 } else {
264 ss << ", ";
265 }
266 ss << p.first << ": " << p.second;
267 first = false;
268 }
269 check->detail.push_back(ss.str());
270 }
271 }
272 pending.get_health_checks(&new_checks);
273 for (auto& p : new_checks.checks) {
274 p.second.summary = std::regex_replace(
275 p.second.summary,
276 std::regex("%num%"),
277 stringify(p.second.detail.size()));
278 p.second.summary = std::regex_replace(
279 p.second.summary,
280 std::regex("%plurals%"),
281 p.second.detail.size() > 1 ? "s" : "");
282 p.second.summary = std::regex_replace(
283 p.second.summary,
284 std::regex("%isorare%"),
285 p.second.detail.size() > 1 ? "are" : "is");
286 p.second.summary = std::regex_replace(
287 p.second.summary,
288 std::regex("%hasorhave%"),
289 p.second.detail.size() > 1 ? "have" : "has");
290 }
291 encode_health(new_checks, t);
292 }
293
294 version_t MDSMonitor::get_trim_to() const
295 {
296 version_t floor = 0;
297 if (g_conf()->mon_mds_force_trim_to > 0 &&
298 g_conf()->mon_mds_force_trim_to <= (int)get_last_committed()) {
299 floor = g_conf()->mon_mds_force_trim_to;
300 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
301 << floor << dendl;
302 }
303
304 unsigned max = g_conf()->mon_max_mdsmap_epochs;
305 version_t last = get_last_committed();
306
307 if (last - get_first_committed() > max && floor < last - max) {
308 floor = last-max;
309 }
310
311 dout(20) << __func__ << " = " << floor << dendl;
312 return floor;
313 }
314
315 bool MDSMonitor::preprocess_query(MonOpRequestRef op)
316 {
317 op->mark_mdsmon_event(__func__);
318 auto m = op->get_req<PaxosServiceMessage>();
319 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source()
320 << " " << m->get_orig_source_addrs() << dendl;
321
322 switch (m->get_type()) {
323
324 case MSG_MDS_BEACON:
325 return preprocess_beacon(op);
326
327 case MSG_MON_COMMAND:
328 try {
329 return preprocess_command(op);
330 } catch (const bad_cmd_get& e) {
331 bufferlist bl;
332 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
333 return true;
334 }
335
336 case MSG_MDS_OFFLOAD_TARGETS:
337 return preprocess_offload_targets(op);
338
339 default:
340 ceph_abort();
341 return true;
342 }
343 }
344
345 void MDSMonitor::_note_beacon(MMDSBeacon *m)
346 {
347 mds_gid_t gid = mds_gid_t(m->get_global_id());
348 version_t seq = m->get_seq();
349
350 dout(5) << "_note_beacon " << *m << " noting time" << dendl;
351 auto &beacon = last_beacon[gid];
352 beacon.stamp = mono_clock::now();
353 beacon.seq = seq;
354 }
355
// Read-only first pass over an MMDSBeacon.
//
// Returns true when the beacon has been dealt with here: either a reply was
// sent (label `reply`, or the null MDSMap for an unknown non-booting gid) or
// it was dropped via mon.no_reply() (label `ignore`).
// Returns false when the beacon has to go further: this monitor is not the
// leader, or the beacon implies a change to the map / stored health (the
// comments below call this "need to update map" / "require proposal").
//
// All locals are declared at the top because the function jumps to the
// `reply` and `ignore` labels with goto.
356 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
357 {
358 op->mark_mdsmon_event(__func__);
359 auto m = op->get_req<MMDSBeacon>();
360 MDSMap::DaemonState state = m->get_state();
361 mds_gid_t gid = m->get_global_id();
362 version_t seq = m->get_seq();
363 MDSMap::mds_info_t info;
364 epoch_t effective_epoch = 0;
365
366 const auto &fsmap = get_fsmap();
367
368 // check privileges, ignore if fails
369 MonSession *session = op->get_session();
370 if (!session)
371 goto ignore;
372 if (!session->is_capable("mds", MON_CAP_X)) {
373 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
374 << session->caps << dendl;
375 goto ignore;
376 }
377
// beacons addressed to a different cluster fsid are dropped
378 if (m->get_fsid() != mon.monmap->fsid) {
379 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon.monmap->fsid << dendl;
380 goto ignore;
381 }
382
383 dout(5) << "preprocess_beacon " << *m
384 << " from " << m->get_orig_source()
385 << " " << m->get_orig_source_addrs()
386 << " " << m->get_compat()
387 << dendl;
388
389 // make sure the address has a port
390 if (m->get_orig_source_addr().get_port() == 0) {
391 dout(1) << " ignoring boot message without a port" << dendl;
392 goto ignore;
393 }
394
395 // fw to leader?
396 if (!is_leader())
397 return false;
398
399 // booted, but not in map?
400 if (!fsmap.gid_exists(gid)) {
401 if (state != MDSMap::STATE_BOOT) {
402 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
403 << ceph_mds_state_name(state) << ")" << dendl;
404
405 /* We can't send an MDSMap this MDS was a part of because we no longer
406 * know which FS it was part of. Nor does this matter. Sending an empty
407 * MDSMap is sufficient for getting the MDS to respawn.
408 */
// note: this `m` is the reply message and shadows the beacon `m` above
409 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
410 mon.send_reply(op, m.detach());
411 return true;
412 } else {
413 return false; // not booted yet.
414 }
415 }
416 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
// from here on `info` is a copy of the daemon's entry in the committed map
417 info = fsmap.get_info_gid(gid);
418
419 if (state == MDSMap::STATE_DNE) {
420 return false;
421 }
422
423 // old seq?
424 if (info.state_seq > seq) {
425 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
426 goto ignore;
427 }
428
429 // Work out the latest epoch that this daemon should have seen
// (braces limit the scope of fscid so that `goto reply` does not jump
// across its initialisation)
430 {
431 fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
432 if (fscid == FS_CLUSTER_ID_NONE) {
433 effective_epoch = fsmap.standby_epochs.at(gid);
434 } else {
435 effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
436 }
437 if (effective_epoch != m->get_last_epoch_seen()) {
438 dout(10) << "mds_beacon " << *m
439 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
440 goto reply;
441 }
442 }
443
444 if (info.laggy()) {
445 _note_beacon(m);
446 return false; // no longer laggy, need to update map.
447 }
448 if (state == MDSMap::STATE_BOOT) {
449 // ignore, already booted.
450 goto ignore;
451 }
452
453 // did the join_fscid change
// (the beacon names a filesystem via get_fs(); an unknown or empty name
// maps to FS_CLUSTER_ID_NONE)
454 if (m->get_fs().size()) {
455 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
456 auto f = fsmap.get_filesystem(m->get_fs());
457 if (f) {
458 fscid = f->fscid;
459 }
460 if (info.join_fscid != fscid) {
461 dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
462 << " (" << m->get_fs() << ")" << dendl;
463 _note_beacon(m);
464 return false;
465 }
466 } else {
467 if (info.join_fscid != FS_CLUSTER_ID_NONE) {
468 dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
469 _note_beacon(m);
470 return false;
471 }
472 }
473
474 // is there a state change here?
475 if (info.state != state) {
476 _note_beacon(m);
477 return false;
478 }
479
480 // Comparing known daemon health with m->get_health()
481 // and return false (i.e. require proposal) if they
482 // do not match, to update our stored
// NOTE(review): pending_daemon_health is a std::map, so operator[] below
// default-inserts an entry when gid is absent, i.e. this read-only pass can
// add an empty MDSHealth to the map -- confirm that is intended.
483 if (!(pending_daemon_health[gid] == m->get_health())) {
484 dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
485 _note_beacon(m);
486 return false;
487 }
488
// reply: reached by falling through every check above (nothing changed) or
// by goto when the daemon has not yet seen effective_epoch. The beacon is
// echoed back carrying effective_epoch.
489 reply:
490 // note time and reply
491 ceph_assert(effective_epoch > 0);
492 _note_beacon(m);
493 {
494 auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
495 m->get_global_id(), m->get_name(), effective_epoch,
496 state, seq, CEPH_FEATURES_SUPPORTED_DEFAULT);
497 mon.send_reply(op, beacon.detach());
498 }
499 return true;
500
501 ignore:
502 // I won't reply this beacon, drop it.
503 mon.no_reply(op);
504 return true;
505 }
506
507 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
508 {
509 op->mark_mdsmon_event(__func__);
510 auto m = op->get_req<MMDSLoadTargets>();
511 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
512
513 const auto &fsmap = get_fsmap();
514
515 // check privileges, ignore message if fails
516 MonSession *session = op->get_session();
517 if (!session)
518 goto ignore;
519 if (!session->is_capable("mds", MON_CAP_X)) {
520 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
521 << session->caps << dendl;
522 goto ignore;
523 }
524
525 if (fsmap.gid_exists(m->global_id) &&
526 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
527 goto ignore;
528
529 return false;
530
531 ignore:
532 mon.no_reply(op);
533 return true;
534 }
535
536
537 bool MDSMonitor::prepare_update(MonOpRequestRef op)
538 {
539 op->mark_mdsmon_event(__func__);
540 auto m = op->get_req<PaxosServiceMessage>();
541 dout(7) << "prepare_update " << *m << dendl;
542
543 switch (m->get_type()) {
544
545 case MSG_MDS_BEACON:
546 return prepare_beacon(op);
547
548 case MSG_MON_COMMAND:
549 try {
550 return prepare_command(op);
551 } catch (const bad_cmd_get& e) {
552 bufferlist bl;
553 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
554 return true;
555 }
556
557 case MSG_MDS_OFFLOAD_TARGETS:
558 return prepare_offload_targets(op);
559
560 default:
561 ceph_abort();
562 }
563
564 return true;
565 }
566
// Update stage for an MMDSBeacon: apply what the beacon reports to the
// pending FSMap and to the pending per-daemon health.
//
// Outline:
//  1. log health metric types that appeared / disappeared and store the
//     beacon's health in pending_daemon_health[gid];
//  2. STATE_BOOT: optionally fail older daemons with the same name, add the
//     daemon to the pending map as STANDBY (if its gid is not there yet) and
//     start its beacon timer;
//  3. any other state: validate the requested transition and record it, with
//     special handling for STOPPED, DAMAGED and DNE;
//  4. queue a completion that replies once the proposal has finished.
//
// Labels: `evict` fails the sending gid in the pending map and then falls
// into `null`, which queues a null-MDSMap reply.
//
// Return value as visible here: true after the change has been applied and
// (on most paths) a completion queued; false when the op was parked on the
// OSDMonitor becoming writeable and will be retried via C_RetryMessage.
567 bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
568 {
569 op->mark_mdsmon_event(__func__);
570 auto m = op->get_req<MMDSBeacon>();
571 // -- this is an update --
572 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source()
573 << " " << m->get_orig_source_addrs() << dendl;
574 entity_addrvec_t addrs = m->get_orig_source_addrs();
575 mds_gid_t gid = m->get_global_id();
576 MDSMap::DaemonState state = m->get_state();
577 version_t seq = m->get_seq();
578
579 auto &pending = get_pending_fsmap_writeable();
580
581 dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
582
583 // Calculate deltas of health metrics created and removed
584 // Do this by type rather than MDSHealthMetric equality, because messages can
585 // change a lot when they include e.g. a number of items.
586 const auto &old_health = pending_daemon_health[gid].metrics;
587 const auto &new_health = m->get_health().metrics;
588
589 std::set<mds_metric_t> old_types;
590 for (const auto &i : old_health) {
591 old_types.insert(i.type);
592 }
593
594 std::set<mds_metric_t> new_types;
595 for (const auto &i : new_health) {
596 new_types.insert(i.type);
597 }
598
// metric types that are new in this beacon are logged at debug level 10
599 for (const auto &new_metric: new_health) {
600 if (old_types.count(new_metric.type) == 0) {
601 dout(10) << "MDS health message (" << m->get_orig_source()
602 << "): " << new_metric.sev << " " << new_metric.message << dendl;
603 }
604 }
605
606 // Log the disappearance of health messages at INFO
607 for (const auto &old_metric : old_health) {
608 if (new_types.count(old_metric.type) == 0) {
609 mon.clog->info() << "MDS health message cleared ("
610 << m->get_orig_source() << "): " << old_metric.message;
611 }
612 }
613
614 // Store health
// (old_health above refers into this same map entry, so it must not be
// used after this assignment)
615 pending_daemon_health[gid] = m->get_health();
616
617 const auto& cs = m->get_compat();
// BOOT beacons register the daemon; every other state is treated as a
// state update in the else branch.
618 if (state == MDSMap::STATE_BOOT) {
619 // zap previous instance of this name?
620 if (g_conf()->mds_enforce_unique_name) {
621 bool failed_mds = false;
622 while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
// fail_mds_gid is only called with a writeable OSDMonitor; otherwise the
// op is parked and retried later
623 if (!mon.osdmon()->is_writeable()) {
624 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
625 return false;
626 }
627 const auto& existing_info = pending.get_info_gid(existing);
628 mon.clog->info() << existing_info.human_name() << " restarted";
629 fail_mds_gid(pending, existing);
630 failed_mds = true;
631 }
632 if (failed_mds) {
633 ceph_assert(mon.osdmon()->is_writeable());
634 request_proposal(mon.osdmon());
635 }
636 }
637
638 // Add this daemon to the map
639 if (pending.mds_roles.count(gid) == 0) {
640 MDSMap::mds_info_t new_info;
641 new_info.global_id = gid;
642 new_info.name = m->get_name();
643 new_info.addrs = addrs;
644 new_info.mds_features = m->get_mds_features();
645 new_info.state = MDSMap::STATE_STANDBY;
646 new_info.state_seq = seq;
647 new_info.compat = cs;
648 if (m->get_fs().size()) {
649 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
650 auto f = pending.get_filesystem(m->get_fs());
651 if (f) {
652 fscid = f->fscid;
653 }
654 new_info.join_fscid = fscid;
655 }
656 pending.insert(new_info);
657 }
658
659 // initialize the beacon timer
660 auto &beacon = last_beacon[gid];
661 beacon.stamp = mono_clock::now();
662 beacon.seq = seq;
663
664 update_metadata(m->get_global_id(), m->get_sys_info());
665 } else {
666 // state update
667
668 if (!pending.gid_exists(gid)) {
669 /* gid has been removed from pending, send null map */
670 dout(5) << "mds_beacon " << *m << " is not in fsmap (state "
671 << ceph_mds_state_name(state) << ")" << dendl;
672
673 /* We can't send an MDSMap this MDS was a part of because we no longer
674 * know which FS it was part of. Nor does this matter. Sending an empty
675 * MDSMap is sufficient for getting the MDS to respawn.
676 */
677 goto null;
678 }
679
680 const auto& info = pending.get_info_gid(gid);
681
682 // did the reported compat change? That's illegal!
683 if (cs.compare(info.compat) != 0) {
684 if (!mon.osdmon()->is_writeable()) {
685 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
686 return false;
687 }
688 mon.clog->warn() << info.human_name() << " compat changed unexpectedly";
689 fail_mds_gid(pending, gid);
690 request_proposal(mon.osdmon());
// NOTE(review): unlike the `evict` path, this return queues no reply or
// completion for op -- confirm that is intended.
691 return true;
692 }
693
694 // legal state change?
695 if ((info.state == MDSMap::STATE_STANDBY && state > 0) ||
696 (info.state == MDSMap::STATE_STANDBY_REPLAY && state > 0 && state != MDSMap::STATE_DAMAGED)) {
697 /* N.B.: standby-replay can indicate the rank is damaged due to failure to replay */
698 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
699 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
700 goto evict;
701 } else if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
702 && info.rank != MDS_RANK_NONE)
703 {
704 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
705 "held rank " << info.rank << " while requesting state "
706 << ceph_mds_state_name(state) << dendl;
707 goto evict;
708 } else if (info.state == MDSMap::STATE_STOPPING &&
709 state != MDSMap::STATE_STOPPING &&
710 state != MDSMap::STATE_STOPPED) {
711 // we can't transition to any other states from STOPPING
712 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
713 << dendl;
714 goto evict;
715 }
716
717 if (info.laggy()) {
718 dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl;
719 pending.modify_daemon(info.global_id, [](auto& info)
720 {
721 info.clear_laggy();
722 }
723 );
724 }
725
726 dout(5) << "prepare_beacon mds." << info.rank
727 << " " << ceph_mds_state_name(info.state)
728 << " -> " << ceph_mds_state_name(state)
729 << dendl;
730
// record which filesystem (if any) the daemon asks to join; an unknown or
// empty name is stored as FS_CLUSTER_ID_NONE
731 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
732 if (m->get_fs().size()) {
733 auto f = pending.get_filesystem(m->get_fs());
734 if (f) {
735 fscid = f->fscid;
736 }
737 }
738 pending.modify_daemon(gid, [fscid](auto& info) {
739 info.join_fscid = fscid;
740 });
741
742 if (state == MDSMap::STATE_STOPPED) {
// note: this fscid (the daemon's current role) shadows the join fscid above
743 const auto fscid = pending.mds_roles.at(gid);
744 const auto &fs = pending.get_filesystem(fscid);
745
746 mon.clog->info() << info.human_name() << " finished "
747 << "stopping rank " << info.rank << " in filesystem "
748 << fs->mds_map.fs_name << " (now has "
749 << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
750
751 auto erased = pending.stop(gid);
752 erased.push_back(gid);
753
// forget beacon timers and health for every gid removed by the stop
754 for (const auto& erased_gid : erased) {
755 last_beacon.erase(erased_gid);
756 if (pending_daemon_health.count(erased_gid)) {
757 pending_daemon_health.erase(erased_gid);
758 pending_daemon_health_rm.insert(erased_gid);
759 }
760 }
761 } else if (state == MDSMap::STATE_DAMAGED) {
762 if (!mon.osdmon()->is_writeable()) {
763 dout(1) << __func__ << ": DAMAGED from rank " << info.rank
764 << " waiting for osdmon writeable to blocklist it" << dendl;
765 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
766 return false;
767 }
768
769 auto rank = info.rank;
770
771 // Record this MDS rank as damaged, so that other daemons
772 // won't try to run it.
773 dout(0) << __func__ << ": marking rank " << rank << " damaged" << dendl;
774
775 auto fs = pending.get_filesystem(gid);
776 auto rankgid = fs->mds_map.get_gid(rank);
777 auto rankinfo = pending.get_info_gid(rankgid);
778 auto followergid = fs->mds_map.get_standby_replay(rank);
779
// the report must come from the rank holder or from its standby-replay
780 ceph_assert(gid == rankgid || gid == followergid);
781
782 utime_t until = ceph_clock_now();
783 until += g_conf().get_val<double>("mon_mds_blocklist_interval");
784 const auto blocklist_epoch = mon.osdmon()->blocklist(rankinfo.addrs, until);
785 if (followergid != MDS_GID_NONE) {
786 fail_mds_gid(pending, followergid);
787 last_beacon.erase(followergid);
788 }
789 request_proposal(mon.osdmon());
790 pending.damaged(rankgid, blocklist_epoch);
791 last_beacon.erase(rankgid);
792
793 /* MDS expects beacon reply back */
794 } else if (state == MDSMap::STATE_DNE) {
795 dout(1) << __func__ << ": DNE from " << info << dendl;
796 goto evict;
797 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
798 // Standby daemons should never modify their own
799 // state. Reject any attempts to do so.
800 derr << "standby " << gid << " attempted to change state to "
801 << ceph_mds_state_name(state) << ", rejecting" << dendl;
802 goto evict;
803 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
804 !MDSMap::state_transition_valid(info.state, state)) {
805 // Validate state transitions for daemons that hold a rank
806 derr << "daemon " << gid << " (rank " << info.rank << ") "
807 << "reported invalid state transition "
808 << ceph_mds_state_name(info.state) << " -> "
809 << ceph_mds_state_name(state) << dendl;
810 goto evict;
811 } else {
812 if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
813 const auto &fscid = pending.mds_roles.at(gid);
814 const auto &fs = pending.get_filesystem(fscid);
815 mon.clog->info() << info.human_name() << " is now active in "
816 << "filesystem " << fs->mds_map.fs_name << " as rank "
817 << info.rank;
818 }
819
820 // Made it through special cases and validations, record the
821 // daemon's reported state to the FSMap.
822 pending.modify_daemon(gid, [state, seq](auto& info) {
823 info.state = state;
824 info.state_seq = seq;
825 });
826 }
827 }
828
829 dout(5) << "prepare_beacon pending map now:" << dendl;
830 print_map(pending);
831
// Completion for the normal path: r >= 0 sends the regular reply through
// _updated(); -ECANCELED drops the op without a reply; any other error
// re-dispatches the op.
832 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
833 if (r >= 0)
834 _updated(op); // success
835 else if (r == -ECANCELED) {
836 mon.no_reply(op);
837 } else {
838 dispatch(op); // try again
839 }
840 }));
841
842 return true;
843
// evict: fail the sending gid in the pending map (this needs a writeable
// OSDMonitor, otherwise the op is parked and retried), then continue at
// `null` to queue the null-map reply.
844 evict:
845 if (!mon.osdmon()->is_writeable()) {
846 dout(1) << __func__ << ": waiting for writeable OSDMap to evict" << dendl;
847 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
848 return false;
849 }
850
851 fail_mds_gid(pending, gid);
852 request_proposal(mon.osdmon());
853 dout(5) << __func__ << ": pending map now:" << dendl;
854 print_map(pending);
855
856 goto null;
857
// null: once the proposal has finished, answer with an empty MDSMap; on any
// error (this lambda has no separate -ECANCELED case) the op is
// re-dispatched.
858 null:
859 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
860 if (r >= 0) {
861 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
862 mon.send_reply(op, m.detach());
863 } else {
864 dispatch(op); // try again
865 }
866 }));
867
868 return true;
869 }
870
871 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
872 {
873 auto &pending = get_pending_fsmap_writeable();
874
875 op->mark_mdsmon_event(__func__);
876 auto m = op->get_req<MMDSLoadTargets>();
877 mds_gid_t gid = m->global_id;
878 if (pending.gid_has_rank(gid)) {
879 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
880 pending.update_export_targets(gid, m->targets);
881 } else {
882 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
883 }
884 mon.no_reply(op);
885 return true;
886 }
887
888 bool MDSMonitor::should_propose(double& delay)
889 {
890 // delegate to PaxosService to assess whether we should propose
891 return PaxosService::should_propose(delay);
892 }
893
894 void MDSMonitor::_updated(MonOpRequestRef op)
895 {
896 const auto &fsmap = get_fsmap();
897 op->mark_mdsmon_event(__func__);
898 auto m = op->get_req<MMDSBeacon>();
899 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
900 mon.clog->debug() << m->get_orig_source() << " "
901 << m->get_orig_source_addrs() << " "
902 << ceph_mds_state_name(m->get_state());
903
904 if (m->get_state() == MDSMap::STATE_STOPPED) {
905 // send the map manually (they're out of the map, so they won't get it automatic)
906 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
907 mon.send_reply(op, m.detach());
908 } else {
909 auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
910 m->get_global_id(), m->get_name(), fsmap.get_epoch(),
911 m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT);
912 mon.send_reply(op, beacon.detach());
913 }
914 }
915
916 void MDSMonitor::on_active()
917 {
918 tick();
919
920 if (is_leader()) {
921 mon.clog->debug() << "fsmap " << get_fsmap();
922 }
923 }
924
925 void MDSMonitor::dump_info(Formatter *f)
926 {
927 f->open_object_section("fsmap");
928 get_fsmap().dump(f);
929 f->close_section();
930
931 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
932 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
933 }
934
935 bool MDSMonitor::preprocess_command(MonOpRequestRef op)
936 {
937 op->mark_mdsmon_event(__func__);
938 auto m = op->get_req<MMonCommand>();
939 int r = -1;
940 bufferlist rdata;
941 stringstream ss, ds;
942
943 cmdmap_t cmdmap;
944 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
945 // ss has reason for failure
946 string rs = ss.str();
947 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
948 return true;
949 }
950
951 string prefix;
952 cmd_getval(cmdmap, "prefix", prefix);
953 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
954 std::unique_ptr<Formatter> f(Formatter::create(format));
955
956 MonSession *session = op->get_session();
957 if (!session) {
958 mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
959 return true;
960 }
961
962 // to use const qualifier filter fsmap beforehand
963 FSMap _fsmap_copy = get_fsmap();
964 _fsmap_copy.filter(session->get_allowed_fs_names());
965 const auto& fsmap = _fsmap_copy;
966
967 if (prefix == "mds stat") {
968 if (f) {
969 f->open_object_section("mds_stat");
970 dump_info(f.get());
971 f->close_section();
972 f->flush(ds);
973 } else {
974 ds << fsmap;
975 }
976 r = 0;
977 } else if (prefix == "mds ok-to-stop") {
978 vector<string> ids;
979 if (!cmd_getval(cmdmap, "ids", ids)) {
980 r = -EINVAL;
981 ss << "must specify mds id";
982 goto out;
983 }
984 if (fsmap.is_any_degraded()) {
985 ss << "one or more filesystems is currently degraded";
986 r = -EBUSY;
987 goto out;
988 }
989 set<mds_gid_t> stopping;
990 for (auto& id : ids) {
991 ostringstream ess;
992 mds_gid_t gid = gid_from_arg(fsmap, id, ess);
993 if (gid == MDS_GID_NONE) {
994 // the mds doesn't exist, but no file systems are unhappy, so losing it
995 // can't have any effect.
996 continue;
997 }
998 stopping.insert(gid);
999 }
1000 set<mds_gid_t> active;
1001 set<mds_gid_t> standby;
1002 for (auto gid : stopping) {
1003 if (fsmap.gid_has_rank(gid)) {
1004 // ignore standby-replay daemons (at this level)
1005 if (!fsmap.is_standby_replay(gid)) {
1006 auto standby = fsmap.get_standby_replay(gid);
1007 if (standby == MDS_GID_NONE ||
1008 stopping.count(standby)) {
1009 // no standby-replay, or we're also stopping the standby-replay
1010 // for this mds
1011 active.insert(gid);
1012 }
1013 }
1014 } else {
1015 // net loss of a standby
1016 standby.insert(gid);
1017 }
1018 }
1019 if (fsmap.get_num_standby() - standby.size() < active.size()) {
1020 r = -EBUSY;
1021 ss << "insufficent standby MDS daemons to stop active gids "
1022 << stringify(active)
1023 << " and/or standby gids " << stringify(standby);;
1024 goto out;
1025 }
1026 r = 0;
1027 ss << "should be safe to stop " << ids;
1028 } else if (prefix == "fs dump") {
1029 int64_t epocharg;
1030 epoch_t epoch;
1031
1032 const FSMap *fsmapp = &fsmap;
1033 FSMap dummy;
1034 if (cmd_getval(cmdmap, "epoch", epocharg)) {
1035 epoch = epocharg;
1036 bufferlist b;
1037 int err = get_version(epoch, b);
1038 if (err == -ENOENT) {
1039 r = -ENOENT;
1040 goto out;
1041 } else {
1042 ceph_assert(err == 0);
1043 ceph_assert(b.length());
1044 dummy.decode(b);
1045 fsmapp = &dummy;
1046 }
1047 }
1048
1049 stringstream ds;
1050 if (f != NULL) {
1051 f->open_object_section("fsmap");
1052 fsmapp->dump(f.get());
1053 f->close_section();
1054 f->flush(ds);
1055 r = 0;
1056 } else {
1057 fsmapp->print(ds);
1058 r = 0;
1059 }
1060
1061 rdata.append(ds);
1062 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
1063 } else if (prefix == "mds metadata") {
1064 if (!f)
1065 f.reset(Formatter::create("json-pretty"));
1066
1067 string who;
1068 bool all = !cmd_getval(cmdmap, "who", who);
1069 dout(1) << "all = " << all << dendl;
1070 if (all) {
1071 r = 0;
1072 // Dump all MDSs' metadata
1073 const auto all_info = fsmap.get_mds_info();
1074
1075 f->open_array_section("mds_metadata");
1076 for(const auto &i : all_info) {
1077 const auto &info = i.second;
1078
1079 f->open_object_section("mds");
1080 f->dump_string("name", info.name);
1081 std::ostringstream get_err;
1082 r = dump_metadata(fsmap, info.name, f.get(), get_err);
1083 if (r == -EINVAL || r == -ENOENT) {
1084 // Drop error, list what metadata we do have
1085 dout(1) << get_err.str() << dendl;
1086 r = 0;
1087 } else if (r != 0) {
1088 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1089 << dendl;
1090 ss << get_err.str();
1091 f->close_section();
1092 break;
1093 }
1094 f->close_section();
1095 }
1096 f->close_section();
1097 } else {
1098 // Dump a single daemon's metadata
1099 f->open_object_section("mds_metadata");
1100 r = dump_metadata(fsmap, who, f.get(), ss);
1101 f->close_section();
1102 }
1103 f->flush(ds);
1104 } else if (prefix == "mds versions") {
1105 if (!f)
1106 f.reset(Formatter::create("json-pretty"));
1107 count_metadata("ceph_version", f.get());
1108 f->flush(ds);
1109 r = 0;
1110 } else if (prefix == "mds count-metadata") {
1111 if (!f)
1112 f.reset(Formatter::create("json-pretty"));
1113 string field;
1114 cmd_getval(cmdmap, "property", field);
1115 count_metadata(field, f.get());
1116 f->flush(ds);
1117 r = 0;
1118 } else if (prefix == "fs compat show") {
1119 string fs_name;
1120 cmd_getval(cmdmap, "fs_name", fs_name);
1121 const auto &fs = fsmap.get_filesystem(fs_name);
1122 if (fs == nullptr) {
1123 ss << "filesystem '" << fs_name << "' not found";
1124 r = -ENOENT;
1125 goto out;
1126 }
1127
1128 if (f) {
1129 f->open_object_section("mds_compat");
1130 fs->mds_map.compat.dump(f.get());
1131 f->close_section();
1132 f->flush(ds);
1133 } else {
1134 ds << fs->mds_map.compat;
1135 }
1136 r = 0;
1137 } else if (prefix == "mds compat show") {
1138 if (f) {
1139 f->open_object_section("mds_compat");
1140 fsmap.default_compat.dump(f.get());
1141 f->close_section();
1142 f->flush(ds);
1143 } else {
1144 ds << fsmap.default_compat;
1145 }
1146 r = 0;
1147 } else if (prefix == "fs get") {
1148 string fs_name;
1149 cmd_getval(cmdmap, "fs_name", fs_name);
1150 const auto &fs = fsmap.get_filesystem(fs_name);
1151 if (fs == nullptr) {
1152 ss << "filesystem '" << fs_name << "' not found";
1153 r = -ENOENT;
1154 } else {
1155 if (f != nullptr) {
1156 f->open_object_section("filesystem");
1157 fs->dump(f.get());
1158 f->close_section();
1159 f->flush(ds);
1160 r = 0;
1161 } else {
1162 fs->print(ds);
1163 r = 0;
1164 }
1165 }
1166 } else if (prefix == "fs ls") {
1167 if (f) {
1168 f->open_array_section("filesystems");
1169 for (const auto &p : fsmap.filesystems) {
1170 const auto &fs = p.second;
1171 f->open_object_section("filesystem");
1172 {
1173 const MDSMap &mds_map = fs->mds_map;
1174 f->dump_string("name", mds_map.fs_name);
1175 /* Output both the names and IDs of pools, for use by
1176 * humans and machines respectively */
1177 f->dump_string("metadata_pool", mon.osdmon()->osdmap.get_pool_name(
1178 mds_map.metadata_pool));
1179 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1180 f->open_array_section("data_pool_ids");
1181 for (const auto &id : mds_map.data_pools) {
1182 f->dump_int("data_pool_id", id);
1183 }
1184 f->close_section();
1185
1186 f->open_array_section("data_pools");
1187 for (const auto &id : mds_map.data_pools) {
1188 const auto &name = mon.osdmon()->osdmap.get_pool_name(id);
1189 f->dump_string("data_pool", name);
1190 }
1191 f->close_section();
1192 }
1193 f->close_section();
1194 }
1195 f->close_section();
1196 f->flush(ds);
1197 } else {
1198 for (const auto &p : fsmap.filesystems) {
1199 const auto &fs = p.second;
1200 const MDSMap &mds_map = fs->mds_map;
1201 const string &md_pool_name = mon.osdmon()->osdmap.get_pool_name(
1202 mds_map.metadata_pool);
1203
1204 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1205 << md_pool_name << ", data pools: [";
1206 for (const auto &id : mds_map.data_pools) {
1207 const string &pool_name = mon.osdmon()->osdmap.get_pool_name(id);
1208 ds << pool_name << " ";
1209 }
1210 ds << "]" << std::endl;
1211 }
1212
1213 if (fsmap.filesystems.empty()) {
1214 ds << "No filesystems enabled" << std::endl;
1215 }
1216 }
1217 r = 0;
1218 } else if (prefix == "fs feature ls") {
1219 if (f) {
1220 f->open_array_section("cephfs_features");
1221 for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
1222 f->open_object_section("feature");
1223 f->dump_int("index", i);
1224 f->dump_string("name", cephfs_feature_name(i));
1225 f->close_section();
1226 }
1227 f->close_section();
1228 f->flush(ds);
1229 } else {
1230 for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
1231 ds << i << " " << cephfs_feature_name(i) << std::endl;
1232 }
1233 }
1234 r = 0;
1235 } else if (prefix == "fs lsflags") {
1236 string fs_name;
1237 cmd_getval(cmdmap, "fs_name", fs_name);
1238 const auto &fs = fsmap.get_filesystem(fs_name);
1239 if (!fs) {
1240 ss << "filesystem '" << fs_name << "' not found";
1241 r = -ENOENT;
1242 } else {
1243 const MDSMap &mds_map = fs->mds_map;
1244 if (f) {
1245 mds_map.dump_flags_state(f.get());
1246 f->flush(ds);
1247 }
1248 else {
1249 mds_map.print_flags(ds);
1250 }
1251 r = 0;
1252 }
1253 }
1254
1255 out:
1256 if (r != -1) {
1257 rdata.append(ds);
1258 string rs;
1259 getline(ss, rs);
1260 mon.reply_command(op, r, rs, rdata, get_last_committed());
1261 return true;
1262 } else
1263 return false;
1264 }
1265
1266 bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
1267 {
1268 const auto& info = fsmap.get_info_gid(gid);
1269 dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
1270
1271 ceph_assert(mon.osdmon()->is_writeable());
1272
1273 epoch_t blocklist_epoch = 0;
1274 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1275 utime_t until = ceph_clock_now();
1276 until += g_conf().get_val<double>("mon_mds_blocklist_interval");
1277 blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until);
1278 }
1279
1280 fsmap.erase(gid, blocklist_epoch);
1281 last_beacon.erase(gid);
1282 if (pending_daemon_health.count(gid)) {
1283 pending_daemon_health.erase(gid);
1284 pending_daemon_health_rm.insert(gid);
1285 }
1286
1287 return blocklist_epoch != 0;
1288 }
1289
1290 mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
1291 {
1292 // Try parsing as a role
1293 mds_role_t role;
1294 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1295 int r = fsmap.parse_role(arg, &role, ignore_err);
1296 if (r == 0) {
1297 // See if a GID is assigned to this role
1298 const auto &fs = fsmap.get_filesystem(role.fscid);
1299 ceph_assert(fs != nullptr); // parse_role ensures it exists
1300 if (fs->mds_map.is_up(role.rank)) {
1301 dout(10) << __func__ << ": validated rank/GID " << role
1302 << " as a rank" << dendl;
1303 return fs->mds_map.get_mds_info(role.rank).global_id;
1304 }
1305 }
1306
1307 // Try parsing as a gid
1308 std::string err;
1309 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1310 if (!err.empty()) {
1311 // Not a role or a GID, try as a daemon name
1312 const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
1313 if (!mds_info) {
1314 ss << "MDS named '" << arg
1315 << "' does not exist, or is not up";
1316 return MDS_GID_NONE;
1317 }
1318 dout(10) << __func__ << ": resolved MDS name '" << arg
1319 << "' to GID " << mds_info->global_id << dendl;
1320 return mds_info->global_id;
1321 } else {
1322 // Not a role, but parses as a an integer, might be a GID
1323 dout(10) << __func__ << ": treating MDS reference '" << arg
1324 << "' as an integer " << maybe_gid << dendl;
1325
1326 if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
1327 return mds_gid_t(maybe_gid);
1328 }
1329 }
1330
1331 dout(1) << __func__ << ": rank/GID " << arg
1332 << " not a existent rank or GID" << dendl;
1333 return MDS_GID_NONE;
1334 }
1335
1336 int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
1337 const std::string &arg, MDSMap::mds_info_t *failed_info)
1338 {
1339 ceph_assert(failed_info != nullptr);
1340
1341 mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
1342 if (gid == MDS_GID_NONE) {
1343 return 0;
1344 }
1345 if (!mon.osdmon()->is_writeable()) {
1346 return -EAGAIN;
1347 }
1348
1349 // Take a copy of the info before removing the MDS from the map,
1350 // so that the caller knows which mds (if any) they ended up removing.
1351 *failed_info = fsmap.get_info_gid(gid);
1352
1353 fail_mds_gid(fsmap, gid);
1354 ss << "failed mds gid " << gid;
1355 ceph_assert(mon.osdmon()->is_writeable());
1356 request_proposal(mon.osdmon());
1357 return 0;
1358 }
1359
1360 bool MDSMonitor::prepare_command(MonOpRequestRef op)
1361 {
1362 op->mark_mdsmon_event(__func__);
1363 auto m = op->get_req<MMonCommand>();
1364 int r = -EINVAL;
1365 stringstream ss;
1366 bufferlist rdata;
1367
1368 cmdmap_t cmdmap;
1369 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1370 string rs = ss.str();
1371 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1372 return true;
1373 }
1374
1375 string prefix;
1376 cmd_getval(cmdmap, "prefix", prefix);
1377
1378 /* Refuse access if message not associated with a valid session */
1379 MonSession *session = op->get_session();
1380 if (!session) {
1381 mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1382 return true;
1383 }
1384
1385 auto &pending = get_pending_fsmap_writeable();
1386
1387 bool batched_propose = false;
1388 for (const auto &h : handlers) {
1389 r = h->can_handle(prefix, op, pending, cmdmap, ss);
1390 if (r == 1) {
1391 ; // pass, since we got the right handler.
1392 } else if (r == 0) {
1393 continue;
1394 } else {
1395 goto out;
1396 }
1397
1398 batched_propose = h->batched_propose();
1399 if (batched_propose) {
1400 paxos.plug();
1401 }
1402 r = h->handle(&mon, pending, op, cmdmap, ss);
1403 if (batched_propose) {
1404 paxos.unplug();
1405 }
1406
1407 if (r == -EAGAIN) {
1408 // message has been enqueued for retry; return.
1409 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1410 return false;
1411 } else {
1412 if (r == 0) {
1413 // On successful updates, print the updated map
1414 print_map(pending);
1415 }
1416 // Successful or not, we're done: respond.
1417 goto out;
1418 }
1419 }
1420
1421 r = filesystem_command(pending, op, prefix, cmdmap, ss);
1422 if (r >= 0) {
1423 goto out;
1424 } else if (r == -EAGAIN) {
1425 // Do not reply, the message has been enqueued for retry
1426 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1427 return false;
1428 } else if (r != -ENOSYS) {
1429 goto out;
1430 }
1431
1432 if (r == -ENOSYS && ss.str().empty()) {
1433 ss << "unrecognized command";
1434 }
1435
1436 out:
1437 dout(4) << __func__ << " done, r=" << r << dendl;
1438 /* Compose response */
1439 string rs;
1440 getline(ss, rs);
1441
1442 if (r >= 0) {
1443 // success.. delay reply
1444 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1445 get_last_committed() + 1));
1446 if (batched_propose) {
1447 force_immediate_propose();
1448 }
1449 return true;
1450 } else {
1451 // reply immediately
1452 mon.reply_command(op, r, rs, rdata, get_last_committed());
1453 return false;
1454 }
1455 }
1456
1457 int MDSMonitor::filesystem_command(
1458 FSMap &fsmap,
1459 MonOpRequestRef op,
1460 std::string const &prefix,
1461 const cmdmap_t& cmdmap,
1462 std::stringstream &ss)
1463 {
1464 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1465 op->mark_mdsmon_event(__func__);
1466 int r = 0;
1467 string whostr;
1468 cmd_getval(cmdmap, "role", whostr);
1469
1470 if (prefix == "mds set_state") {
1471 mds_gid_t gid;
1472 if (!cmd_getval(cmdmap, "gid", gid)) {
1473 ss << "error parsing 'gid' value '"
1474 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
1475 return -EINVAL;
1476 }
1477 MDSMap::DaemonState state;
1478 if (!cmd_getval(cmdmap, "state", state)) {
1479 ss << "error parsing 'state' string value '"
1480 << cmd_vartype_stringify(cmdmap.at("state")) << "'";
1481 return -EINVAL;
1482 }
1483 if (fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
1484 fsmap.modify_daemon(gid, [state](auto& info) {
1485 info.state = state;
1486 });
1487 ss << "set mds gid " << gid << " to state " << state << " "
1488 << ceph_mds_state_name(state);
1489 return 0;
1490 }
1491 } else if (prefix == "mds fail") {
1492 string who;
1493 cmd_getval(cmdmap, "role_or_gid", who);
1494
1495 MDSMap::mds_info_t failed_info;
1496 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1497 if (gid == MDS_GID_NONE) {
1498 ss << "MDS named '" << who << "' does not exist, is not up or you "
1499 << "lack the permission to see.";
1500 return 0;
1501 }
1502 if(!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
1503 ss << "MDS named '" << who << "' does not exist, is not up or you "
1504 << "lack the permission to see.";
1505 return -EINVAL;
1506 }
1507 string_view fs_name = fsmap.fs_name_from_gid(gid);
1508 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1509 ss << "Permission denied.";
1510 return -EPERM;
1511 }
1512
1513 r = fail_mds(fsmap, ss, who, &failed_info);
1514 if (r < 0 && r == -EAGAIN) {
1515 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1516 return -EAGAIN; // don't propose yet; wait for message to be retried
1517 } else if (r == 0) {
1518 // Only log if we really did something (not when was already gone)
1519 if (failed_info.global_id != MDS_GID_NONE) {
1520 mon.clog->info() << failed_info.human_name() << " marked failed by "
1521 << op->get_session()->entity_name;
1522 }
1523 }
1524 } else if (prefix == "mds rm") {
1525 mds_gid_t gid;
1526 if (!cmd_getval(cmdmap, "gid", gid)) {
1527 ss << "error parsing 'gid' value '"
1528 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
1529 return -EINVAL;
1530 }
1531 if (!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
1532 ss << "mds gid " << gid << " does not exist";
1533 return 0;
1534 }
1535 string_view fs_name = fsmap.fs_name_from_gid(gid);
1536 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1537 ss << "Permission denied.";
1538 return -EPERM;
1539 }
1540 const auto &info = fsmap.get_info_gid(gid);
1541 MDSMap::DaemonState state = info.state;
1542 if (state > 0) {
1543 ss << "cannot remove active mds." << info.name
1544 << " rank " << info.rank;
1545 return -EBUSY;
1546 } else {
1547 fsmap.erase(gid, {});
1548 ss << "removed mds gid " << gid;
1549 return 0;
1550 }
1551 } else if (prefix == "mds rmfailed") {
1552 bool confirm = false;
1553 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
1554 if (!confirm) {
1555 ss << "WARNING: this can make your filesystem inaccessible! "
1556 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1557 return -EPERM;
1558 }
1559
1560 std::string role_str;
1561 cmd_getval(cmdmap, "role", role_str);
1562 mds_role_t role;
1563 const auto fs_names = op->get_session()->get_allowed_fs_names();
1564 int r = fsmap.parse_role(role_str, &role, ss, fs_names);
1565 if (r < 0) {
1566 ss << "invalid role '" << role_str << "'";
1567 return -EINVAL;
1568 }
1569 string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
1570 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1571 ss << "Permission denied.";
1572 return -EPERM;
1573 }
1574
1575 fsmap.modify_filesystem(
1576 role.fscid,
1577 [role](std::shared_ptr<Filesystem> fs)
1578 {
1579 fs->mds_map.failed.erase(role.rank);
1580 });
1581
1582 ss << "removed failed mds." << role;
1583 return 0;
1584 /* TODO: convert to fs commands to update defaults */
1585 } else if (prefix == "mds compat rm_compat") {
1586 int64_t f;
1587 if (!cmd_getval(cmdmap, "feature", f)) {
1588 ss << "error parsing feature value '"
1589 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
1590 return -EINVAL;
1591 }
1592 if (fsmap.default_compat.compat.contains(f)) {
1593 ss << "removing compat feature " << f;
1594 fsmap.default_compat.compat.remove(f);
1595 } else {
1596 ss << "compat feature " << f << " not present in " << fsmap.default_compat;
1597 }
1598 r = 0;
1599 } else if (prefix == "mds compat rm_incompat") {
1600 int64_t f;
1601 if (!cmd_getval(cmdmap, "feature", f)) {
1602 ss << "error parsing feature value '"
1603 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
1604 return -EINVAL;
1605 }
1606 if (fsmap.default_compat.incompat.contains(f)) {
1607 ss << "removing incompat feature " << f;
1608 fsmap.default_compat.incompat.remove(f);
1609 } else {
1610 ss << "incompat feature " << f << " not present in " << fsmap.default_compat;
1611 }
1612 r = 0;
1613 } else if (prefix == "mds repaired") {
1614 std::string role_str;
1615 cmd_getval(cmdmap, "role", role_str);
1616 mds_role_t role;
1617 const auto fs_names = op->get_session()->get_allowed_fs_names();
1618 r = fsmap.parse_role(role_str, &role, ss, fs_names);
1619 if (r < 0) {
1620 return r;
1621 }
1622 string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
1623 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1624 ss << "Permission denied.";
1625 return -EPERM;
1626 }
1627
1628 bool modified = fsmap.undamaged(role.fscid, role.rank);
1629 if (modified) {
1630 ss << "repaired: restoring rank " << role;
1631 } else {
1632 ss << "nothing to do: rank is not damaged";
1633 }
1634
1635 r = 0;
1636 } else if (prefix == "mds freeze") {
1637 std::string who;
1638 cmd_getval(cmdmap, "role_or_gid", who);
1639 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1640 if (gid == MDS_GID_NONE) {
1641 return -EINVAL;
1642 }
1643
1644 string_view fs_name = fsmap.fs_name_from_gid(gid);
1645 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1646 ss << "Permission denied.";
1647 return -EPERM;
1648 }
1649
1650 bool freeze = false;
1651 {
1652 std::string str;
1653 cmd_getval(cmdmap, "val", str);
1654 if ((r = parse_bool(str, &freeze, ss)) != 0) {
1655 return r;
1656 }
1657 }
1658
1659 auto f = [freeze,gid,&ss](auto& info) {
1660 if (freeze) {
1661 ss << "freezing mds." << gid;
1662 info.freeze();
1663 } else {
1664 ss << "unfreezing mds." << gid;
1665 info.unfreeze();
1666 }
1667 };
1668 fsmap.modify_daemon(gid, f);
1669 r = 0;
1670 } else {
1671 return -ENOSYS;
1672 }
1673
1674 return r;
1675 }
1676
1677 void MDSMonitor::check_subs()
1678 {
1679 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1680 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1681 // filesystems. Build a list of all the types we service
1682 // subscriptions for.
1683
1684 std::vector<std::string> types = {
1685 "fsmap",
1686 "fsmap.user",
1687 "mdsmap",
1688 };
1689
1690 for (const auto &p : get_fsmap().filesystems) {
1691 const auto &fscid = p.first;
1692 CachedStackStringStream cos;
1693 *cos << "mdsmap." << fscid;
1694 types.push_back(std::string(cos->strv()));
1695 }
1696
1697 for (const auto &type : types) {
1698 auto& subs = mon.session_map.subs;
1699 auto subs_it = subs.find(type);
1700 if (subs_it == subs.end())
1701 continue;
1702 auto sub_it = subs_it->second->begin();
1703 while (!sub_it.end()) {
1704 auto sub = *sub_it;
1705 ++sub_it; // N.B. check_sub may remove sub!
1706 check_sub(sub);
1707 }
1708 }
1709 }
1710
1711
1712 void MDSMonitor::check_sub(Subscription *sub)
1713 {
1714 dout(20) << __func__ << ": " << sub->type << dendl;
1715
1716 // to use const qualifier filter fsmap beforehand
1717 FSMap _fsmap_copy = get_fsmap();
1718 _fsmap_copy.filter(sub->session->get_allowed_fs_names());
1719 const auto& fsmap = _fsmap_copy;
1720 if (sub->next > fsmap.get_epoch()) {
1721 return;
1722 }
1723
1724 if (sub->type == "fsmap") {
1725 sub->session->con->send_message(new MFSMap(mon.monmap->fsid, fsmap));
1726 if (sub->onetime) {
1727 mon.session_map.remove_sub(sub);
1728 } else {
1729 sub->next = fsmap.get_epoch() + 1;
1730 }
1731 } else if (sub->type == "fsmap.user") {
1732 FSMapUser fsmap_u;
1733 fsmap_u.epoch = fsmap.get_epoch();
1734 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1735 for (const auto &p : fsmap.filesystems) {
1736 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
1737 fs_info.cid = p.second->fscid;
1738 fs_info.name = p.second->mds_map.fs_name;
1739 }
1740 sub->session->con->send_message(new MFSMapUser(mon.monmap->fsid, fsmap_u));
1741 if (sub->onetime) {
1742 mon.session_map.remove_sub(sub);
1743 } else {
1744 sub->next = fsmap.get_epoch() + 1;
1745 }
1746 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
1747 const bool is_mds = sub->session->name.is_mds();
1748 mds_gid_t mds_gid = MDS_GID_NONE;
1749 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1750 if (is_mds) {
1751 // What (if any) namespace are you assigned to?
1752 auto mds_info = fsmap.get_mds_info();
1753 for (const auto &p : mds_info) {
1754 if (p.second.addrs == sub->session->addrs) {
1755 mds_gid = p.first;
1756 fscid = fsmap.mds_roles.at(mds_gid);
1757 }
1758 }
1759 } else {
1760 // You're a client. Did you request a particular
1761 // namespace?
1762 if (sub->type.compare(0, 7, "mdsmap.") == 0) {
1763 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1764 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1765 std::string err;
1766 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1767 if (!err.empty()) {
1768 // Client asked for a non-existent namespace, send them nothing
1769 dout(1) << "Invalid client subscription '" << sub->type
1770 << "'" << dendl;
1771 return;
1772 }
1773 } else {
1774 // Unqualified request for "mdsmap": give it the one marked
1775 // for use by legacy clients.
1776 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1777 fscid = fsmap.legacy_client_fscid;
1778 } else {
1779 dout(1) << "Client subscribed for legacy filesystem but "
1780 "none is configured" << dendl;
1781 return;
1782 }
1783 }
1784 if (!fsmap.filesystem_exists(fscid)) {
1785 // Client asked for a non-existent namespace, send them nothing
1786 // TODO: something more graceful for when a client has a filesystem
1787 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1788 // flag to MMDSMap?
1789 dout(1) << "Client subscribed to non-existent namespace '" <<
1790 fscid << "'" << dendl;
1791 return;
1792 }
1793 }
1794 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid=" << fscid << dendl;
1795
1796 // Work out the effective latest epoch
1797 const MDSMap *mds_map = nullptr;
1798 MDSMap null_map = MDSMap::create_null_mdsmap();
1799 if (fscid == FS_CLUSTER_ID_NONE) {
1800 // For a client, we should have already dropped out
1801 ceph_assert(is_mds);
1802
1803 auto it = fsmap.standby_daemons.find(mds_gid);
1804 if (it != fsmap.standby_daemons.end()) {
1805 // For an MDS, we need to feed it an MDSMap with its own state in
1806 null_map.mds_info[mds_gid] = it->second;
1807 null_map.epoch = fsmap.standby_epochs.at(mds_gid);
1808 } else {
1809 null_map.epoch = fsmap.epoch;
1810 }
1811 mds_map = &null_map;
1812 } else {
1813 // Check the effective epoch
1814 mds_map = &fsmap.get_filesystem(fscid)->mds_map;
1815 }
1816
1817 ceph_assert(mds_map != nullptr);
1818 dout(10) << __func__ << " selected MDS map epoch " <<
1819 mds_map->epoch << " for namespace " << fscid << " for subscriber "
1820 << sub->session->name << " who wants epoch " << sub->next << dendl;
1821
1822 if (sub->next > mds_map->epoch) {
1823 return;
1824 }
1825 auto msg = make_message<MMDSMap>(mon.monmap->fsid, *mds_map);
1826
1827 sub->session->con->send_message(msg.detach());
1828 if (sub->onetime) {
1829 mon.session_map.remove_sub(sub);
1830 } else {
1831 sub->next = mds_map->get_epoch() + 1;
1832 }
1833 }
1834 }
1835
1836
1837 void MDSMonitor::update_metadata(mds_gid_t gid,
1838 const map<string, string>& metadata)
1839 {
1840 if (metadata.empty()) {
1841 return;
1842 }
1843 pending_metadata[gid] = metadata;
1844
1845 MonitorDBStore::TransactionRef t = paxos.get_pending_transaction();
1846 bufferlist bl;
1847 encode(pending_metadata, bl);
1848 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1849 paxos.trigger_propose();
1850 }
1851
1852 void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
1853 {
1854 bool update = false;
1855 for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) {
1856 if (!fsmap.gid_exists(it->first)) {
1857 it = pending_metadata.erase(it);
1858 update = true;
1859 } else {
1860 ++it;
1861 }
1862 }
1863 if (!update)
1864 return;
1865 bufferlist bl;
1866 encode(pending_metadata, bl);
1867 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1868 }
1869
1870 int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1871 {
1872 bufferlist bl;
1873 int r = mon.store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1874 if (r) {
1875 dout(5) << "Unable to load 'last_metadata'" << dendl;
1876 return r;
1877 }
1878
1879 auto it = bl.cbegin();
1880 ceph::decode(m, it);
1881 return 0;
1882 }
1883
1884 void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out)
1885 {
1886 map<mds_gid_t,Metadata> meta;
1887 load_metadata(meta);
1888 for (auto& p : meta) {
1889 auto q = p.second.find(field);
1890 if (q == p.second.end()) {
1891 (*out)["unknown"]++;
1892 } else {
1893 (*out)[q->second]++;
1894 }
1895 }
1896 }
1897
1898 void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
1899 {
1900 map<string,int> by_val;
1901 count_metadata(field, &by_val);
1902 f->open_object_section(field.c_str());
1903 for (auto& p : by_val) {
1904 f->dump_int(p.first.c_str(), p.second);
1905 }
1906 f->close_section();
1907 }
1908
1909 void MDSMonitor::get_versions(std::map<string, list<string> > &versions)
1910 {
1911 map<mds_gid_t,Metadata> meta;
1912 load_metadata(meta);
1913 const auto &fsmap = get_fsmap();
1914 std::map<mds_gid_t, mds_info_t> map = fsmap.get_mds_info();
1915 dout(10) << __func__ << " mds meta=" << meta << dendl;
1916 for (auto& p : meta) {
1917 auto q = p.second.find("ceph_version_short");
1918 if (q == p.second.end()) continue;
1919 versions[q->second].push_back(string("mds.") + map[p.first].name);
1920 }
1921 }
1922
1923 int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
1924 Formatter *f, ostream& err)
1925 {
1926 ceph_assert(f);
1927
1928 mds_gid_t gid = gid_from_arg(fsmap, who, err);
1929 if (gid == MDS_GID_NONE) {
1930 return -EINVAL;
1931 }
1932
1933 map<mds_gid_t, Metadata> metadata;
1934 if (int r = load_metadata(metadata)) {
1935 err << "Unable to load 'last_metadata'";
1936 return r;
1937 }
1938
1939 if (!metadata.count(gid)) {
1940 return -ENOENT;
1941 }
1942 const Metadata& m = metadata[gid];
1943 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1944 f->dump_string(p->first.c_str(), p->second);
1945 }
1946 return 0;
1947 }
1948
1949 int MDSMonitor::print_nodes(Formatter *f)
1950 {
1951 ceph_assert(f);
1952
1953 const auto &fsmap = get_fsmap();
1954
1955 map<mds_gid_t, Metadata> metadata;
1956 if (int r = load_metadata(metadata)) {
1957 return r;
1958 }
1959
1960 map<string, list<string> > mdses; // hostname => mds
1961 for (const auto &p : metadata) {
1962 const mds_gid_t& gid = p.first;
1963 const Metadata& m = p.second;
1964 Metadata::const_iterator hostname = m.find("hostname");
1965 if (hostname == m.end()) {
1966 // not likely though
1967 continue;
1968 }
1969 if (!fsmap.gid_exists(gid)) {
1970 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1971 continue;
1972 }
1973 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
1974 mdses[hostname->second].push_back(mds_info.name);
1975 }
1976
1977 dump_services(f, mdses, "mds");
1978 return 0;
1979 }
1980
1981 /**
1982 * If a cluster is undersized (with respect to max_mds), then
1983 * attempt to find daemons to grow it. If the cluster is oversized
1984 * (with respect to max_mds) then shrink it by stopping its highest rank.
1985 */
1986 bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
1987 {
1988 auto &current_mds_map = get_fsmap().get_filesystem(fscid)->mds_map;
1989 auto&& fs = fsmap.get_filesystem(fscid);
1990 auto &mds_map = fs->mds_map;
1991
1992 int in = mds_map.get_num_in_mds();
1993 int max = mds_map.get_max_mds();
1994
1995 dout(20) << __func__ << " in " << in << " max " << max << dendl;
1996
1997 /* Check that both the current epoch mds_map is resizeable as well as the
1998 * current batch of changes in pending. This is important if an MDS is
1999 * becoming active in the next epoch.
2000 */
2001 if (!current_mds_map.is_resizeable() ||
2002 !mds_map.is_resizeable()) {
2003 dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
2004 return false;
2005 }
2006
2007 if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2008 mds_rank_t mds = mds_rank_t(0);
2009 while (mds_map.is_in(mds)) {
2010 mds++;
2011 }
2012 auto info = fsmap.find_replacement_for({fscid, mds});
2013 if (!info) {
2014 return false;
2015 }
2016
2017 dout(1) << "assigned standby " << info->addrs
2018 << " as mds." << mds << dendl;
2019 mon.clog->info() << info->human_name() << " assigned to "
2020 "filesystem " << mds_map.fs_name << " as rank "
2021 << mds << " (now has " << mds_map.get_num_in_mds() + 1
2022 << " ranks)";
2023 fsmap.promote(info->global_id, *fs, mds);
2024 return true;
2025 } else if (in > max) {
2026 mds_rank_t target = in - 1;
2027 const auto &info = mds_map.get_info(target);
2028 if (mds_map.is_active(target)) {
2029 dout(1) << "stopping " << target << dendl;
2030 mon.clog->info() << "stopping " << info.human_name();
2031 auto f = [](auto& info) {
2032 info.state = MDSMap::STATE_STOPPING;
2033 };
2034 fsmap.modify_daemon(info.global_id, f);
2035 return true;
2036 } else {
2037 dout(20) << "skipping stop of " << target << dendl;
2038 return false;
2039 }
2040 }
2041
2042 return false;
2043 }
2044
2045
2046 /**
2047 * Fail a daemon and replace it with a suitable standby.
2048 */
2049 bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose)
2050 {
2051 ceph_assert(osd_propose != nullptr);
2052
2053 const auto fscid = fsmap.mds_roles.at(gid);
2054 const auto& info = fsmap.get_info_gid(gid);
2055 const auto rank = info.rank;
2056 const auto state = info.state;
2057
2058 if (info.is_frozen()) {
2059 return false;
2060 } else if (state == MDSMap::STATE_STANDBY_REPLAY ||
2061 state == MDSMap::STATE_STANDBY) {
2062 dout(1) << " failing and removing standby " << gid << " " << info.addrs
2063 << " mds." << rank
2064 << "." << info.inc << " " << ceph_mds_state_name(state)
2065 << dendl;
2066 *osd_propose |= fail_mds_gid(fsmap, gid);
2067 return true;
2068 } else if (rank >= 0 && rep_info) {
2069 auto fs = fsmap.filesystems.at(fscid);
2070 if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2071 return false;
2072 }
2073 // are we in?
2074 // and is there a non-laggy standby that can take over for us?
2075 dout(1) << " replacing " << gid << " " << info.addrs
2076 << " mds." << rank << "." << info.inc
2077 << " " << ceph_mds_state_name(state)
2078 << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs
2079 << dendl;
2080
2081 mon.clog->warn() << "Replacing " << info.human_name()
2082 << " as rank " << rank
2083 << " with standby " << rep_info->human_name();
2084
2085 // Remove the old one
2086 *osd_propose |= fail_mds_gid(fsmap, gid);
2087
2088 // Promote the replacement
2089 fsmap.promote(rep_info->global_id, *fs, rank);
2090
2091 return true;
2092 }
2093 return false;
2094 }
2095
2096 bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap)
2097 {
2098 bool do_propose = false;
2099 const auto now = mono_clock::now();
2100 const bool osdmap_writeable = mon.osdmon()->is_writeable();
2101 const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace");
2102 const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
2103
2104 if (mono_clock::is_zero(last_tick)) {
2105 last_tick = now;
2106 }
2107
2108 {
2109 auto since_last = std::chrono::duration<double>(now-last_tick);
2110
2111 if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) {
2112 // This case handles either local slowness (calls being delayed
2113 // for whatever reason) or cluster election slowness (a long gap
2114 // between calls while an election happened)
2115 dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
2116 "(slow election?) of " << since_last.count() << " seconds" << dendl;
2117 for (auto& p : last_beacon) {
2118 p.second.stamp = now;
2119 }
2120 }
2121 }
2122
2123 // make sure last_beacon is fully populated
2124 for (auto& p : fsmap.mds_roles) {
2125 auto& gid = p.first;
2126 last_beacon.emplace(std::piecewise_construct,
2127 std::forward_as_tuple(gid),
2128 std::forward_as_tuple(now, 0));
2129 }
2130
2131 // We will only take decisive action (replacing/removing a daemon)
2132 // if we have some indication that some other daemon(s) are successfully
2133 // getting beacons through recently.
2134 mono_time latest_beacon = mono_clock::zero();
2135 for (const auto& p : last_beacon) {
2136 latest_beacon = std::max(p.second.stamp, latest_beacon);
2137 }
2138 auto since = std::chrono::duration<double>(now-latest_beacon);
2139 const bool may_replace = since.count() <
2140 std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5);
2141
2142 // check beacon timestamps
2143 std::vector<mds_gid_t> to_remove;
2144 const bool mon_down = mon.is_mon_down();
2145 const auto mds_beacon_mon_down_grace =
2146 g_conf().get_val<std::chrono::seconds>("mds_beacon_mon_down_grace");
2147 const auto quorum_age = std::chrono::seconds(mon.quorum_age());
2148 const bool new_quorum = quorum_age < mds_beacon_mon_down_grace;
2149 for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
2150 auto& [gid, beacon_info] = *it;
2151 auto since_last = std::chrono::duration<double>(now-beacon_info.stamp);
2152
2153 if (!fsmap.gid_exists(gid)) {
2154 // gid no longer exists, remove from tracked beacons
2155 it = last_beacon.erase(it);
2156 continue;
2157 }
2158
2159 if (since_last.count() >= g_conf()->mds_beacon_grace) {
2160 auto& info = fsmap.get_info_gid(gid);
2161 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2162 << " (gid: " << gid << " addr: " << info.addrs
2163 << " state: " << ceph_mds_state_name(info.state) << ")"
2164 << " since " << since_last.count() << dendl;
2165 if ((mon_down || new_quorum) && since_last < mds_beacon_mon_down_grace) {
2166 /* The MDS may be sending beacons to a monitor not yet in quorum or
2167 * temporarily partitioned. Hold off on removal for a little longer...
2168 */
2169 dout(10) << "deferring removal for mds_beacon_mon_down_grace during MON_DOWN" << dendl;
2170 ++it;
2171 continue;
2172 }
2173 // If the OSDMap is writeable, we can blocklist things, so we can
2174 // try failing any laggy MDS daemons. Consider each one for failure.
2175 if (!info.laggy()) {
2176 dout(1) << " marking " << gid << " " << info.addrs
2177 << " mds." << info.rank << "." << info.inc
2178 << " " << ceph_mds_state_name(info.state)
2179 << " laggy" << dendl;
2180 fsmap.modify_daemon(info.global_id, [](auto& info) {
2181 info.laggy_since = ceph_clock_now();
2182 });
2183 do_propose = true;
2184 }
2185 if (osdmap_writeable && may_replace) {
2186 to_remove.push_back(gid); // drop_mds may invalidate iterator
2187 }
2188 }
2189
2190 ++it;
2191 }
2192
2193 for (const auto& gid : to_remove) {
2194 auto info = fsmap.get_info_gid(gid);
2195 const mds_info_t* rep_info = nullptr;
2196 if (info.rank >= 0) {
2197 auto fscid = fsmap.fscid_from_gid(gid);
2198 rep_info = fsmap.find_replacement_for({fscid, info.rank});
2199 }
2200 bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap);
2201 if (dropped) {
2202 mon.clog->info() << "MDS " << info.human_name()
2203 << " is removed because it is dead or otherwise unavailable.";
2204 do_propose = true;
2205 }
2206 }
2207
2208 if (osdmap_writeable) {
2209 for (auto& [fscid, fs] : fsmap.filesystems) {
2210 if (!fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
2211 fs->mds_map.is_resizeable()) {
2212 // Check if a rank or standby-replay should be replaced with a stronger
2213 // affinity standby. This looks at ranks and standby-replay:
2214 for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
2215 const auto join_fscid = info.join_fscid;
2216 if (join_fscid == fscid)
2217 continue;
2218 const auto rank = info.rank;
2219 const auto state = info.state;
2220 const mds_info_t* rep_info = nullptr;
2221 if (state == MDSMap::STATE_STANDBY_REPLAY) {
2222 rep_info = fsmap.get_available_standby(*fs);
2223 } else if (state == MDSMap::STATE_ACTIVE) {
2224 rep_info = fsmap.find_replacement_for({fscid, rank});
2225 } else {
2226 /* N.B. !is_degraded() */
2227 ceph_abort_msg("invalid state in MDSMap");
2228 }
2229 if (!rep_info) {
2230 break;
2231 }
2232 bool better_affinity = false;
2233 if (join_fscid == FS_CLUSTER_ID_NONE) {
2234 better_affinity = (rep_info->join_fscid == fscid);
2235 } else {
2236 better_affinity = (rep_info->join_fscid == fscid) ||
2237 (rep_info->join_fscid == FS_CLUSTER_ID_NONE);
2238 }
2239 if (better_affinity) {
2240 if (state == MDSMap::STATE_STANDBY_REPLAY) {
2241 mon.clog->info() << "Dropping low affinity standby-replay "
2242 << info.human_name()
2243 << " in favor of higher affinity standby.";
2244 *propose_osdmap |= fail_mds_gid(fsmap, gid);
2245 /* Now let maybe_promote_standby do the promotion. */
2246 } else {
2247 mon.clog->info() << "Dropping low affinity active "
2248 << info.human_name()
2249 << " in favor of higher affinity standby.";
2250 do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap);
2251 }
2252 break; /* don't replace more than one per tick per fs */
2253 }
2254 }
2255 }
2256 }
2257 }
2258 return do_propose;
2259 }
2260
2261 bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
2262 {
2263 if (fs.mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2264 return false;
2265 }
2266
2267 bool do_propose = false;
2268
2269 // have a standby take over?
2270 set<mds_rank_t> failed;
2271 fs.mds_map.get_failed_mds_set(failed);
2272 for (const auto& rank : failed) {
2273 auto info = fsmap.find_replacement_for({fs.fscid, rank});
2274 if (info) {
2275 dout(1) << " taking over failed mds." << rank << " with " << info->global_id
2276 << "/" << info->name << " " << info->addrs << dendl;
2277 mon.clog->info() << "Standby " << info->human_name()
2278 << " assigned to filesystem " << fs.mds_map.fs_name
2279 << " as rank " << rank;
2280
2281 fsmap.promote(info->global_id, fs, rank);
2282 do_propose = true;
2283 }
2284 }
2285
2286 if (fs.mds_map.is_resizeable() && fs.mds_map.allows_standby_replay()) {
2287 // There were no failures to replace, so try using any available standbys
2288 // as standby-replay daemons. Don't do this when the cluster is degraded
2289 // as a standby-replay daemon may try to read a journal being migrated.
2290 for (;;) {
2291 auto info = fsmap.get_available_standby(fs);
2292 if (!info) break;
2293 dout(20) << "standby available mds." << info->global_id << dendl;
2294 bool changed = false;
2295 for (const auto& rank : fs.mds_map.in) {
2296 dout(20) << "examining " << rank << dendl;
2297 if (fs.mds_map.is_followable(rank)) {
2298 dout(1) << " setting mds." << info->global_id
2299 << " to follow mds rank " << rank << dendl;
2300 fsmap.assign_standby_replay(info->global_id, fs.fscid, rank);
2301 do_propose = true;
2302 changed = true;
2303 break;
2304 }
2305 }
2306 if (!changed) break;
2307 }
2308 }
2309
2310 return do_propose;
2311 }
2312
2313 void MDSMonitor::tick()
2314 {
2315 if (!is_active() || !is_leader()) return;
2316
2317 auto &pending = get_pending_fsmap_writeable();
2318
2319 bool do_propose = false;
2320 bool propose_osdmap = false;
2321
2322 if (check_fsmap_struct_version) {
2323 /* Allow time for trimming otherwise PaxosService::is_writeable will always
2324 * be false.
2325 */
2326
2327 auto now = clock::now();
2328 auto elapsed = now - last_fsmap_struct_flush;
2329 if (elapsed > std::chrono::seconds(30)) {
2330 FSMap fsmap;
2331 bufferlist bl;
2332 auto v = get_first_committed();
2333 int err = get_version(v, bl);
2334 if (err) {
2335 derr << "could not get version " << v << dendl;
2336 ceph_abort();
2337 }
2338 try {
2339 fsmap.decode(bl);
2340 } catch (const ceph::buffer::malformed_input& e) {
2341 dout(5) << "flushing old fsmap struct because unable to decode FSMap: " << e.what() << dendl;
2342 }
2343 /* N.B. FSMap::is_struct_old is also true for undecoded (failed to decode) FSMap */
2344 if (fsmap.is_struct_old()) {
2345 dout(5) << "fsmap struct is too old; proposing to flush out old versions" << dendl;
2346 do_propose = true;
2347 last_fsmap_struct_flush = now;
2348 } else {
2349 dout(20) << "struct is recent" << dendl;
2350 check_fsmap_struct_version = false;
2351 }
2352 }
2353 }
2354
2355 do_propose |= pending.check_health();
2356
2357 /* Check health and affinity of ranks */
2358 do_propose |= check_health(pending, &propose_osdmap);
2359
2360 /* Resize the cluster according to max_mds. */
2361 for (auto& p : pending.filesystems) {
2362 do_propose |= maybe_resize_cluster(pending, p.second->fscid);
2363 }
2364
2365 /* Replace any failed ranks. */
2366 for (auto& p : pending.filesystems) {
2367 do_propose |= maybe_promote_standby(pending, *p.second);
2368 }
2369
2370 if (propose_osdmap) {
2371 request_proposal(mon.osdmon());
2372 }
2373
2374 if (do_propose) {
2375 propose_pending();
2376 }
2377
2378 last_tick = mono_clock::now();
2379 }
2380
2381 MDSMonitor::MDSMonitor(Monitor &mn, Paxos &p, string service_name)
2382 : PaxosService(mn, p, service_name)
2383 {
2384 handlers = FileSystemCommandHandler::load(&p);
2385 }
2386
2387 void MDSMonitor::on_restart()
2388 {
2389 // Clear out the leader-specific state.
2390 last_tick = mono_clock::now();
2391 last_beacon.clear();
2392 }
2393