]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MDSMonitor.cc
update ceph source to reef 18.2.0
[ceph.git] / ceph / src / mon / MDSMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <regex>
16 #include <sstream>
17 #include <boost/utility.hpp>
18
19 #include "MDSMonitor.h"
20 #include "FSCommands.h"
21 #include "Monitor.h"
22 #include "MonitorDBStore.h"
23 #include "OSDMonitor.h"
24
25 #include "common/strtol.h"
26 #include "common/perf_counters.h"
27 #include "common/config.h"
28 #include "common/cmdparse.h"
29 #include "messages/MMDSMap.h"
30 #include "messages/MFSMap.h"
31 #include "messages/MFSMapUser.h"
32 #include "messages/MMDSLoadTargets.h"
33 #include "messages/MMonCommand.h"
34 #include "messages/MGenericMessage.h"
35
36 #include "include/ceph_assert.h"
37 #include "include/str_list.h"
38 #include "include/stringify.h"
39 #include "mds/mdstypes.h"
40 #include "Session.h"
41
42 using namespace TOPNSPC::common;
43
44 using std::dec;
45 using std::hex;
46 using std::list;
47 using std::map;
48 using std::make_pair;
49 using std::ostream;
50 using std::ostringstream;
51 using std::pair;
52 using std::set;
53 using std::string;
54 using std::string_view;
55 using std::stringstream;
56 using std::to_string;
57 using std::vector;
58
59 using ceph::bufferlist;
60 using ceph::decode;
61 using ceph::encode;
62 using ceph::ErasureCodeInterfaceRef;
63 using ceph::ErasureCodeProfile;
64 using ceph::Formatter;
65 using ceph::JSONFormatter;
66 using ceph::make_message;
67 using ceph::mono_clock;
68 using ceph::mono_time;
69
70 #define dout_subsys ceph_subsys_mon
71 #undef dout_prefix
72 #define dout_prefix _prefix(_dout, mon, get_fsmap())
73 static ostream& _prefix(std::ostream *_dout, Monitor &mon, const FSMap& fsmap) {
74 return *_dout << "mon." << mon.name << "@" << mon.rank
75 << "(" << mon.get_state_name()
76 << ").mds e" << fsmap.get_epoch() << " ";
77 }
78
79 static const string MDS_METADATA_PREFIX("mds_metadata");
80 static const string MDS_HEALTH_PREFIX("mds_health");
81
82
83 /*
84 * Specialized implementation of cmd_getval to allow us to parse
85 * out strongly-typedef'd types
86 */
87 namespace TOPNSPC::common {
88 template<> bool cmd_getval(const cmdmap_t& cmdmap,
89 std::string_view k, mds_gid_t &val)
90 {
91 return cmd_getval(cmdmap, k, (int64_t&)val);
92 }
93
94 template<> bool cmd_getval(const cmdmap_t& cmdmap,
95 std::string_view k, mds_rank_t &val)
96 {
97 return cmd_getval(cmdmap, k, (int64_t&)val);
98 }
99
100 template<> bool cmd_getval(const cmdmap_t& cmdmap,
101 std::string_view k, MDSMap::DaemonState &val)
102 {
103 return cmd_getval(cmdmap, k, (int64_t&)val);
104 }
105 }
106 // my methods
107
108 template <int dblV>
109 void MDSMonitor::print_map(const FSMap& m)
110 {
111 dout(dblV) << "print_map\n";
112 m.print(*_dout);
113 *_dout << dendl;
114 }
115
116 // service methods
117 void MDSMonitor::create_initial()
118 {
119 dout(10) << "create_initial" << dendl;
120 }
121
122 void MDSMonitor::get_store_prefixes(std::set<string>& s) const
123 {
124 s.insert(service_name);
125 s.insert(MDS_METADATA_PREFIX);
126 s.insert(MDS_HEALTH_PREFIX);
127 }
128
129 void MDSMonitor::update_from_paxos(bool *need_bootstrap)
130 {
131 version_t version = get_last_committed();
132 if (version == get_fsmap().epoch)
133 return;
134
135 dout(10) << __func__ << " version " << version
136 << ", my e " << get_fsmap().epoch << dendl;
137 ceph_assert(version > get_fsmap().epoch);
138
139 load_health();
140
141 // read and decode
142 bufferlist fsmap_bl;
143 fsmap_bl.clear();
144 int err = get_version(version, fsmap_bl);
145 ceph_assert(err == 0);
146
147 ceph_assert(fsmap_bl.length() > 0);
148 dout(10) << __func__ << " got " << version << dendl;
149 try {
150 PaxosFSMap::decode(fsmap_bl);
151 } catch (const ceph::buffer::malformed_input& e) {
152 derr << "unable to decode FSMap: " << e.what() << dendl;
153 throw;
154 }
155
156 // new map
157 dout(0) << "new map" << dendl;
158 print_map<0>(get_fsmap());
159 if (!g_conf()->mon_mds_skip_sanity) {
160 get_fsmap().sanity();
161 }
162
163 check_subs();
164 }
165
166 void MDSMonitor::init()
167 {
168 (void)load_metadata(pending_metadata);
169 }
170
171 void MDSMonitor::create_pending()
172 {
173 auto &fsmap = PaxosFSMap::create_pending();
174
175 if (mon.osdmon()->is_readable()) {
176 const auto &osdmap = mon.osdmon()->osdmap;
177 fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
178 }
179
180 dout(10) << "create_pending e" << fsmap.epoch << dendl;
181 }
182
183 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
184 {
185 auto &pending = get_pending_fsmap_writeable();
186 auto &epoch = pending.epoch;
187
188 dout(10) << "encode_pending e" << epoch << dendl;
189
190 // print map iff 'debug mon = 30' or higher
191 print_map<30>(pending);
192 if (!g_conf()->mon_mds_skip_sanity) {
193 pending.sanity(true);
194 }
195
196 // Set 'modified' on maps modified this epoch
197 for (auto &p : pending.filesystems) {
198 if (p.second->mds_map.epoch == epoch) {
199 p.second->mds_map.modified = ceph_clock_now();
200 }
201 }
202
203 // apply to paxos
204 ceph_assert(get_last_committed() + 1 == pending.epoch);
205 bufferlist pending_bl;
206 pending.encode(pending_bl, mon.get_quorum_con_features());
207
208 /* put everything in the transaction */
209 put_version(t, pending.epoch, pending_bl);
210 put_last_committed(t, pending.epoch);
211
212 // Encode MDSHealth data
213 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
214 i != pending_daemon_health.end(); ++i) {
215 bufferlist bl;
216 i->second.encode(bl);
217 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
218 }
219
220 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
221 i != pending_daemon_health_rm.end(); ++i) {
222 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
223 }
224 pending_daemon_health_rm.clear();
225 remove_from_metadata(pending, t);
226
227 // health
228 health_check_map_t new_checks;
229 const auto &info_map = pending.get_mds_info();
230 for (const auto &i : info_map) {
231 const auto &gid = i.first;
232 const auto &info = i.second;
233 if (pending_daemon_health_rm.count(gid)) {
234 continue;
235 }
236 MDSHealth health;
237 auto p = pending_daemon_health.find(gid);
238 if (p != pending_daemon_health.end()) {
239 health = p->second;
240 } else {
241 bufferlist bl;
242 mon.store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
243 if (!bl.length()) {
244 derr << "Missing health data for MDS " << gid << dendl;
245 continue;
246 }
247 auto bl_i = bl.cbegin();
248 health.decode(bl_i);
249 }
250 for (const auto &metric : health.metrics) {
251 if (metric.type == MDS_HEALTH_DUMMY) {
252 continue;
253 }
254 const auto rank = info.rank;
255 health_check_t *check = &new_checks.get_or_add(
256 mds_metric_name(metric.type),
257 metric.sev,
258 mds_metric_summary(metric.type),
259 1);
260 ostringstream ss;
261 ss << "mds." << info.name << "(mds." << rank << "): " << metric.message;
262 bool first = true;
263 for (auto &p : metric.metadata) {
264 if (first) {
265 ss << " ";
266 } else {
267 ss << ", ";
268 }
269 ss << p.first << ": " << p.second;
270 first = false;
271 }
272 check->detail.push_back(ss.str());
273 }
274 }
275 pending.get_health_checks(&new_checks);
276 for (auto& p : new_checks.checks) {
277 p.second.summary = std::regex_replace(
278 p.second.summary,
279 std::regex("%num%"),
280 stringify(p.second.detail.size()));
281 p.second.summary = std::regex_replace(
282 p.second.summary,
283 std::regex("%plurals%"),
284 p.second.detail.size() > 1 ? "s" : "");
285 p.second.summary = std::regex_replace(
286 p.second.summary,
287 std::regex("%isorare%"),
288 p.second.detail.size() > 1 ? "are" : "is");
289 p.second.summary = std::regex_replace(
290 p.second.summary,
291 std::regex("%hasorhave%"),
292 p.second.detail.size() > 1 ? "have" : "has");
293 }
294 encode_health(new_checks, t);
295 }
296
297 version_t MDSMonitor::get_trim_to() const
298 {
299 version_t floor = 0;
300 if (g_conf()->mon_mds_force_trim_to > 0 &&
301 g_conf()->mon_mds_force_trim_to <= (int)get_last_committed()) {
302 floor = g_conf()->mon_mds_force_trim_to;
303 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
304 << floor << dendl;
305 }
306
307 unsigned max = g_conf()->mon_max_mdsmap_epochs;
308 version_t last = get_last_committed();
309
310 if (last - get_first_committed() > max && floor < last - max) {
311 floor = last-max;
312 }
313
314 dout(20) << __func__ << " = " << floor << dendl;
315 return floor;
316 }
317
318 bool MDSMonitor::preprocess_query(MonOpRequestRef op)
319 {
320 op->mark_mdsmon_event(__func__);
321 auto m = op->get_req<PaxosServiceMessage>();
322 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source()
323 << " " << m->get_orig_source_addrs() << dendl;
324
325 switch (m->get_type()) {
326
327 case MSG_MDS_BEACON:
328 return preprocess_beacon(op);
329
330 case MSG_MON_COMMAND:
331 try {
332 return preprocess_command(op);
333 } catch (const bad_cmd_get& e) {
334 bufferlist bl;
335 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
336 return true;
337 }
338
339 case MSG_MDS_OFFLOAD_TARGETS:
340 return preprocess_offload_targets(op);
341
342 default:
343 ceph_abort();
344 return true;
345 }
346 }
347
348 void MDSMonitor::_note_beacon(MMDSBeacon *m)
349 {
350 mds_gid_t gid = mds_gid_t(m->get_global_id());
351 version_t seq = m->get_seq();
352
353 dout(5) << "_note_beacon " << *m << " noting time" << dendl;
354 auto &beacon = last_beacon[gid];
355 beacon.stamp = mono_clock::now();
356 beacon.seq = seq;
357 }
358
359 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
360 {
361 op->mark_mdsmon_event(__func__);
362 auto m = op->get_req<MMDSBeacon>();
363 MDSMap::DaemonState state = m->get_state();
364 mds_gid_t gid = m->get_global_id();
365 version_t seq = m->get_seq();
366 MDSMap::mds_info_t info;
367 epoch_t effective_epoch = 0;
368
369 const auto &fsmap = get_fsmap();
370
371 // check privileges, ignore if fails
372 MonSession *session = op->get_session();
373 if (!session)
374 goto ignore;
375 if (!session->is_capable("mds", MON_CAP_X)) {
376 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
377 << session->caps << dendl;
378 goto ignore;
379 }
380
381 if (m->get_fsid() != mon.monmap->fsid) {
382 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon.monmap->fsid << dendl;
383 goto ignore;
384 }
385
386 dout(5) << "preprocess_beacon " << *m
387 << " from " << m->get_orig_source()
388 << " " << m->get_orig_source_addrs()
389 << " " << m->get_compat()
390 << dendl;
391
392 // make sure the address has a port
393 if (m->get_orig_source_addr().get_port() == 0) {
394 dout(1) << " ignoring boot message without a port" << dendl;
395 goto ignore;
396 }
397
398 // fw to leader?
399 if (!is_leader())
400 return false;
401
402 // booted, but not in map?
403 if (!fsmap.gid_exists(gid)) {
404 if (state != MDSMap::STATE_BOOT) {
405 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
406 << ceph_mds_state_name(state) << ")" << dendl;
407
408 /* We can't send an MDSMap this MDS was a part of because we no longer
409 * know which FS it was part of. Nor does this matter. Sending an empty
410 * MDSMap is sufficient for getting the MDS to respawn.
411 */
412 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
413 mon.send_reply(op, m.detach());
414 return true;
415 } else {
416 /* check if we've already recorded its entry in pending */
417 const auto& pending = get_pending_fsmap();
418 if (pending.gid_exists(gid)) {
419 /* MDS is already booted. */
420 goto ignore;
421 } else {
422 return false; // not booted yet.
423 }
424 }
425 }
426 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
427 info = fsmap.get_info_gid(gid);
428
429 if (state == MDSMap::STATE_DNE) {
430 return false;
431 }
432
433 // old seq?
434 if (info.state_seq > seq) {
435 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
436 goto ignore;
437 }
438
439 // Work out the latest epoch that this daemon should have seen
440 {
441 fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
442 if (fscid == FS_CLUSTER_ID_NONE) {
443 effective_epoch = fsmap.standby_epochs.at(gid);
444 } else {
445 effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
446 }
447 if (effective_epoch != m->get_last_epoch_seen()) {
448 dout(10) << "mds_beacon " << *m
449 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
450 goto reply;
451 }
452 }
453
454 if (info.laggy()) {
455 _note_beacon(m);
456 return false; // no longer laggy, need to update map.
457 }
458 if (state == MDSMap::STATE_BOOT) {
459 // ignore, already booted.
460 goto ignore;
461 }
462
463 // did the join_fscid change
464 if (m->get_fs().size()) {
465 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
466 auto f = fsmap.get_filesystem(m->get_fs());
467 if (f) {
468 fscid = f->fscid;
469 }
470 if (info.join_fscid != fscid) {
471 dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
472 << " (" << m->get_fs() << ")" << dendl;
473 _note_beacon(m);
474 return false;
475 }
476 } else {
477 if (info.join_fscid != FS_CLUSTER_ID_NONE) {
478 dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
479 _note_beacon(m);
480 return false;
481 }
482 }
483
484 // is there a state change here?
485 if (info.state != state) {
486 _note_beacon(m);
487 return false;
488 }
489
490 // Comparing known daemon health with m->get_health()
491 // and return false (i.e. require proposal) if they
492 // do not match, to update our stored
493 if (!(pending_daemon_health[gid] == m->get_health())) {
494 dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
495 _note_beacon(m);
496 return false;
497 }
498
499 reply:
500 // note time and reply
501 ceph_assert(effective_epoch > 0);
502 _note_beacon(m);
503 {
504 auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
505 m->get_global_id(), m->get_name(), effective_epoch,
506 state, seq, CEPH_FEATURES_SUPPORTED_DEFAULT);
507 mon.send_reply(op, beacon.detach());
508 }
509 return true;
510
511 ignore:
512 // I won't reply this beacon, drop it.
513 mon.no_reply(op);
514 return true;
515 }
516
517 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
518 {
519 op->mark_mdsmon_event(__func__);
520 auto m = op->get_req<MMDSLoadTargets>();
521 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
522
523 const auto &fsmap = get_fsmap();
524
525 // check privileges, ignore message if fails
526 MonSession *session = op->get_session();
527 if (!session)
528 goto ignore;
529 if (!session->is_capable("mds", MON_CAP_X)) {
530 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
531 << session->caps << dendl;
532 goto ignore;
533 }
534
535 if (fsmap.gid_exists(m->global_id) &&
536 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
537 goto ignore;
538
539 return false;
540
541 ignore:
542 mon.no_reply(op);
543 return true;
544 }
545
546
547 bool MDSMonitor::prepare_update(MonOpRequestRef op)
548 {
549 op->mark_mdsmon_event(__func__);
550 auto m = op->get_req<PaxosServiceMessage>();
551 dout(7) << "prepare_update " << *m << dendl;
552
553 switch (m->get_type()) {
554
555 case MSG_MDS_BEACON:
556 return prepare_beacon(op);
557
558 case MSG_MON_COMMAND:
559 try {
560 return prepare_command(op);
561 } catch (const bad_cmd_get& e) {
562 bufferlist bl;
563 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
564 return false; /* nothing to propose */
565 }
566
567 case MSG_MDS_OFFLOAD_TARGETS:
568 return prepare_offload_targets(op);
569
570 default:
571 ceph_abort();
572 }
573
574 return false; /* nothing to propose! */
575 }
576
577 bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
578 {
579 op->mark_mdsmon_event(__func__);
580 auto m = op->get_req<MMDSBeacon>();
581 // -- this is an update --
582 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source()
583 << " " << m->get_orig_source_addrs() << dendl;
584 entity_addrvec_t addrs = m->get_orig_source_addrs();
585 mds_gid_t gid = m->get_global_id();
586 MDSMap::DaemonState state = m->get_state();
587 version_t seq = m->get_seq();
588
589 auto &pending = get_pending_fsmap_writeable();
590
591 dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
592
593 // Calculate deltas of health metrics created and removed
594 // Do this by type rather than MDSHealthMetric equality, because messages can
595 // change a lot when they include e.g. a number of items.
596 const auto &old_health = pending_daemon_health[gid].metrics;
597 const auto &new_health = m->get_health().metrics;
598
599 std::set<mds_metric_t> old_types;
600 for (const auto &i : old_health) {
601 old_types.insert(i.type);
602 }
603
604 std::set<mds_metric_t> new_types;
605 for (const auto &i : new_health) {
606 if (i.type == MDS_HEALTH_DUMMY) {
607 continue;
608 }
609 new_types.insert(i.type);
610 }
611
612 for (const auto &new_metric: new_health) {
613 if (new_metric.type == MDS_HEALTH_DUMMY) {
614 continue;
615 }
616 if (old_types.count(new_metric.type) == 0) {
617 dout(10) << "MDS health message (" << m->get_orig_source()
618 << "): " << new_metric.sev << " " << new_metric.message << dendl;
619 }
620 }
621
622 // Log the disappearance of health messages at INFO
623 for (const auto &old_metric : old_health) {
624 if (new_types.count(old_metric.type) == 0) {
625 mon.clog->info() << "MDS health message cleared ("
626 << m->get_orig_source() << "): " << old_metric.message;
627 }
628 }
629
630 // Store health
631 pending_daemon_health[gid] = m->get_health();
632
633 const auto& cs = m->get_compat();
634 if (state == MDSMap::STATE_BOOT) {
635 // zap previous instance of this name?
636 if (g_conf()->mds_enforce_unique_name) {
637 bool failed_mds = false;
638 while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
639 if (!mon.osdmon()->is_writeable()) {
640 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
641 return false;
642 }
643 const auto& existing_info = pending.get_info_gid(existing);
644 mon.clog->info() << existing_info.human_name() << " restarted";
645 fail_mds_gid(pending, existing);
646 failed_mds = true;
647 }
648 if (failed_mds) {
649 ceph_assert(mon.osdmon()->is_writeable());
650 request_proposal(mon.osdmon());
651 }
652 }
653
654 // Add this daemon to the map
655 if (pending.mds_roles.count(gid) == 0) {
656 MDSMap::mds_info_t new_info;
657 new_info.global_id = gid;
658 new_info.name = m->get_name();
659 new_info.addrs = addrs;
660 new_info.mds_features = m->get_mds_features();
661 new_info.state = MDSMap::STATE_STANDBY;
662 new_info.state_seq = seq;
663 new_info.compat = cs;
664 if (m->get_fs().size()) {
665 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
666 auto f = pending.get_filesystem(m->get_fs());
667 if (f) {
668 fscid = f->fscid;
669 }
670 new_info.join_fscid = fscid;
671 }
672 pending.insert(new_info);
673 }
674
675 // initialize the beacon timer
676 auto &beacon = last_beacon[gid];
677 beacon.stamp = mono_clock::now();
678 beacon.seq = seq;
679
680 update_metadata(m->get_global_id(), m->get_sys_info());
681 } else {
682 // state update
683
684 if (!pending.gid_exists(gid)) {
685 /* gid has been removed from pending, send null map */
686 dout(5) << "mds_beacon " << *m << " is not in fsmap (state "
687 << ceph_mds_state_name(state) << ")" << dendl;
688
689 /* We can't send an MDSMap this MDS was a part of because we no longer
690 * know which FS it was part of. Nor does this matter. Sending an empty
691 * MDSMap is sufficient for getting the MDS to respawn.
692 */
693 goto null;
694 }
695
696 const auto& info = pending.get_info_gid(gid);
697
698 // did the reported compat change? That's illegal!
699 if (cs.compare(info.compat) != 0) {
700 if (!mon.osdmon()->is_writeable()) {
701 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
702 return false;
703 }
704 mon.clog->warn() << info.human_name() << " compat changed unexpectedly";
705 fail_mds_gid(pending, gid);
706 request_proposal(mon.osdmon());
707 return true;
708 }
709
710 if (state == MDSMap::STATE_DNE) {
711 dout(1) << __func__ << ": DNE from " << info << dendl;
712 goto evict;
713 }
714
715 // legal state change?
716 if ((info.state == MDSMap::STATE_STANDBY && state != info.state) ||
717 (info.state == MDSMap::STATE_STANDBY_REPLAY && state != info.state && state != MDSMap::STATE_DAMAGED)) {
718 // Standby daemons should never modify their own state.
719 // Except that standby-replay can indicate the rank is damaged due to failure to replay.
720 // Reject any attempts to do so.
721 derr << "standby " << gid << " attempted to change state to "
722 << ceph_mds_state_name(state) << ", rejecting" << dendl;
723 goto evict;
724 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
725 !MDSMap::state_transition_valid(info.state, state)) {
726 // Validate state transitions for daemons that hold a rank
727 derr << "daemon " << gid << " (rank " << info.rank << ") "
728 << "reported invalid state transition "
729 << ceph_mds_state_name(info.state) << " -> "
730 << ceph_mds_state_name(state) << dendl;
731 goto evict;
732 }
733
734 if (info.laggy()) {
735 dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl;
736 pending.modify_daemon(info.global_id, [](auto& info)
737 {
738 info.clear_laggy();
739 }
740 );
741 }
742
743 dout(5) << "prepare_beacon mds." << info.rank
744 << " " << ceph_mds_state_name(info.state)
745 << " -> " << ceph_mds_state_name(state)
746 << dendl;
747
748 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
749 if (m->get_fs().size()) {
750 auto f = pending.get_filesystem(m->get_fs());
751 if (f) {
752 fscid = f->fscid;
753 }
754 }
755 pending.modify_daemon(gid, [fscid](auto& info) {
756 info.join_fscid = fscid;
757 });
758
759 if (state == MDSMap::STATE_STOPPED) {
760 const auto fscid = pending.mds_roles.at(gid);
761 const auto &fs = pending.get_filesystem(fscid);
762
763 mon.clog->info() << info.human_name() << " finished "
764 << "stopping rank " << info.rank << " in filesystem "
765 << fs->mds_map.fs_name << " (now has "
766 << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
767
768 auto erased = pending.stop(gid);
769 erased.push_back(gid);
770
771 for (const auto& erased_gid : erased) {
772 last_beacon.erase(erased_gid);
773 if (pending_daemon_health.count(erased_gid)) {
774 pending_daemon_health.erase(erased_gid);
775 pending_daemon_health_rm.insert(erased_gid);
776 }
777 }
778 } else if (state == MDSMap::STATE_DAMAGED) {
779 if (!mon.osdmon()->is_writeable()) {
780 dout(1) << __func__ << ": DAMAGED from rank " << info.rank
781 << " waiting for osdmon writeable to blocklist it" << dendl;
782 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
783 return false;
784 }
785
786 auto rank = info.rank;
787
788 // Record this MDS rank as damaged, so that other daemons
789 // won't try to run it.
790 dout(0) << __func__ << ": marking rank " << rank << " damaged" << dendl;
791
792 auto fs = pending.get_filesystem(gid);
793 auto rankgid = fs->mds_map.get_gid(rank);
794 auto rankinfo = pending.get_info_gid(rankgid);
795 auto followergid = fs->mds_map.get_standby_replay(rank);
796
797 ceph_assert(gid == rankgid || gid == followergid);
798
799 utime_t until = ceph_clock_now();
800 until += g_conf().get_val<double>("mon_mds_blocklist_interval");
801 const auto blocklist_epoch = mon.osdmon()->blocklist(rankinfo.addrs, until);
802 if (followergid != MDS_GID_NONE) {
803 fail_mds_gid(pending, followergid);
804 last_beacon.erase(followergid);
805 }
806 request_proposal(mon.osdmon());
807 pending.damaged(rankgid, blocklist_epoch);
808 last_beacon.erase(rankgid);
809
810 /* MDS expects beacon reply back */
811 } else {
812 if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
813 const auto &fscid = pending.mds_roles.at(gid);
814 const auto &fs = pending.get_filesystem(fscid);
815 mon.clog->info() << info.human_name() << " is now active in "
816 << "filesystem " << fs->mds_map.fs_name << " as rank "
817 << info.rank;
818 }
819
820 // Made it through special cases and validations, record the
821 // daemon's reported state to the FSMap.
822 pending.modify_daemon(gid, [state, seq](auto& info) {
823 info.state = state;
824 info.state_seq = seq;
825 });
826 }
827 }
828
829 dout(5) << "prepare_beacon pending map now:" << dendl;
830 print_map(pending);
831
832 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
833 if (r >= 0)
834 _updated(op); // success
835 else if (r == -ECANCELED) {
836 mon.no_reply(op);
837 } else {
838 dispatch(op); // try again
839 }
840 }));
841
842 return true;
843
844 evict:
845 if (!mon.osdmon()->is_writeable()) {
846 dout(1) << __func__ << ": waiting for writeable OSDMap to evict" << dendl;
847 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
848 return false;
849 }
850
851 fail_mds_gid(pending, gid);
852 request_proposal(mon.osdmon());
853 dout(5) << __func__ << ": pending map now:" << dendl;
854 print_map(pending);
855
856 goto null;
857
858 null:
859 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
860 if (r >= 0) {
861 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
862 mon.send_reply(op, m.detach());
863 } else {
864 dispatch(op); // try again
865 }
866 }));
867
868 return true;
869 }
870
871 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
872 {
873 auto &pending = get_pending_fsmap_writeable();
874 bool propose = false;
875
876 op->mark_mdsmon_event(__func__);
877 auto m = op->get_req<MMDSLoadTargets>();
878 mds_gid_t gid = m->global_id;
879 if (pending.gid_has_rank(gid)) {
880 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
881 pending.update_export_targets(gid, m->targets);
882 propose = true;
883 } else {
884 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
885 }
886 mon.no_reply(op);
887 return propose;
888 }
889
890 bool MDSMonitor::should_propose(double& delay)
891 {
892 // delegate to PaxosService to assess whether we should propose
893 return PaxosService::should_propose(delay);
894 }
895
896 void MDSMonitor::_updated(MonOpRequestRef op)
897 {
898 const auto &fsmap = get_fsmap();
899 op->mark_mdsmon_event(__func__);
900 auto m = op->get_req<MMDSBeacon>();
901 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
902 mon.clog->debug() << m->get_orig_source() << " "
903 << m->get_orig_source_addrs() << " "
904 << ceph_mds_state_name(m->get_state());
905
906 if (m->get_state() == MDSMap::STATE_STOPPED) {
907 // send the map manually (they're out of the map, so they won't get it automatic)
908 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
909 mon.send_reply(op, m.detach());
910 } else {
911 auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
912 m->get_global_id(), m->get_name(), fsmap.get_epoch(),
913 m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT);
914 mon.send_reply(op, beacon.detach());
915 }
916 }
917
918 void MDSMonitor::on_active()
919 {
920 tick();
921
922 if (is_leader()) {
923 mon.clog->debug() << "fsmap " << get_fsmap();
924 }
925 }
926
927 void MDSMonitor::dump_info(Formatter *f)
928 {
929 f->open_object_section("fsmap");
930 get_fsmap().dump(f);
931 f->close_section();
932
933 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
934 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
935 }
936
937 bool MDSMonitor::preprocess_command(MonOpRequestRef op)
938 {
939 op->mark_mdsmon_event(__func__);
940 auto m = op->get_req<MMonCommand>();
941 int r = -1;
942 bufferlist rdata;
943 stringstream ss, ds;
944
945 cmdmap_t cmdmap;
946 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
947 // ss has reason for failure
948 string rs = ss.str();
949 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
950 return true;
951 }
952
953 string prefix;
954 cmd_getval(cmdmap, "prefix", prefix);
955 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
956 std::unique_ptr<Formatter> f(Formatter::create(format));
957
958 MonSession *session = op->get_session();
959 if (!session) {
960 mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
961 return true;
962 }
963
964 // to use const qualifier filter fsmap beforehand
965 FSMap _fsmap_copy = get_fsmap();
966 _fsmap_copy.filter(session->get_allowed_fs_names());
967 const auto& fsmap = _fsmap_copy;
968
969 if (prefix == "mds stat") {
970 if (f) {
971 f->open_object_section("mds_stat");
972 dump_info(f.get());
973 f->close_section();
974 f->flush(ds);
975 } else {
976 ds << fsmap;
977 }
978 r = 0;
979 } else if (prefix == "mds ok-to-stop") {
980 vector<string> ids;
981 if (!cmd_getval(cmdmap, "ids", ids)) {
982 r = -EINVAL;
983 ss << "must specify mds id";
984 goto out;
985 }
986 if (fsmap.is_any_degraded()) {
987 ss << "one or more filesystems is currently degraded";
988 r = -EBUSY;
989 goto out;
990 }
991 set<mds_gid_t> stopping;
992 for (auto& id : ids) {
993 ostringstream ess;
994 mds_gid_t gid = gid_from_arg(fsmap, id, ess);
995 if (gid == MDS_GID_NONE) {
996 // the mds doesn't exist, but no file systems are unhappy, so losing it
997 // can't have any effect.
998 continue;
999 }
1000 stopping.insert(gid);
1001 }
1002 set<mds_gid_t> active;
1003 set<mds_gid_t> standby;
1004 for (auto gid : stopping) {
1005 if (fsmap.gid_has_rank(gid)) {
1006 // ignore standby-replay daemons (at this level)
1007 if (!fsmap.is_standby_replay(gid)) {
1008 auto standby = fsmap.get_standby_replay(gid);
1009 if (standby == MDS_GID_NONE ||
1010 stopping.count(standby)) {
1011 // no standby-replay, or we're also stopping the standby-replay
1012 // for this mds
1013 active.insert(gid);
1014 }
1015 }
1016 } else {
1017 // net loss of a standby
1018 standby.insert(gid);
1019 }
1020 }
1021 if (fsmap.get_num_standby() - standby.size() < active.size()) {
1022 r = -EBUSY;
1023 ss << "insufficent standby MDS daemons to stop active gids "
1024 << stringify(active)
1025 << " and/or standby gids " << stringify(standby);;
1026 goto out;
1027 }
1028 r = 0;
1029 ss << "should be safe to stop " << ids;
1030 } else if (prefix == "fs dump") {
1031 int64_t epocharg;
1032 epoch_t epoch;
1033
1034 const FSMap *fsmapp = &fsmap;
1035 FSMap dummy;
1036 if (cmd_getval(cmdmap, "epoch", epocharg)) {
1037 epoch = epocharg;
1038 bufferlist b;
1039 int err = get_version(epoch, b);
1040 if (err == -ENOENT) {
1041 r = -ENOENT;
1042 goto out;
1043 } else {
1044 ceph_assert(err == 0);
1045 ceph_assert(b.length());
1046 dummy.decode(b);
1047 fsmapp = &dummy;
1048 }
1049 }
1050
1051 stringstream ds;
1052 if (f != NULL) {
1053 f->open_object_section("fsmap");
1054 fsmapp->dump(f.get());
1055 f->close_section();
1056 f->flush(ds);
1057 r = 0;
1058 } else {
1059 fsmapp->print(ds);
1060 r = 0;
1061 }
1062
1063 rdata.append(ds);
1064 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
1065 } else if (prefix == "mds metadata") {
1066 if (!f)
1067 f.reset(Formatter::create("json-pretty"));
1068
1069 string who;
1070 bool all = !cmd_getval(cmdmap, "who", who);
1071 dout(1) << "all = " << all << dendl;
1072 if (all) {
1073 r = 0;
1074 // Dump all MDSs' metadata
1075 const auto all_info = fsmap.get_mds_info();
1076
1077 f->open_array_section("mds_metadata");
1078 for(const auto &i : all_info) {
1079 const auto &info = i.second;
1080
1081 f->open_object_section("mds");
1082 f->dump_string("name", info.name);
1083 std::ostringstream get_err;
1084 r = dump_metadata(fsmap, info.name, f.get(), get_err);
1085 if (r == -EINVAL || r == -ENOENT) {
1086 // Drop error, list what metadata we do have
1087 dout(1) << get_err.str() << dendl;
1088 r = 0;
1089 } else if (r != 0) {
1090 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1091 << dendl;
1092 ss << get_err.str();
1093 f->close_section();
1094 break;
1095 }
1096 f->close_section();
1097 }
1098 f->close_section();
1099 } else {
1100 // Dump a single daemon's metadata
1101 f->open_object_section("mds_metadata");
1102 r = dump_metadata(fsmap, who, f.get(), ss);
1103 f->close_section();
1104 }
1105 f->flush(ds);
1106 } else if (prefix == "mds versions") {
1107 if (!f)
1108 f.reset(Formatter::create("json-pretty"));
1109 count_metadata("ceph_version", f.get());
1110 f->flush(ds);
1111 r = 0;
1112 } else if (prefix == "mds count-metadata") {
1113 if (!f)
1114 f.reset(Formatter::create("json-pretty"));
1115 string field;
1116 cmd_getval(cmdmap, "property", field);
1117 count_metadata(field, f.get());
1118 f->flush(ds);
1119 r = 0;
1120 } else if (prefix == "fs compat show") {
1121 string fs_name;
1122 cmd_getval(cmdmap, "fs_name", fs_name);
1123 const auto &fs = fsmap.get_filesystem(fs_name);
1124 if (fs == nullptr) {
1125 ss << "filesystem '" << fs_name << "' not found";
1126 r = -ENOENT;
1127 goto out;
1128 }
1129
1130 if (f) {
1131 f->open_object_section("mds_compat");
1132 fs->mds_map.compat.dump(f.get());
1133 f->close_section();
1134 f->flush(ds);
1135 } else {
1136 ds << fs->mds_map.compat;
1137 }
1138 r = 0;
1139 } else if (prefix == "mds compat show") {
1140 if (f) {
1141 f->open_object_section("mds_compat");
1142 fsmap.default_compat.dump(f.get());
1143 f->close_section();
1144 f->flush(ds);
1145 } else {
1146 ds << fsmap.default_compat;
1147 }
1148 r = 0;
1149 } else if (prefix == "fs get") {
1150 string fs_name;
1151 cmd_getval(cmdmap, "fs_name", fs_name);
1152 const auto &fs = fsmap.get_filesystem(fs_name);
1153 if (fs == nullptr) {
1154 ss << "filesystem '" << fs_name << "' not found";
1155 r = -ENOENT;
1156 } else {
1157 if (f != nullptr) {
1158 f->open_object_section("filesystem");
1159 fs->dump(f.get());
1160 f->close_section();
1161 f->flush(ds);
1162 r = 0;
1163 } else {
1164 fs->print(ds);
1165 r = 0;
1166 }
1167 }
1168 } else if (prefix == "fs ls") {
1169 if (f) {
1170 f->open_array_section("filesystems");
1171 for (const auto &p : fsmap.filesystems) {
1172 const auto &fs = p.second;
1173 f->open_object_section("filesystem");
1174 {
1175 const MDSMap &mds_map = fs->mds_map;
1176 f->dump_string("name", mds_map.fs_name);
1177 /* Output both the names and IDs of pools, for use by
1178 * humans and machines respectively */
1179 f->dump_string("metadata_pool", mon.osdmon()->osdmap.get_pool_name(
1180 mds_map.metadata_pool));
1181 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1182 f->open_array_section("data_pool_ids");
1183 for (const auto &id : mds_map.data_pools) {
1184 f->dump_int("data_pool_id", id);
1185 }
1186 f->close_section();
1187
1188 f->open_array_section("data_pools");
1189 for (const auto &id : mds_map.data_pools) {
1190 const auto &name = mon.osdmon()->osdmap.get_pool_name(id);
1191 f->dump_string("data_pool", name);
1192 }
1193 f->close_section();
1194 }
1195 f->close_section();
1196 }
1197 f->close_section();
1198 f->flush(ds);
1199 } else {
1200 for (const auto &p : fsmap.filesystems) {
1201 const auto &fs = p.second;
1202 const MDSMap &mds_map = fs->mds_map;
1203 const string &md_pool_name = mon.osdmon()->osdmap.get_pool_name(
1204 mds_map.metadata_pool);
1205
1206 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1207 << md_pool_name << ", data pools: [";
1208 for (const auto &id : mds_map.data_pools) {
1209 const string &pool_name = mon.osdmon()->osdmap.get_pool_name(id);
1210 ds << pool_name << " ";
1211 }
1212 ds << "]" << std::endl;
1213 }
1214
1215 if (fsmap.filesystems.empty()) {
1216 ds << "No filesystems enabled" << std::endl;
1217 }
1218 }
1219 r = 0;
1220 } else if (prefix == "fs feature ls") {
1221 if (f) {
1222 f->open_array_section("cephfs_features");
1223 for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
1224 f->open_object_section("feature");
1225 f->dump_int("index", i);
1226 f->dump_string("name", cephfs_feature_name(i));
1227 f->close_section();
1228 }
1229 f->close_section();
1230 f->flush(ds);
1231 } else {
1232 for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
1233 ds << i << " " << cephfs_feature_name(i) << std::endl;
1234 }
1235 }
1236 r = 0;
1237 } else if (prefix == "fs lsflags") {
1238 string fs_name;
1239 cmd_getval(cmdmap, "fs_name", fs_name);
1240 const auto &fs = fsmap.get_filesystem(fs_name);
1241 if (!fs) {
1242 ss << "filesystem '" << fs_name << "' not found";
1243 r = -ENOENT;
1244 } else {
1245 const MDSMap &mds_map = fs->mds_map;
1246 if (f) {
1247 mds_map.dump_flags_state(f.get());
1248 f->flush(ds);
1249 }
1250 else {
1251 mds_map.print_flags(ds);
1252 }
1253 r = 0;
1254 }
1255 }
1256
1257 out:
1258 if (r != -1) {
1259 rdata.append(ds);
1260 string rs;
1261 getline(ss, rs);
1262 mon.reply_command(op, r, rs, rdata, get_last_committed());
1263 return true;
1264 } else
1265 return false;
1266 }
1267
1268 bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
1269 {
1270 const auto& info = fsmap.get_info_gid(gid);
1271 dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
1272
1273 ceph_assert(mon.osdmon()->is_writeable());
1274
1275 epoch_t blocklist_epoch = 0;
1276 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1277 utime_t until = ceph_clock_now();
1278 until += g_conf().get_val<double>("mon_mds_blocklist_interval");
1279 blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until);
1280 }
1281
1282 fsmap.erase(gid, blocklist_epoch);
1283 last_beacon.erase(gid);
1284 if (pending_daemon_health.count(gid)) {
1285 pending_daemon_health.erase(gid);
1286 pending_daemon_health_rm.insert(gid);
1287 }
1288
1289 return blocklist_epoch != 0;
1290 }
1291
1292 mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
1293 {
1294 // Try parsing as a role
1295 mds_role_t role;
1296 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1297 int r = fsmap.parse_role(arg, &role, ignore_err);
1298 if (r == 0) {
1299 // See if a GID is assigned to this role
1300 const auto &fs = fsmap.get_filesystem(role.fscid);
1301 ceph_assert(fs != nullptr); // parse_role ensures it exists
1302 if (fs->mds_map.is_up(role.rank)) {
1303 dout(10) << __func__ << ": validated rank/GID " << role
1304 << " as a rank" << dendl;
1305 return fs->mds_map.get_mds_info(role.rank).global_id;
1306 }
1307 }
1308
1309 // Try parsing as a gid
1310 std::string err;
1311 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1312 if (!err.empty()) {
1313 // Not a role or a GID, try as a daemon name
1314 const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
1315 if (!mds_info) {
1316 ss << "MDS named '" << arg
1317 << "' does not exist, or is not up";
1318 return MDS_GID_NONE;
1319 }
1320 dout(10) << __func__ << ": resolved MDS name '" << arg
1321 << "' to GID " << mds_info->global_id << dendl;
1322 return mds_info->global_id;
1323 } else {
1324 // Not a role, but parses as a an integer, might be a GID
1325 dout(10) << __func__ << ": treating MDS reference '" << arg
1326 << "' as an integer " << maybe_gid << dendl;
1327
1328 if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
1329 return mds_gid_t(maybe_gid);
1330 }
1331 }
1332
1333 dout(1) << __func__ << ": rank/GID " << arg
1334 << " not a existent rank or GID" << dendl;
1335 return MDS_GID_NONE;
1336 }
1337
1338 int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
1339 const std::string &arg, MDSMap::mds_info_t *failed_info)
1340 {
1341 ceph_assert(failed_info != nullptr);
1342
1343 mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
1344 if (gid == MDS_GID_NONE) {
1345 return 0;
1346 }
1347 if (!mon.osdmon()->is_writeable()) {
1348 return -EAGAIN;
1349 }
1350
1351 // Take a copy of the info before removing the MDS from the map,
1352 // so that the caller knows which mds (if any) they ended up removing.
1353 *failed_info = fsmap.get_info_gid(gid);
1354
1355 fail_mds_gid(fsmap, gid);
1356 ss << "failed mds gid " << gid;
1357 ceph_assert(mon.osdmon()->is_writeable());
1358 request_proposal(mon.osdmon());
1359 return 0;
1360 }
1361
1362 bool MDSMonitor::prepare_command(MonOpRequestRef op)
1363 {
1364 op->mark_mdsmon_event(__func__);
1365 auto m = op->get_req<MMonCommand>();
1366 int r = -EINVAL;
1367 stringstream ss;
1368 bufferlist rdata;
1369
1370 cmdmap_t cmdmap;
1371 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1372 string rs = ss.str();
1373 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1374 return false;
1375 }
1376
1377 string prefix;
1378 cmd_getval(cmdmap, "prefix", prefix);
1379
1380 /* Refuse access if message not associated with a valid session */
1381 MonSession *session = op->get_session();
1382 if (!session) {
1383 mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1384 return false;
1385 }
1386
1387 auto &pending = get_pending_fsmap_writeable();
1388
1389 bool batched_propose = false;
1390 for (const auto &h : handlers) {
1391 r = h->can_handle(prefix, op, pending, cmdmap, ss);
1392 if (r == 1) {
1393 ; // pass, since we got the right handler.
1394 } else if (r == 0) {
1395 continue;
1396 } else {
1397 goto out;
1398 }
1399
1400 batched_propose = h->batched_propose();
1401 if (batched_propose) {
1402 paxos.plug();
1403 }
1404 r = h->handle(&mon, pending, op, cmdmap, ss);
1405 if (batched_propose) {
1406 paxos.unplug();
1407 }
1408
1409 if (r == -EAGAIN) {
1410 // message has been enqueued for retry; return.
1411 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1412 return false;
1413 } else {
1414 if (r == 0) {
1415 // On successful updates, print the updated map
1416 print_map(pending);
1417 }
1418 // Successful or not, we're done: respond.
1419 goto out;
1420 }
1421 }
1422
1423 r = filesystem_command(pending, op, prefix, cmdmap, ss);
1424 if (r >= 0) {
1425 goto out;
1426 } else if (r == -EAGAIN) {
1427 // Do not reply, the message has been enqueued for retry
1428 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1429 return false;
1430 } else if (r != -ENOSYS) {
1431 goto out;
1432 }
1433
1434 if (r == -ENOSYS && ss.str().empty()) {
1435 ss << "unrecognized command";
1436 }
1437
1438 out:
1439 dout(4) << __func__ << " done, r=" << r << dendl;
1440 /* Compose response */
1441 string rs;
1442 getline(ss, rs);
1443
1444 if (r >= 0) {
1445 // success.. delay reply
1446 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1447 get_last_committed() + 1));
1448 if (batched_propose) {
1449 force_immediate_propose();
1450 }
1451 return true;
1452 } else {
1453 // reply immediately
1454 mon.reply_command(op, r, rs, rdata, get_last_committed());
1455 return false;
1456 }
1457 }
1458
1459 int MDSMonitor::filesystem_command(
1460 FSMap &fsmap,
1461 MonOpRequestRef op,
1462 std::string const &prefix,
1463 const cmdmap_t& cmdmap,
1464 std::stringstream &ss)
1465 {
1466 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1467 op->mark_mdsmon_event(__func__);
1468 int r = 0;
1469 string whostr;
1470 cmd_getval(cmdmap, "role", whostr);
1471
1472 if (prefix == "mds set_state") {
1473 mds_gid_t gid;
1474 if (!cmd_getval(cmdmap, "gid", gid)) {
1475 ss << "error parsing 'gid' value '"
1476 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
1477 return -EINVAL;
1478 }
1479 MDSMap::DaemonState state;
1480 if (!cmd_getval(cmdmap, "state", state)) {
1481 ss << "error parsing 'state' string value '"
1482 << cmd_vartype_stringify(cmdmap.at("state")) << "'";
1483 return -EINVAL;
1484 }
1485 if (fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
1486 fsmap.modify_daemon(gid, [state](auto& info) {
1487 info.state = state;
1488 });
1489 ss << "set mds gid " << gid << " to state " << state << " "
1490 << ceph_mds_state_name(state);
1491 return 0;
1492 }
1493 } else if (prefix == "mds fail") {
1494 string who;
1495 cmd_getval(cmdmap, "role_or_gid", who);
1496
1497 MDSMap::mds_info_t failed_info;
1498 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1499 if (gid == MDS_GID_NONE) {
1500 ss << "MDS named '" << who << "' does not exist, is not up or you "
1501 << "lack the permission to see.";
1502 return 0;
1503 }
1504 if(!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
1505 ss << "MDS named '" << who << "' does not exist, is not up or you "
1506 << "lack the permission to see.";
1507 return -EINVAL;
1508 }
1509 string_view fs_name = fsmap.fs_name_from_gid(gid);
1510 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1511 ss << "Permission denied.";
1512 return -EPERM;
1513 }
1514
1515 r = fail_mds(fsmap, ss, who, &failed_info);
1516 if (r < 0 && r == -EAGAIN) {
1517 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1518 return -EAGAIN; // don't propose yet; wait for message to be retried
1519 } else if (r == 0) {
1520 // Only log if we really did something (not when was already gone)
1521 if (failed_info.global_id != MDS_GID_NONE) {
1522 mon.clog->info() << failed_info.human_name() << " marked failed by "
1523 << op->get_session()->entity_name;
1524 }
1525 }
1526 } else if (prefix == "mds rm") {
1527 mds_gid_t gid;
1528 if (!cmd_getval(cmdmap, "gid", gid)) {
1529 ss << "error parsing 'gid' value '"
1530 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
1531 return -EINVAL;
1532 }
1533 if (!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
1534 ss << "mds gid " << gid << " does not exist";
1535 return 0;
1536 }
1537 string_view fs_name = fsmap.fs_name_from_gid(gid);
1538 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1539 ss << "Permission denied.";
1540 return -EPERM;
1541 }
1542 const auto &info = fsmap.get_info_gid(gid);
1543 MDSMap::DaemonState state = info.state;
1544 if (state > 0) {
1545 ss << "cannot remove active mds." << info.name
1546 << " rank " << info.rank;
1547 return -EBUSY;
1548 } else {
1549 fsmap.erase(gid, {});
1550 ss << "removed mds gid " << gid;
1551 return 0;
1552 }
1553 } else if (prefix == "mds rmfailed") {
1554 bool confirm = false;
1555 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
1556 if (!confirm) {
1557 ss << "WARNING: this can make your filesystem inaccessible! "
1558 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1559 return -EPERM;
1560 }
1561
1562 std::string role_str;
1563 cmd_getval(cmdmap, "role", role_str);
1564 mds_role_t role;
1565 const auto fs_names = op->get_session()->get_allowed_fs_names();
1566 int r = fsmap.parse_role(role_str, &role, ss, fs_names);
1567 if (r < 0) {
1568 ss << "invalid role '" << role_str << "'";
1569 return -EINVAL;
1570 }
1571 string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
1572 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1573 ss << "Permission denied.";
1574 return -EPERM;
1575 }
1576
1577 fsmap.modify_filesystem(
1578 role.fscid,
1579 [role](std::shared_ptr<Filesystem> fs)
1580 {
1581 fs->mds_map.failed.erase(role.rank);
1582 });
1583
1584 ss << "removed failed mds." << role;
1585 return 0;
1586 /* TODO: convert to fs commands to update defaults */
1587 } else if (prefix == "mds compat rm_compat") {
1588 int64_t f;
1589 if (!cmd_getval(cmdmap, "feature", f)) {
1590 ss << "error parsing feature value '"
1591 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
1592 return -EINVAL;
1593 }
1594 if (fsmap.default_compat.compat.contains(f)) {
1595 ss << "removing compat feature " << f;
1596 fsmap.default_compat.compat.remove(f);
1597 } else {
1598 ss << "compat feature " << f << " not present in " << fsmap.default_compat;
1599 }
1600 r = 0;
1601 } else if (prefix == "mds compat rm_incompat") {
1602 int64_t f;
1603 if (!cmd_getval(cmdmap, "feature", f)) {
1604 ss << "error parsing feature value '"
1605 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
1606 return -EINVAL;
1607 }
1608 if (fsmap.default_compat.incompat.contains(f)) {
1609 ss << "removing incompat feature " << f;
1610 fsmap.default_compat.incompat.remove(f);
1611 } else {
1612 ss << "incompat feature " << f << " not present in " << fsmap.default_compat;
1613 }
1614 r = 0;
1615 } else if (prefix == "mds repaired") {
1616 std::string role_str;
1617 cmd_getval(cmdmap, "role", role_str);
1618 mds_role_t role;
1619 const auto fs_names = op->get_session()->get_allowed_fs_names();
1620 r = fsmap.parse_role(role_str, &role, ss, fs_names);
1621 if (r < 0) {
1622 return r;
1623 }
1624 string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
1625 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1626 ss << "Permission denied.";
1627 return -EPERM;
1628 }
1629
1630 bool modified = fsmap.undamaged(role.fscid, role.rank);
1631 if (modified) {
1632 ss << "repaired: restoring rank " << role;
1633 } else {
1634 ss << "nothing to do: rank is not damaged";
1635 }
1636
1637 r = 0;
1638 } else if (prefix == "mds freeze") {
1639 std::string who;
1640 cmd_getval(cmdmap, "role_or_gid", who);
1641 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1642 if (gid == MDS_GID_NONE) {
1643 return -EINVAL;
1644 }
1645
1646 string_view fs_name = fsmap.fs_name_from_gid(gid);
1647 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1648 ss << "Permission denied.";
1649 return -EPERM;
1650 }
1651
1652 bool freeze = false;
1653 {
1654 std::string str;
1655 cmd_getval(cmdmap, "val", str);
1656 if ((r = parse_bool(str, &freeze, ss)) != 0) {
1657 return r;
1658 }
1659 }
1660
1661 auto f = [freeze,gid,&ss](auto& info) {
1662 if (freeze) {
1663 ss << "freezing mds." << gid;
1664 info.freeze();
1665 } else {
1666 ss << "unfreezing mds." << gid;
1667 info.unfreeze();
1668 }
1669 };
1670 fsmap.modify_daemon(gid, f);
1671 r = 0;
1672 } else {
1673 return -ENOSYS;
1674 }
1675
1676 return r;
1677 }
1678
1679 void MDSMonitor::check_subs()
1680 {
1681 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1682 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1683 // filesystems. Build a list of all the types we service
1684 // subscriptions for.
1685
1686 std::vector<std::string> types = {
1687 "fsmap",
1688 "fsmap.user",
1689 "mdsmap",
1690 };
1691
1692 for (const auto &p : get_fsmap().filesystems) {
1693 const auto &fscid = p.first;
1694 CachedStackStringStream cos;
1695 *cos << "mdsmap." << fscid;
1696 types.push_back(std::string(cos->strv()));
1697 }
1698
1699 for (const auto &type : types) {
1700 auto& subs = mon.session_map.subs;
1701 auto subs_it = subs.find(type);
1702 if (subs_it == subs.end())
1703 continue;
1704 auto sub_it = subs_it->second->begin();
1705 while (!sub_it.end()) {
1706 auto sub = *sub_it;
1707 ++sub_it; // N.B. check_sub may remove sub!
1708 check_sub(sub);
1709 }
1710 }
1711 }
1712
1713
1714 void MDSMonitor::check_sub(Subscription *sub)
1715 {
1716 dout(20) << __func__ << ": " << sub->type << dendl;
1717
1718 // to use const qualifier filter fsmap beforehand
1719 FSMap _fsmap_copy = get_fsmap();
1720 _fsmap_copy.filter(sub->session->get_allowed_fs_names());
1721 const auto& fsmap = _fsmap_copy;
1722 if (sub->next > fsmap.get_epoch()) {
1723 return;
1724 }
1725
1726 if (sub->type == "fsmap") {
1727 sub->session->con->send_message(new MFSMap(mon.monmap->fsid, fsmap));
1728 if (sub->onetime) {
1729 mon.session_map.remove_sub(sub);
1730 } else {
1731 sub->next = fsmap.get_epoch() + 1;
1732 }
1733 } else if (sub->type == "fsmap.user") {
1734 FSMapUser fsmap_u;
1735 fsmap_u.epoch = fsmap.get_epoch();
1736 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1737 for (const auto &p : fsmap.filesystems) {
1738 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
1739 fs_info.cid = p.second->fscid;
1740 fs_info.name = p.second->mds_map.fs_name;
1741 }
1742 sub->session->con->send_message(new MFSMapUser(mon.monmap->fsid, fsmap_u));
1743 if (sub->onetime) {
1744 mon.session_map.remove_sub(sub);
1745 } else {
1746 sub->next = fsmap.get_epoch() + 1;
1747 }
1748 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
1749 const bool is_mds = sub->session->name.is_mds();
1750 mds_gid_t mds_gid = MDS_GID_NONE;
1751 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1752 if (is_mds) {
1753 // What (if any) namespace are you assigned to?
1754 auto mds_info = fsmap.get_mds_info();
1755 for (const auto &p : mds_info) {
1756 if (p.second.addrs == sub->session->addrs) {
1757 mds_gid = p.first;
1758 fscid = fsmap.mds_roles.at(mds_gid);
1759 }
1760 }
1761 } else {
1762 // You're a client. Did you request a particular
1763 // namespace?
1764 if (sub->type.compare(0, 7, "mdsmap.") == 0) {
1765 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1766 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1767 std::string err;
1768 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1769 if (!err.empty()) {
1770 // Client asked for a non-existent namespace, send them nothing
1771 dout(1) << "Invalid client subscription '" << sub->type
1772 << "'" << dendl;
1773 return;
1774 }
1775 } else {
1776 // Unqualified request for "mdsmap": give it the one marked
1777 // for use by legacy clients.
1778 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1779 fscid = fsmap.legacy_client_fscid;
1780 } else {
1781 dout(1) << "Client subscribed for legacy filesystem but "
1782 "none is configured" << dendl;
1783 return;
1784 }
1785 }
1786 if (!fsmap.filesystem_exists(fscid)) {
1787 // Client asked for a non-existent namespace, send them nothing
1788 // TODO: something more graceful for when a client has a filesystem
1789 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1790 // flag to MMDSMap?
1791 dout(1) << "Client subscribed to non-existent namespace '" <<
1792 fscid << "'" << dendl;
1793 return;
1794 }
1795 }
1796 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid=" << fscid << dendl;
1797
1798 // Work out the effective latest epoch
1799 const MDSMap *mds_map = nullptr;
1800 MDSMap null_map = MDSMap::create_null_mdsmap();
1801 if (fscid == FS_CLUSTER_ID_NONE) {
1802 // For a client, we should have already dropped out
1803 ceph_assert(is_mds);
1804
1805 auto it = fsmap.standby_daemons.find(mds_gid);
1806 if (it != fsmap.standby_daemons.end()) {
1807 // For an MDS, we need to feed it an MDSMap with its own state in
1808 null_map.mds_info[mds_gid] = it->second;
1809 null_map.epoch = fsmap.standby_epochs.at(mds_gid);
1810 } else {
1811 null_map.epoch = fsmap.epoch;
1812 }
1813 mds_map = &null_map;
1814 } else {
1815 // Check the effective epoch
1816 mds_map = &fsmap.get_filesystem(fscid)->mds_map;
1817 }
1818
1819 ceph_assert(mds_map != nullptr);
1820 dout(10) << __func__ << " selected MDS map epoch " <<
1821 mds_map->epoch << " for namespace " << fscid << " for subscriber "
1822 << sub->session->name << " who wants epoch " << sub->next << dendl;
1823
1824 if (sub->next > mds_map->epoch) {
1825 return;
1826 }
1827 auto msg = make_message<MMDSMap>(mon.monmap->fsid, *mds_map);
1828
1829 sub->session->con->send_message(msg.detach());
1830 if (sub->onetime) {
1831 mon.session_map.remove_sub(sub);
1832 } else {
1833 sub->next = mds_map->get_epoch() + 1;
1834 }
1835 }
1836 }
1837
1838
1839 void MDSMonitor::update_metadata(mds_gid_t gid,
1840 const map<string, string>& metadata)
1841 {
1842 dout(20) << __func__ << ": mds." << gid << ": " << metadata << dendl;
1843 if (metadata.empty()) {
1844 dout(5) << __func__ << ": mds." << gid << ": no metadata!" << dendl;
1845 return;
1846 }
1847 pending_metadata[gid] = metadata;
1848
1849 MonitorDBStore::TransactionRef t = paxos.get_pending_transaction();
1850 bufferlist bl;
1851 encode(pending_metadata, bl);
1852 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1853 }
1854
1855 void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
1856 {
1857 bool update = false;
1858 for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) {
1859 if (!fsmap.gid_exists(it->first)) {
1860 it = pending_metadata.erase(it);
1861 update = true;
1862 } else {
1863 ++it;
1864 }
1865 }
1866 if (!update)
1867 return;
1868 bufferlist bl;
1869 encode(pending_metadata, bl);
1870 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1871 }
1872
1873 int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1874 {
1875 bufferlist bl;
1876 int r = mon.store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1877 if (r) {
1878 dout(5) << "Unable to load 'last_metadata'" << dendl;
1879 return r;
1880 }
1881
1882 auto it = bl.cbegin();
1883 ceph::decode(m, it);
1884 return 0;
1885 }
1886
1887 void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out)
1888 {
1889 map<mds_gid_t,Metadata> meta;
1890 load_metadata(meta);
1891 for (auto& p : meta) {
1892 auto q = p.second.find(field);
1893 if (q == p.second.end()) {
1894 (*out)["unknown"]++;
1895 } else {
1896 (*out)[q->second]++;
1897 }
1898 }
1899 }
1900
1901 void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
1902 {
1903 map<string,int> by_val;
1904 count_metadata(field, &by_val);
1905 f->open_object_section(field.c_str());
1906 for (auto& p : by_val) {
1907 f->dump_int(p.first.c_str(), p.second);
1908 }
1909 f->close_section();
1910 }
1911
1912 void MDSMonitor::get_versions(std::map<string, list<string> > &versions)
1913 {
1914 map<mds_gid_t,Metadata> meta;
1915 load_metadata(meta);
1916 const auto &fsmap = get_fsmap();
1917 std::map<mds_gid_t, mds_info_t> map = fsmap.get_mds_info();
1918 dout(10) << __func__ << " mds meta=" << meta << dendl;
1919 for (auto& p : meta) {
1920 auto q = p.second.find("ceph_version_short");
1921 if (q == p.second.end()) continue;
1922 versions[q->second].push_back(string("mds.") + map[p.first].name);
1923 }
1924 }
1925
1926 int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
1927 Formatter *f, ostream& err)
1928 {
1929 ceph_assert(f);
1930
1931 mds_gid_t gid = gid_from_arg(fsmap, who, err);
1932 if (gid == MDS_GID_NONE) {
1933 return -EINVAL;
1934 }
1935
1936 map<mds_gid_t, Metadata> metadata;
1937 if (int r = load_metadata(metadata)) {
1938 err << "Unable to load 'last_metadata'";
1939 return r;
1940 }
1941
1942 if (!metadata.count(gid)) {
1943 return -ENOENT;
1944 }
1945 const Metadata& m = metadata[gid];
1946 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1947 f->dump_string(p->first.c_str(), p->second);
1948 }
1949 return 0;
1950 }
1951
1952 int MDSMonitor::print_nodes(Formatter *f)
1953 {
1954 ceph_assert(f);
1955
1956 const auto &fsmap = get_fsmap();
1957
1958 map<mds_gid_t, Metadata> metadata;
1959 if (int r = load_metadata(metadata)) {
1960 return r;
1961 }
1962
1963 map<string, list<string> > mdses; // hostname => mds
1964 for (const auto &p : metadata) {
1965 const mds_gid_t& gid = p.first;
1966 const Metadata& m = p.second;
1967 Metadata::const_iterator hostname = m.find("hostname");
1968 if (hostname == m.end()) {
1969 // not likely though
1970 continue;
1971 }
1972 if (!fsmap.gid_exists(gid)) {
1973 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1974 continue;
1975 }
1976 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
1977 mdses[hostname->second].push_back(mds_info.name);
1978 }
1979
1980 dump_services(f, mdses, "mds");
1981 return 0;
1982 }
1983
1984 /**
1985 * If a cluster is undersized (with respect to max_mds), then
1986 * attempt to find daemons to grow it. If the cluster is oversized
1987 * (with respect to max_mds) then shrink it by stopping its highest rank.
1988 */
1989 bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
1990 {
1991 auto&& fs = fsmap.get_filesystem(fscid);
1992 auto &mds_map = fs->mds_map;
1993
1994 int in = mds_map.get_num_in_mds();
1995 int max = mds_map.get_max_mds();
1996
1997 dout(20) << __func__ << " in " << in << " max " << max << dendl;
1998
1999 /* Check that both the current epoch mds_map is resizeable as well as the
2000 * current batch of changes in pending. This is important if an MDS is
2001 * becoming active in the next epoch.
2002 */
2003 if (!get_fsmap().filesystem_exists(fscid) ||
2004 !get_fsmap().get_filesystem(fscid)->mds_map.is_resizeable() ||
2005 !mds_map.is_resizeable()) {
2006 dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
2007 return false;
2008 }
2009
2010 if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2011 mds_rank_t mds = mds_rank_t(0);
2012 while (mds_map.is_in(mds)) {
2013 mds++;
2014 }
2015 auto info = fsmap.find_replacement_for({fscid, mds});
2016 if (!info) {
2017 return false;
2018 }
2019
2020 dout(1) << "assigned standby " << info->addrs
2021 << " as mds." << mds << dendl;
2022 mon.clog->info() << info->human_name() << " assigned to "
2023 "filesystem " << mds_map.fs_name << " as rank "
2024 << mds << " (now has " << mds_map.get_num_in_mds() + 1
2025 << " ranks)";
2026 fsmap.promote(info->global_id, *fs, mds);
2027 return true;
2028 } else if (in > max) {
2029 mds_rank_t target = in - 1;
2030 const auto &info = mds_map.get_info(target);
2031 if (mds_map.is_active(target)) {
2032 dout(1) << "stopping " << target << dendl;
2033 mon.clog->info() << "stopping " << info.human_name();
2034 auto f = [](auto& info) {
2035 info.state = MDSMap::STATE_STOPPING;
2036 };
2037 fsmap.modify_daemon(info.global_id, f);
2038 return true;
2039 } else {
2040 dout(20) << "skipping stop of " << target << dendl;
2041 return false;
2042 }
2043 }
2044
2045 return false;
2046 }
2047
2048
2049 /**
2050 * Fail a daemon and replace it with a suitable standby.
2051 */
2052 bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose)
2053 {
2054 ceph_assert(osd_propose != nullptr);
2055
2056 const auto fscid = fsmap.mds_roles.at(gid);
2057 const auto& info = fsmap.get_info_gid(gid);
2058 const auto rank = info.rank;
2059 const auto state = info.state;
2060
2061 if (info.is_frozen()) {
2062 return false;
2063 } else if (state == MDSMap::STATE_STANDBY_REPLAY ||
2064 state == MDSMap::STATE_STANDBY) {
2065 dout(1) << " failing and removing standby " << gid << " " << info.addrs
2066 << " mds." << rank
2067 << "." << info.inc << " " << ceph_mds_state_name(state)
2068 << dendl;
2069 *osd_propose |= fail_mds_gid(fsmap, gid);
2070 return true;
2071 } else if (rank >= 0 && rep_info) {
2072 auto fs = fsmap.filesystems.at(fscid);
2073 if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2074 return false;
2075 }
2076 // are we in?
2077 // and is there a non-laggy standby that can take over for us?
2078 dout(1) << " replacing " << gid << " " << info.addrs
2079 << " mds." << rank << "." << info.inc
2080 << " " << ceph_mds_state_name(state)
2081 << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs
2082 << dendl;
2083
2084 mon.clog->warn() << "Replacing " << info.human_name()
2085 << " as rank " << rank
2086 << " with standby " << rep_info->human_name();
2087
2088 // Remove the old one
2089 *osd_propose |= fail_mds_gid(fsmap, gid);
2090
2091 // Promote the replacement
2092 fsmap.promote(rep_info->global_id, *fs, rank);
2093
2094 return true;
2095 }
2096 return false;
2097 }
2098
2099 bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap)
2100 {
2101 bool do_propose = false;
2102 const auto now = mono_clock::now();
2103 const bool osdmap_writeable = mon.osdmon()->is_writeable();
2104 const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace");
2105 const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
2106
2107 if (mono_clock::is_zero(last_tick)) {
2108 last_tick = now;
2109 }
2110
2111 {
2112 auto since_last = std::chrono::duration<double>(now-last_tick);
2113
2114 if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) {
2115 // This case handles either local slowness (calls being delayed
2116 // for whatever reason) or cluster election slowness (a long gap
2117 // between calls while an election happened)
2118 dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
2119 "(slow election?) of " << since_last.count() << " seconds" << dendl;
2120 for (auto& p : last_beacon) {
2121 p.second.stamp = now;
2122 }
2123 }
2124 }
2125
2126 // make sure last_beacon is fully populated
2127 for (auto& p : fsmap.mds_roles) {
2128 auto& gid = p.first;
2129 last_beacon.emplace(std::piecewise_construct,
2130 std::forward_as_tuple(gid),
2131 std::forward_as_tuple(now, 0));
2132 }
2133
2134 // We will only take decisive action (replacing/removing a daemon)
2135 // if we have some indication that some other daemon(s) are successfully
2136 // getting beacons through recently.
2137 mono_time latest_beacon = mono_clock::zero();
2138 for (const auto& p : last_beacon) {
2139 latest_beacon = std::max(p.second.stamp, latest_beacon);
2140 }
2141 auto since = std::chrono::duration<double>(now-latest_beacon);
2142 const bool may_replace = since.count() <
2143 std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5);
2144
2145 // check beacon timestamps
2146 std::vector<mds_gid_t> to_remove;
2147 const bool mon_down = mon.is_mon_down();
2148 const auto mds_beacon_mon_down_grace =
2149 g_conf().get_val<std::chrono::seconds>("mds_beacon_mon_down_grace");
2150 const auto quorum_age = std::chrono::seconds(mon.quorum_age());
2151 const bool new_quorum = quorum_age < mds_beacon_mon_down_grace;
2152 for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
2153 auto& [gid, beacon_info] = *it;
2154 auto since_last = std::chrono::duration<double>(now-beacon_info.stamp);
2155
2156 if (!fsmap.gid_exists(gid)) {
2157 // gid no longer exists, remove from tracked beacons
2158 it = last_beacon.erase(it);
2159 continue;
2160 }
2161
2162 if (since_last.count() >= g_conf()->mds_beacon_grace) {
2163 auto& info = fsmap.get_info_gid(gid);
2164 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2165 << " (gid: " << gid << " addr: " << info.addrs
2166 << " state: " << ceph_mds_state_name(info.state) << ")"
2167 << " since " << since_last.count() << dendl;
2168 if ((mon_down || new_quorum) && since_last < mds_beacon_mon_down_grace) {
2169 /* The MDS may be sending beacons to a monitor not yet in quorum or
2170 * temporarily partitioned. Hold off on removal for a little longer...
2171 */
2172 dout(10) << "deferring removal for mds_beacon_mon_down_grace during MON_DOWN" << dendl;
2173 ++it;
2174 continue;
2175 }
2176 // If the OSDMap is writeable, we can blocklist things, so we can
2177 // try failing any laggy MDS daemons. Consider each one for failure.
2178 if (!info.laggy()) {
2179 dout(1) << " marking " << gid << " " << info.addrs
2180 << " mds." << info.rank << "." << info.inc
2181 << " " << ceph_mds_state_name(info.state)
2182 << " laggy" << dendl;
2183 fsmap.modify_daemon(info.global_id, [](auto& info) {
2184 info.laggy_since = ceph_clock_now();
2185 });
2186 do_propose = true;
2187 }
2188 if (osdmap_writeable && may_replace) {
2189 to_remove.push_back(gid); // drop_mds may invalidate iterator
2190 }
2191 }
2192
2193 ++it;
2194 }
2195
2196 for (const auto& gid : to_remove) {
2197 auto info = fsmap.get_info_gid(gid);
2198 const mds_info_t* rep_info = nullptr;
2199 if (info.rank >= 0) {
2200 auto fscid = fsmap.fscid_from_gid(gid);
2201 rep_info = fsmap.find_replacement_for({fscid, info.rank});
2202 }
2203 bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap);
2204 if (dropped) {
2205 mon.clog->info() << "MDS " << info.human_name()
2206 << " is removed because it is dead or otherwise unavailable.";
2207 do_propose = true;
2208 }
2209 }
2210
2211 if (osdmap_writeable) {
2212 for (auto& [fscid, fs] : fsmap.filesystems) {
2213 if (!fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
2214 fs->mds_map.is_resizeable()) {
2215 // Check if a rank or standby-replay should be replaced with a stronger
2216 // affinity standby. This looks at ranks and standby-replay:
2217 for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
2218 const auto join_fscid = info.join_fscid;
2219 if (join_fscid == fscid)
2220 continue;
2221 const auto rank = info.rank;
2222 const auto state = info.state;
2223 const mds_info_t* rep_info = nullptr;
2224 if (state == MDSMap::STATE_STANDBY_REPLAY) {
2225 rep_info = fsmap.get_available_standby(*fs);
2226 } else if (state == MDSMap::STATE_ACTIVE) {
2227 rep_info = fsmap.find_replacement_for({fscid, rank});
2228 } else {
2229 /* N.B. !is_degraded() */
2230 ceph_abort_msg("invalid state in MDSMap");
2231 }
2232 if (!rep_info) {
2233 break;
2234 }
2235 bool better_affinity = false;
2236 if (join_fscid == FS_CLUSTER_ID_NONE) {
2237 better_affinity = (rep_info->join_fscid == fscid);
2238 } else {
2239 better_affinity = (rep_info->join_fscid == fscid) ||
2240 (rep_info->join_fscid == FS_CLUSTER_ID_NONE);
2241 }
2242 if (better_affinity) {
2243 if (state == MDSMap::STATE_STANDBY_REPLAY) {
2244 mon.clog->info() << "Dropping low affinity standby-replay "
2245 << info.human_name()
2246 << " in favor of higher affinity standby.";
2247 *propose_osdmap |= fail_mds_gid(fsmap, gid);
2248 /* Now let maybe_promote_standby do the promotion. */
2249 } else {
2250 mon.clog->info() << "Dropping low affinity active "
2251 << info.human_name()
2252 << " in favor of higher affinity standby.";
2253 do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap);
2254 }
2255 break; /* don't replace more than one per tick per fs */
2256 }
2257 }
2258 }
2259 }
2260 }
2261 return do_propose;
2262 }
2263
2264 bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
2265 {
2266 if (fs.mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2267 return false;
2268 }
2269
2270 bool do_propose = false;
2271
2272 // have a standby take over?
2273 set<mds_rank_t> failed;
2274 fs.mds_map.get_failed_mds_set(failed);
2275 for (const auto& rank : failed) {
2276 auto info = fsmap.find_replacement_for({fs.fscid, rank});
2277 if (info) {
2278 dout(1) << " taking over failed mds." << rank << " with " << info->global_id
2279 << "/" << info->name << " " << info->addrs << dendl;
2280 mon.clog->info() << "Standby " << info->human_name()
2281 << " assigned to filesystem " << fs.mds_map.fs_name
2282 << " as rank " << rank;
2283
2284 fsmap.promote(info->global_id, fs, rank);
2285 do_propose = true;
2286 }
2287 }
2288
2289 if (fs.mds_map.is_resizeable() && fs.mds_map.allows_standby_replay()) {
2290 // There were no failures to replace, so try using any available standbys
2291 // as standby-replay daemons. Don't do this when the cluster is degraded
2292 // as a standby-replay daemon may try to read a journal being migrated.
2293 for (;;) {
2294 auto info = fsmap.get_available_standby(fs);
2295 if (!info) break;
2296 dout(20) << "standby available mds." << info->global_id << dendl;
2297 bool changed = false;
2298 for (const auto& rank : fs.mds_map.in) {
2299 dout(20) << "examining " << rank << dendl;
2300 if (fs.mds_map.is_followable(rank)) {
2301 dout(1) << " setting mds." << info->global_id
2302 << " to follow mds rank " << rank << dendl;
2303 fsmap.assign_standby_replay(info->global_id, fs.fscid, rank);
2304 do_propose = true;
2305 changed = true;
2306 break;
2307 }
2308 }
2309 if (!changed) break;
2310 }
2311 }
2312
2313 return do_propose;
2314 }
2315
2316 void MDSMonitor::tick()
2317 {
2318 if (!is_active() || !is_leader()) return;
2319
2320 auto &pending = get_pending_fsmap_writeable();
2321
2322 bool do_propose = false;
2323 bool propose_osdmap = false;
2324
2325 if (check_fsmap_struct_version) {
2326 /* Allow time for trimming otherwise PaxosService::is_writeable will always
2327 * be false.
2328 */
2329
2330 auto now = clock::now();
2331 auto elapsed = now - last_fsmap_struct_flush;
2332 if (elapsed > std::chrono::seconds(30)) {
2333 FSMap fsmap;
2334 bufferlist bl;
2335 auto v = get_first_committed();
2336 int err = get_version(v, bl);
2337 if (err) {
2338 derr << "could not get version " << v << dendl;
2339 ceph_abort();
2340 }
2341 try {
2342 fsmap.decode(bl);
2343 } catch (const ceph::buffer::malformed_input& e) {
2344 dout(5) << "flushing old fsmap struct because unable to decode FSMap: " << e.what() << dendl;
2345 }
2346 /* N.B. FSMap::is_struct_old is also true for undecoded (failed to decode) FSMap */
2347 if (fsmap.is_struct_old()) {
2348 dout(5) << "fsmap struct is too old; proposing to flush out old versions" << dendl;
2349 do_propose = true;
2350 last_fsmap_struct_flush = now;
2351 } else {
2352 dout(20) << "struct is recent" << dendl;
2353 check_fsmap_struct_version = false;
2354 }
2355 }
2356 }
2357
2358 do_propose |= pending.check_health();
2359
2360 /* Check health and affinity of ranks */
2361 do_propose |= check_health(pending, &propose_osdmap);
2362
2363 /* Resize the cluster according to max_mds. */
2364 for (auto& p : pending.filesystems) {
2365 do_propose |= maybe_resize_cluster(pending, p.second->fscid);
2366 }
2367
2368 /* Replace any failed ranks. */
2369 for (auto& p : pending.filesystems) {
2370 do_propose |= maybe_promote_standby(pending, *p.second);
2371 }
2372
2373 if (propose_osdmap) {
2374 request_proposal(mon.osdmon());
2375 }
2376
2377 if (do_propose) {
2378 propose_pending();
2379 }
2380
2381 last_tick = mono_clock::now();
2382 }
2383
2384 MDSMonitor::MDSMonitor(Monitor &mn, Paxos &p, string service_name)
2385 : PaxosService(mn, p, service_name)
2386 {
2387 handlers = FileSystemCommandHandler::load(&p);
2388 }
2389
2390 void MDSMonitor::on_restart()
2391 {
2392 // Clear out the leader-specific state.
2393 last_tick = mono_clock::now();
2394 last_beacon.clear();
2395 }
2396