]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/MDSMonitor.cc
use the buster suite for getting the source package for now
[ceph.git] / ceph / src / mon / MDSMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
11fdf7f2 15#include <regex>
7c673cae
FG
16#include <sstream>
17#include <boost/utility.hpp>
18
19#include "MDSMonitor.h"
20#include "FSCommands.h"
21#include "Monitor.h"
22#include "MonitorDBStore.h"
23#include "OSDMonitor.h"
7c673cae
FG
24
25#include "common/strtol.h"
26#include "common/perf_counters.h"
27#include "common/config.h"
28#include "common/cmdparse.h"
29#include "messages/MMDSMap.h"
30#include "messages/MFSMap.h"
31#include "messages/MFSMapUser.h"
32#include "messages/MMDSLoadTargets.h"
33#include "messages/MMonCommand.h"
34#include "messages/MGenericMessage.h"
35
11fdf7f2 36#include "include/ceph_assert.h"
7c673cae
FG
37#include "include/str_list.h"
38#include "include/stringify.h"
39#include "mds/mdstypes.h"
40#include "Session.h"
41
f67539c2
TL
42using namespace TOPNSPC::common;
43
44using std::dec;
45using std::hex;
46using std::list;
47using std::map;
48using std::make_pair;
49using std::ostream;
50using std::ostringstream;
51using std::pair;
52using std::set;
53using std::string;
54using std::string_view;
55using std::stringstream;
56using std::to_string;
57using std::vector;
58
59using ceph::bufferlist;
60using ceph::decode;
61using ceph::encode;
62using ceph::ErasureCodeInterfaceRef;
63using ceph::ErasureCodeProfile;
64using ceph::Formatter;
65using ceph::JSONFormatter;
66using ceph::make_message;
67using ceph::mono_clock;
68using ceph::mono_time;
69
7c673cae
FG
70#define dout_subsys ceph_subsys_mon
71#undef dout_prefix
28e407b8 72#define dout_prefix _prefix(_dout, mon, get_fsmap())
f67539c2
TL
73static ostream& _prefix(std::ostream *_dout, Monitor &mon, const FSMap& fsmap) {
74 return *_dout << "mon." << mon.name << "@" << mon.rank
75 << "(" << mon.get_state_name()
7c673cae
FG
76 << ").mds e" << fsmap.get_epoch() << " ";
77}
78
3efd9988
FG
79static const string MDS_METADATA_PREFIX("mds_metadata");
80static const string MDS_HEALTH_PREFIX("mds_health");
81
82
7c673cae
FG
83/*
84 * Specialized implementation of cmd_getval to allow us to parse
85 * out strongly-typedef'd types
86 */
9f95a23c
TL
87namespace TOPNSPC::common {
88template<> bool cmd_getval(const cmdmap_t& cmdmap,
31f18b77 89 const std::string& k, mds_gid_t &val)
7c673cae 90{
9f95a23c 91 return cmd_getval(cmdmap, k, (int64_t&)val);
7c673cae
FG
92}
93
9f95a23c 94template<> bool cmd_getval(const cmdmap_t& cmdmap,
31f18b77 95 const std::string& k, mds_rank_t &val)
7c673cae 96{
9f95a23c 97 return cmd_getval(cmdmap, k, (int64_t&)val);
7c673cae
FG
98}
99
9f95a23c 100template<> bool cmd_getval(const cmdmap_t& cmdmap,
31f18b77 101 const std::string& k, MDSMap::DaemonState &val)
7c673cae 102{
9f95a23c
TL
103 return cmd_getval(cmdmap, k, (int64_t&)val);
104}
7c673cae 105}
7c673cae
FG
106// my methods
107
11fdf7f2
TL
108template <int dblV>
109void MDSMonitor::print_map(const FSMap& m)
7c673cae 110{
11fdf7f2 111 dout(dblV) << "print_map\n";
7c673cae
FG
112 m.print(*_dout);
113 *_dout << dendl;
114}
115
116// service methods
117void MDSMonitor::create_initial()
118{
119 dout(10) << "create_initial" << dendl;
120}
121
11fdf7f2 122void MDSMonitor::get_store_prefixes(std::set<string>& s) const
3efd9988
FG
123{
124 s.insert(service_name);
125 s.insert(MDS_METADATA_PREFIX);
126 s.insert(MDS_HEALTH_PREFIX);
127}
7c673cae
FG
128
129void MDSMonitor::update_from_paxos(bool *need_bootstrap)
130{
131 version_t version = get_last_committed();
28e407b8 132 if (version == get_fsmap().epoch)
7c673cae
FG
133 return;
134
135 dout(10) << __func__ << " version " << version
28e407b8 136 << ", my e " << get_fsmap().epoch << dendl;
11fdf7f2 137 ceph_assert(version > get_fsmap().epoch);
7c673cae 138
224ce89b
WB
139 load_health();
140
7c673cae
FG
141 // read and decode
142 bufferlist fsmap_bl;
143 fsmap_bl.clear();
144 int err = get_version(version, fsmap_bl);
11fdf7f2 145 ceph_assert(err == 0);
7c673cae 146
11fdf7f2 147 ceph_assert(fsmap_bl.length() > 0);
7c673cae 148 dout(10) << __func__ << " got " << version << dendl;
28e407b8 149 PaxosFSMap::decode(fsmap_bl);
7c673cae
FG
150
151 // new map
91327a77 152 dout(0) << "new map" << dendl;
11fdf7f2
TL
153 print_map<0>(get_fsmap());
154 if (!g_conf()->mon_mds_skip_sanity) {
28e407b8 155 get_fsmap().sanity();
7c673cae
FG
156 }
157
158 check_subs();
7c673cae
FG
159}
160
161void MDSMonitor::init()
162{
163 (void)load_metadata(pending_metadata);
164}
165
166void MDSMonitor::create_pending()
167{
28e407b8 168 auto &fsmap = PaxosFSMap::create_pending();
7c673cae 169
f67539c2
TL
170 if (mon.osdmon()->is_readable()) {
171 const auto &osdmap = mon.osdmon()->osdmap;
28e407b8 172 fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
3efd9988
FG
173 }
174
28e407b8 175 dout(10) << "create_pending e" << fsmap.epoch << dendl;
7c673cae
FG
176}
177
178void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
179{
28e407b8
AA
180 auto &pending = get_pending_fsmap_writeable();
181 auto &epoch = pending.epoch;
7c673cae 182
28e407b8 183 dout(10) << "encode_pending e" << epoch << dendl;
7c673cae
FG
184
185 // print map iff 'debug mon = 30' or higher
11fdf7f2
TL
186 print_map<30>(pending);
187 if (!g_conf()->mon_mds_skip_sanity) {
28e407b8 188 pending.sanity();
7c673cae
FG
189 }
190
191 // Set 'modified' on maps modified this epoch
28e407b8
AA
192 for (auto &p : pending.filesystems) {
193 if (p.second->mds_map.epoch == epoch) {
194 p.second->mds_map.modified = ceph_clock_now();
7c673cae
FG
195 }
196 }
197
198 // apply to paxos
11fdf7f2 199 ceph_assert(get_last_committed() + 1 == pending.epoch);
28e407b8 200 bufferlist pending_bl;
f67539c2 201 pending.encode(pending_bl, mon.get_quorum_con_features());
7c673cae
FG
202
203 /* put everything in the transaction */
28e407b8
AA
204 put_version(t, pending.epoch, pending_bl);
205 put_last_committed(t, pending.epoch);
7c673cae
FG
206
207 // Encode MDSHealth data
208 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
209 i != pending_daemon_health.end(); ++i) {
210 bufferlist bl;
211 i->second.encode(bl);
212 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
213 }
214
215 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
216 i != pending_daemon_health_rm.end(); ++i) {
217 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
218 }
219 pending_daemon_health_rm.clear();
1adf2230 220 remove_from_metadata(pending, t);
224ce89b
WB
221
222 // health
223 health_check_map_t new_checks;
28e407b8 224 const auto &info_map = pending.get_mds_info();
224ce89b
WB
225 for (const auto &i : info_map) {
226 const auto &gid = i.first;
227 const auto &info = i.second;
228 if (pending_daemon_health_rm.count(gid)) {
229 continue;
230 }
231 MDSHealth health;
232 auto p = pending_daemon_health.find(gid);
233 if (p != pending_daemon_health.end()) {
234 health = p->second;
235 } else {
236 bufferlist bl;
f67539c2 237 mon.store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
224ce89b
WB
238 if (!bl.length()) {
239 derr << "Missing health data for MDS " << gid << dendl;
240 continue;
241 }
11fdf7f2 242 auto bl_i = bl.cbegin();
224ce89b
WB
243 health.decode(bl_i);
244 }
245 for (const auto &metric : health.metrics) {
9f95a23c 246 const auto rank = info.rank;
224ce89b
WB
247 health_check_t *check = &new_checks.get_or_add(
248 mds_metric_name(metric.type),
249 metric.sev,
9f95a23c
TL
250 mds_metric_summary(metric.type),
251 1);
224ce89b 252 ostringstream ss;
f91f0fd5 253 ss << "mds." << info.name << "(mds." << rank << "): " << metric.message;
28e407b8
AA
254 bool first = true;
255 for (auto &p : metric.metadata) {
256 if (first) {
257 ss << " ";
258 } else {
224ce89b 259 ss << ", ";
28e407b8
AA
260 }
261 ss << p.first << ": " << p.second;
262 first = false;
224ce89b
WB
263 }
264 check->detail.push_back(ss.str());
265 }
266 }
28e407b8 267 pending.get_health_checks(&new_checks);
224ce89b 268 for (auto& p : new_checks.checks) {
11fdf7f2 269 p.second.summary = std::regex_replace(
224ce89b 270 p.second.summary,
11fdf7f2 271 std::regex("%num%"),
224ce89b 272 stringify(p.second.detail.size()));
11fdf7f2 273 p.second.summary = std::regex_replace(
224ce89b 274 p.second.summary,
11fdf7f2 275 std::regex("%plurals%"),
224ce89b 276 p.second.detail.size() > 1 ? "s" : "");
11fdf7f2 277 p.second.summary = std::regex_replace(
224ce89b 278 p.second.summary,
11fdf7f2 279 std::regex("%isorare%"),
224ce89b 280 p.second.detail.size() > 1 ? "are" : "is");
11fdf7f2 281 p.second.summary = std::regex_replace(
181888fb 282 p.second.summary,
11fdf7f2 283 std::regex("%hasorhave%"),
181888fb 284 p.second.detail.size() > 1 ? "have" : "has");
224ce89b
WB
285 }
286 encode_health(new_checks, t);
7c673cae
FG
287}
288
11fdf7f2 289version_t MDSMonitor::get_trim_to() const
7c673cae
FG
290{
291 version_t floor = 0;
11fdf7f2
TL
292 if (g_conf()->mon_mds_force_trim_to > 0 &&
293 g_conf()->mon_mds_force_trim_to < (int)get_last_committed()) {
294 floor = g_conf()->mon_mds_force_trim_to;
7c673cae
FG
295 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
296 << floor << dendl;
297 }
298
11fdf7f2 299 unsigned max = g_conf()->mon_max_mdsmap_epochs;
7c673cae
FG
300 version_t last = get_last_committed();
301
302 if (last - get_first_committed() > max && floor < last - max)
303 return last - max;
304 return floor;
305}
306
7c673cae
FG
307bool MDSMonitor::preprocess_query(MonOpRequestRef op)
308{
309 op->mark_mdsmon_event(__func__);
9f95a23c 310 auto m = op->get_req<PaxosServiceMessage>();
11fdf7f2
TL
311 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source()
312 << " " << m->get_orig_source_addrs() << dendl;
7c673cae
FG
313
314 switch (m->get_type()) {
315
316 case MSG_MDS_BEACON:
317 return preprocess_beacon(op);
318
319 case MSG_MON_COMMAND:
f64942e4
AA
320 try {
321 return preprocess_command(op);
11fdf7f2 322 } catch (const bad_cmd_get& e) {
f64942e4 323 bufferlist bl;
f67539c2 324 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
f64942e4
AA
325 return true;
326 }
7c673cae
FG
327
328 case MSG_MDS_OFFLOAD_TARGETS:
329 return preprocess_offload_targets(op);
330
331 default:
332 ceph_abort();
333 return true;
334 }
335}
336
337void MDSMonitor::_note_beacon(MMDSBeacon *m)
338{
339 mds_gid_t gid = mds_gid_t(m->get_global_id());
340 version_t seq = m->get_seq();
341
91327a77 342 dout(5) << "_note_beacon " << *m << " noting time" << dendl;
1adf2230
AA
343 auto &beacon = last_beacon[gid];
344 beacon.stamp = mono_clock::now();
345 beacon.seq = seq;
7c673cae
FG
346}
347
348bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
349{
350 op->mark_mdsmon_event(__func__);
9f95a23c 351 auto m = op->get_req<MMDSBeacon>();
7c673cae
FG
352 MDSMap::DaemonState state = m->get_state();
353 mds_gid_t gid = m->get_global_id();
354 version_t seq = m->get_seq();
355 MDSMap::mds_info_t info;
356 epoch_t effective_epoch = 0;
357
1adf2230 358 const auto &fsmap = get_fsmap();
28e407b8 359
7c673cae 360 // check privileges, ignore if fails
11fdf7f2
TL
361 MonSession *session = op->get_session();
362 if (!session)
363 goto ignore;
7c673cae
FG
364 if (!session->is_capable("mds", MON_CAP_X)) {
365 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
366 << session->caps << dendl;
367 goto ignore;
368 }
369
f67539c2
TL
370 if (m->get_fsid() != mon.monmap->fsid) {
371 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon.monmap->fsid << dendl;
7c673cae
FG
372 goto ignore;
373 }
374
91327a77 375 dout(5) << "preprocess_beacon " << *m
11fdf7f2
TL
376 << " from " << m->get_orig_source()
377 << " " << m->get_orig_source_addrs()
7c673cae
FG
378 << " " << m->get_compat()
379 << dendl;
380
381 // make sure the address has a port
382 if (m->get_orig_source_addr().get_port() == 0) {
383 dout(1) << " ignoring boot message without a port" << dendl;
384 goto ignore;
385 }
386
387 // check compat
388 if (!m->get_compat().writeable(fsmap.compat)) {
11fdf7f2
TL
389 dout(1) << " mds " << m->get_orig_source()
390 << " " << m->get_orig_source_addrs()
391 << " can't write to fsmap " << fsmap.compat << dendl;
7c673cae
FG
392 goto ignore;
393 }
394
395 // fw to leader?
28e407b8 396 if (!is_leader())
7c673cae
FG
397 return false;
398
399 // booted, but not in map?
28e407b8 400 if (!fsmap.gid_exists(gid)) {
7c673cae
FG
401 if (state != MDSMap::STATE_BOOT) {
402 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
403 << ceph_mds_state_name(state) << ")" << dendl;
404
1adf2230
AA
405 /* We can't send an MDSMap this MDS was a part of because we no longer
406 * know which FS it was part of. Nor does this matter. Sending an empty
407 * MDSMap is sufficient for getting the MDS to respawn.
408 */
7c673cae
FG
409 MDSMap null_map;
410 null_map.epoch = fsmap.epoch;
411 null_map.compat = fsmap.compat;
f67539c2
TL
412 auto m = make_message<MMDSMap>(mon.monmap->fsid, null_map);
413 mon.send_reply(op, m.detach());
7c673cae
FG
414 return true;
415 } else {
416 return false; // not booted yet.
417 }
418 }
419 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
28e407b8 420 info = fsmap.get_info_gid(gid);
7c673cae 421
f91f0fd5
TL
422 if (state == MDSMap::STATE_DNE) {
423 return false;
424 }
425
7c673cae
FG
426 // old seq?
427 if (info.state_seq > seq) {
428 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
429 goto ignore;
430 }
431
432 // Work out the latest epoch that this daemon should have seen
433 {
28e407b8 434 fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
7c673cae 435 if (fscid == FS_CLUSTER_ID_NONE) {
28e407b8 436 effective_epoch = fsmap.standby_epochs.at(gid);
7c673cae 437 } else {
28e407b8 438 effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
7c673cae
FG
439 }
440 if (effective_epoch != m->get_last_epoch_seen()) {
441 dout(10) << "mds_beacon " << *m
442 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
443 goto reply;
444 }
445 }
446
447 if (info.laggy()) {
448 _note_beacon(m);
449 return false; // no longer laggy, need to update map.
450 }
451 if (state == MDSMap::STATE_BOOT) {
452 // ignore, already booted.
453 goto ignore;
454 }
9f95a23c
TL
455
456 // did the join_fscid change
457 if (m->get_fs().size()) {
458 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
459 auto f = fsmap.get_filesystem(m->get_fs());
460 if (f) {
461 fscid = f->fscid;
462 }
463 if (info.join_fscid != fscid) {
464 dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
465 << " (" << m->get_fs() << ")" << dendl;
466 _note_beacon(m);
467 return false;
468 }
469 } else {
470 if (info.join_fscid != FS_CLUSTER_ID_NONE) {
471 dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
472 _note_beacon(m);
473 return false;
474 }
475 }
476
7c673cae
FG
477 // is there a state change here?
478 if (info.state != state) {
479 // legal state change?
480 if ((info.state == MDSMap::STATE_STANDBY ||
481 info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
482 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
483 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
484 goto reply;
485 }
486
487 if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
488 && info.rank != MDS_RANK_NONE)
489 {
490 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
491 "held rank " << info.rank << " while requesting state "
492 << ceph_mds_state_name(state) << dendl;
493 goto reply;
494 }
495
496 _note_beacon(m);
497 return false;
498 }
499
500 // Comparing known daemon health with m->get_health()
501 // and return false (i.e. require proposal) if they
502 // do not match, to update our stored
503 if (!(pending_daemon_health[gid] == m->get_health())) {
91327a77 504 dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
7c673cae
FG
505 _note_beacon(m);
506 return false;
507 }
508
509 reply:
510 // note time and reply
11fdf7f2 511 ceph_assert(effective_epoch > 0);
7c673cae 512 _note_beacon(m);
11fdf7f2 513 {
f67539c2 514 auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
11fdf7f2
TL
515 m->get_global_id(), m->get_name(), effective_epoch,
516 state, seq, CEPH_FEATURES_SUPPORTED_DEFAULT);
f67539c2 517 mon.send_reply(op, beacon.detach());
11fdf7f2 518 }
7c673cae
FG
519 return true;
520
521 ignore:
522 // I won't reply this beacon, drop it.
f67539c2 523 mon.no_reply(op);
7c673cae
FG
524 return true;
525}
526
527bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
528{
529 op->mark_mdsmon_event(__func__);
9f95a23c 530 auto m = op->get_req<MMDSLoadTargets>();
7c673cae 531 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
28e407b8 532
1adf2230 533 const auto &fsmap = get_fsmap();
7c673cae
FG
534
535 // check privileges, ignore message if fails
11fdf7f2 536 MonSession *session = op->get_session();
7c673cae 537 if (!session)
1adf2230 538 goto ignore;
7c673cae
FG
539 if (!session->is_capable("mds", MON_CAP_X)) {
540 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
541 << session->caps << dendl;
1adf2230 542 goto ignore;
7c673cae
FG
543 }
544
545 if (fsmap.gid_exists(m->global_id) &&
546 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
1adf2230 547 goto ignore;
7c673cae
FG
548
549 return false;
550
1adf2230 551 ignore:
f67539c2 552 mon.no_reply(op);
7c673cae
FG
553 return true;
554}
555
556
557bool MDSMonitor::prepare_update(MonOpRequestRef op)
558{
559 op->mark_mdsmon_event(__func__);
9f95a23c 560 auto m = op->get_req<PaxosServiceMessage>();
7c673cae
FG
561 dout(7) << "prepare_update " << *m << dendl;
562
563 switch (m->get_type()) {
564
565 case MSG_MDS_BEACON:
566 return prepare_beacon(op);
567
568 case MSG_MON_COMMAND:
f64942e4
AA
569 try {
570 return prepare_command(op);
11fdf7f2 571 } catch (const bad_cmd_get& e) {
f64942e4 572 bufferlist bl;
f67539c2 573 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
f64942e4
AA
574 return true;
575 }
7c673cae
FG
576
577 case MSG_MDS_OFFLOAD_TARGETS:
578 return prepare_offload_targets(op);
579
580 default:
581 ceph_abort();
582 }
583
584 return true;
585}
586
587bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
588{
589 op->mark_mdsmon_event(__func__);
9f95a23c 590 auto m = op->get_req<MMDSBeacon>();
7c673cae 591 // -- this is an update --
11fdf7f2
TL
592 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source()
593 << " " << m->get_orig_source_addrs() << dendl;
594 entity_addrvec_t addrs = m->get_orig_source_addrs();
7c673cae
FG
595 mds_gid_t gid = m->get_global_id();
596 MDSMap::DaemonState state = m->get_state();
597 version_t seq = m->get_seq();
598
28e407b8
AA
599 auto &pending = get_pending_fsmap_writeable();
600
91327a77 601 dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
7c673cae
FG
602
603 // Calculate deltas of health metrics created and removed
604 // Do this by type rather than MDSHealthMetric equality, because messages can
605 // change a lot when they include e.g. a number of items.
606 const auto &old_health = pending_daemon_health[gid].metrics;
607 const auto &new_health = m->get_health().metrics;
608
609 std::set<mds_metric_t> old_types;
610 for (const auto &i : old_health) {
611 old_types.insert(i.type);
612 }
613
614 std::set<mds_metric_t> new_types;
615 for (const auto &i : new_health) {
616 new_types.insert(i.type);
617 }
618
619 for (const auto &new_metric: new_health) {
620 if (old_types.count(new_metric.type) == 0) {
11fdf7f2 621 dout(10) << "MDS health message (" << m->get_orig_source()
28e407b8 622 << "): " << new_metric.sev << " " << new_metric.message << dendl;
7c673cae
FG
623 }
624 }
625
626 // Log the disappearance of health messages at INFO
627 for (const auto &old_metric : old_health) {
628 if (new_types.count(old_metric.type) == 0) {
f67539c2 629 mon.clog->info() << "MDS health message cleared ("
11fdf7f2 630 << m->get_orig_source() << "): " << old_metric.message;
7c673cae
FG
631 }
632 }
633
634 // Store health
635 pending_daemon_health[gid] = m->get_health();
636
637 // boot?
638 if (state == MDSMap::STATE_BOOT) {
639 // zap previous instance of this name?
11fdf7f2 640 if (g_conf()->mds_enforce_unique_name) {
7c673cae 641 bool failed_mds = false;
28e407b8 642 while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
f67539c2
TL
643 if (!mon.osdmon()->is_writeable()) {
644 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
7c673cae
FG
645 return false;
646 }
d2e6a577 647 const MDSMap::mds_info_t &existing_info =
28e407b8 648 pending.get_info_gid(existing);
f67539c2 649 mon.clog->info() << existing_info.human_name() << " restarted";
1adf2230 650 fail_mds_gid(pending, existing);
7c673cae
FG
651 failed_mds = true;
652 }
653 if (failed_mds) {
f67539c2
TL
654 ceph_assert(mon.osdmon()->is_writeable());
655 request_proposal(mon.osdmon());
7c673cae
FG
656 }
657 }
658
659 // Add this daemon to the map
28e407b8 660 if (pending.mds_roles.count(gid) == 0) {
7c673cae
FG
661 MDSMap::mds_info_t new_info;
662 new_info.global_id = gid;
663 new_info.name = m->get_name();
11fdf7f2 664 new_info.addrs = addrs;
7c673cae
FG
665 new_info.mds_features = m->get_mds_features();
666 new_info.state = MDSMap::STATE_STANDBY;
667 new_info.state_seq = seq;
28e407b8 668 pending.insert(new_info);
9f95a23c
TL
669 if (m->get_fs().size()) {
670 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
671 auto f = pending.get_filesystem(m->get_fs());
672 if (f) {
673 fscid = f->fscid;
674 }
675 new_info.join_fscid = fscid;
676 }
7c673cae
FG
677 }
678
7c673cae 679 // initialize the beacon timer
1adf2230
AA
680 auto &beacon = last_beacon[gid];
681 beacon.stamp = mono_clock::now();
682 beacon.seq = seq;
7c673cae
FG
683
684 // new incompat?
28e407b8
AA
685 if (!pending.compat.writeable(m->get_compat())) {
686 dout(10) << " fsmap " << pending.compat
7c673cae
FG
687 << " can't write to new mds' " << m->get_compat()
688 << ", updating fsmap and killing old mds's"
689 << dendl;
28e407b8 690 pending.update_compat(m->get_compat());
7c673cae
FG
691 }
692
693 update_metadata(m->get_global_id(), m->get_sys_info());
694 } else {
695 // state update
91327a77
AA
696
697 if (!pending.gid_exists(gid)) {
698 /* gid has been removed from pending, send null map */
699 dout(5) << "mds_beacon " << *m << " is not in fsmap (state "
700 << ceph_mds_state_name(state) << ")" << dendl;
701
702 /* We can't send an MDSMap this MDS was a part of because we no longer
703 * know which FS it was part of. Nor does this matter. Sending an empty
704 * MDSMap is sufficient for getting the MDS to respawn.
705 */
9f95a23c 706 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
91327a77
AA
707 if (r >= 0) {
708 const auto& fsmap = get_fsmap();
709 MDSMap null_map;
710 null_map.epoch = fsmap.epoch;
711 null_map.compat = fsmap.compat;
f67539c2
TL
712 auto m = make_message<MMDSMap>(mon.monmap->fsid, null_map);
713 mon.send_reply(op, m.detach());
91327a77
AA
714 } else {
715 dispatch(op); // try again
716 }
717 }));
718 return true;
719 }
720
11fdf7f2 721 const auto& info = pending.get_info_gid(gid);
f64942e4
AA
722 if (info.state == MDSMap::STATE_STOPPING &&
723 state != MDSMap::STATE_STOPPING &&
724 state != MDSMap::STATE_STOPPED) {
7c673cae
FG
725 // we can't transition to any other states from STOPPING
726 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
727 << dendl;
728 _note_beacon(m);
729 return true;
730 }
731
732 if (info.laggy()) {
11fdf7f2
TL
733 dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl;
734 pending.modify_daemon(info.global_id, [](auto& info)
7c673cae 735 {
11fdf7f2 736 info.clear_laggy();
7c673cae
FG
737 }
738 );
739 }
9f95a23c 740
91327a77 741 dout(5) << "prepare_beacon mds." << info.rank
7c673cae
FG
742 << " " << ceph_mds_state_name(info.state)
743 << " -> " << ceph_mds_state_name(state)
7c673cae 744 << dendl;
9f95a23c
TL
745
746 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
747 if (m->get_fs().size()) {
748 auto f = pending.get_filesystem(m->get_fs());
749 if (f) {
750 fscid = f->fscid;
751 }
752 }
753 pending.modify_daemon(gid, [fscid](auto& info) {
754 info.join_fscid = fscid;
755 });
756
7c673cae 757 if (state == MDSMap::STATE_STOPPED) {
28e407b8
AA
758 const auto fscid = pending.mds_roles.at(gid);
759 const auto &fs = pending.get_filesystem(fscid);
181888fb 760
f67539c2 761 mon.clog->info() << info.human_name() << " finished "
11fdf7f2 762 << "stopping rank " << info.rank << " in filesystem "
d2e6a577 763 << fs->mds_map.fs_name << " (now has "
181888fb 764 << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
d2e6a577 765
28e407b8 766 auto erased = pending.stop(gid);
7c673cae
FG
767 erased.push_back(gid);
768
9f95a23c 769 for (const auto& erased_gid : erased) {
7c673cae
FG
770 last_beacon.erase(erased_gid);
771 if (pending_daemon_health.count(erased_gid)) {
772 pending_daemon_health.erase(erased_gid);
773 pending_daemon_health_rm.insert(erased_gid);
774 }
775 }
d2e6a577
FG
776
777
7c673cae 778 } else if (state == MDSMap::STATE_DAMAGED) {
f67539c2 779 if (!mon.osdmon()->is_writeable()) {
91327a77 780 dout(1) << __func__ << ": DAMAGED from rank " << info.rank
f67539c2
TL
781 << " waiting for osdmon writeable to blocklist it" << dendl;
782 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
7c673cae
FG
783 return false;
784 }
785
786 // Record this MDS rank as damaged, so that other daemons
787 // won't try to run it.
91327a77 788 dout(0) << __func__ << ": marking rank "
7c673cae
FG
789 << info.rank << " damaged" << dendl;
790
791 utime_t until = ceph_clock_now();
f67539c2
TL
792 until += g_conf().get_val<double>("mon_mds_blocklist_interval");
793 const auto blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until);
794 request_proposal(mon.osdmon());
795 pending.damaged(gid, blocklist_epoch);
7c673cae
FG
796 last_beacon.erase(gid);
797
798 // Respond to MDS, so that it knows it can continue to shut down
9f95a23c 799 auto beacon = make_message<MMDSBeacon>(
f67539c2 800 mon.monmap->fsid, m->get_global_id(),
28e407b8 801 m->get_name(), pending.get_epoch(), state, seq,
11fdf7f2 802 CEPH_FEATURES_SUPPORTED_DEFAULT);
f67539c2 803 mon.send_reply(op, beacon.detach());
7c673cae 804 } else if (state == MDSMap::STATE_DNE) {
f67539c2 805 if (!mon.osdmon()->is_writeable()) {
91327a77 806 dout(1) << __func__ << ": DNE from rank " << info.rank
f67539c2
TL
807 << " waiting for osdmon writeable to blocklist it" << dendl;
808 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
7c673cae
FG
809 return false;
810 }
811
1adf2230 812 fail_mds_gid(pending, gid);
f67539c2
TL
813 ceph_assert(mon.osdmon()->is_writeable());
814 request_proposal(mon.osdmon());
7c673cae
FG
815
816 // Respond to MDS, so that it knows it can continue to shut down
f67539c2 817 auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
11fdf7f2
TL
818 m->get_global_id(), m->get_name(), pending.get_epoch(), state, seq,
819 CEPH_FEATURES_SUPPORTED_DEFAULT);
f67539c2 820 mon.send_reply(op, beacon.detach());
7c673cae
FG
821 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
822 // Standby daemons should never modify their own
823 // state. Reject any attempts to do so.
824 derr << "standby " << gid << " attempted to change state to "
825 << ceph_mds_state_name(state) << ", rejecting" << dendl;
826 return true;
827 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
828 !MDSMap::state_transition_valid(info.state, state)) {
829 // Validate state transitions for daemons that hold a rank
830 derr << "daemon " << gid << " (rank " << info.rank << ") "
831 << "reported invalid state transition "
832 << ceph_mds_state_name(info.state) << " -> "
833 << ceph_mds_state_name(state) << dendl;
834 return true;
835 } else {
b32b8144 836 if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
28e407b8
AA
837 const auto &fscid = pending.mds_roles.at(gid);
838 const auto &fs = pending.get_filesystem(fscid);
f67539c2 839 mon.clog->info() << info.human_name() << " is now active in "
d2e6a577
FG
840 << "filesystem " << fs->mds_map.fs_name << " as rank "
841 << info.rank;
842 }
b32b8144
FG
843
844 // Made it through special cases and validations, record the
845 // daemon's reported state to the FSMap.
11fdf7f2
TL
846 pending.modify_daemon(gid, [state, seq](auto& info) {
847 info.state = state;
848 info.state_seq = seq;
b32b8144 849 });
7c673cae
FG
850 }
851 }
852
91327a77 853 dout(5) << "prepare_beacon pending map now:" << dendl;
28e407b8 854 print_map(pending);
7c673cae 855
9f95a23c 856 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
7c673cae
FG
857 if (r >= 0)
858 _updated(op); // success
859 else if (r == -ECANCELED) {
f67539c2 860 mon.no_reply(op);
7c673cae
FG
861 } else {
862 dispatch(op); // try again
863 }
864 }));
865
866 return true;
867}
868
869bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
870{
28e407b8
AA
871 auto &pending = get_pending_fsmap_writeable();
872
7c673cae 873 op->mark_mdsmon_event(__func__);
9f95a23c 874 auto m = op->get_req<MMDSLoadTargets>();
7c673cae 875 mds_gid_t gid = m->global_id;
28e407b8 876 if (pending.gid_has_rank(gid)) {
7c673cae 877 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
28e407b8 878 pending.update_export_targets(gid, m->targets);
7c673cae
FG
879 } else {
880 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
881 }
f67539c2 882 mon.no_reply(op);
7c673cae
FG
883 return true;
884}
885
886bool MDSMonitor::should_propose(double& delay)
887{
888 // delegate to PaxosService to assess whether we should propose
889 return PaxosService::should_propose(delay);
890}
891
892void MDSMonitor::_updated(MonOpRequestRef op)
893{
28e407b8 894 const auto &fsmap = get_fsmap();
7c673cae 895 op->mark_mdsmon_event(__func__);
9f95a23c 896 auto m = op->get_req<MMDSBeacon>();
7c673cae 897 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
f67539c2 898 mon.clog->debug() << m->get_orig_source() << " "
11fdf7f2
TL
899 << m->get_orig_source_addrs() << " "
900 << ceph_mds_state_name(m->get_state());
7c673cae
FG
901
902 if (m->get_state() == MDSMap::STATE_STOPPED) {
903 // send the map manually (they're out of the map, so they won't get it automatic)
904 MDSMap null_map;
905 null_map.epoch = fsmap.epoch;
906 null_map.compat = fsmap.compat;
f67539c2
TL
907 auto m = make_message<MMDSMap>(mon.monmap->fsid, null_map);
908 mon.send_reply(op, m.detach());
7c673cae 909 } else {
f67539c2 910 auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
11fdf7f2
TL
911 m->get_global_id(), m->get_name(), fsmap.get_epoch(),
912 m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT);
f67539c2 913 mon.send_reply(op, beacon.detach());
7c673cae
FG
914 }
915}
916
917void MDSMonitor::on_active()
918{
919 tick();
7c673cae 920
28e407b8 921 if (is_leader()) {
f67539c2 922 mon.clog->debug() << "fsmap " << get_fsmap();
224ce89b 923 }
7c673cae
FG
924}
925
7c673cae
FG
926void MDSMonitor::dump_info(Formatter *f)
927{
928 f->open_object_section("fsmap");
28e407b8 929 get_fsmap().dump(f);
7c673cae
FG
930 f->close_section();
931
932 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
933 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
934}
935
936bool MDSMonitor::preprocess_command(MonOpRequestRef op)
937{
938 op->mark_mdsmon_event(__func__);
9f95a23c 939 auto m = op->get_req<MMonCommand>();
7c673cae
FG
940 int r = -1;
941 bufferlist rdata;
942 stringstream ss, ds;
943
11fdf7f2 944 cmdmap_t cmdmap;
7c673cae
FG
945 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
946 // ss has reason for failure
947 string rs = ss.str();
f67539c2 948 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
7c673cae
FG
949 return true;
950 }
951
952 string prefix;
9f95a23c 953 cmd_getval(cmdmap, "prefix", prefix);
7c673cae 954 string format;
9f95a23c 955 cmd_getval(cmdmap, "format", format, string("plain"));
1adf2230 956 std::unique_ptr<Formatter> f(Formatter::create(format));
7c673cae 957
11fdf7f2 958 MonSession *session = op->get_session();
7c673cae 959 if (!session) {
f67539c2 960 mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
7c673cae
FG
961 return true;
962 }
963
f67539c2
TL
964 // to use const qualifier filter fsmap beforehand
965 FSMap _fsmap_copy = get_fsmap();
966 _fsmap_copy.filter(session->get_allowed_fs_names());
967 const auto& fsmap = _fsmap_copy;
968
7c673cae
FG
969 if (prefix == "mds stat") {
970 if (f) {
971 f->open_object_section("mds_stat");
972 dump_info(f.get());
973 f->close_section();
974 f->flush(ds);
975 } else {
976 ds << fsmap;
977 }
978 r = 0;
11fdf7f2
TL
979 } else if (prefix == "mds ok-to-stop") {
980 vector<string> ids;
9f95a23c 981 if (!cmd_getval(cmdmap, "ids", ids)) {
11fdf7f2
TL
982 r = -EINVAL;
983 ss << "must specify mds id";
984 goto out;
985 }
986 if (fsmap.is_any_degraded()) {
987 ss << "one or more filesystems is currently degraded";
988 r = -EBUSY;
989 goto out;
990 }
991 set<mds_gid_t> stopping;
992 for (auto& id : ids) {
993 ostringstream ess;
994 mds_gid_t gid = gid_from_arg(fsmap, id, ess);
995 if (gid == MDS_GID_NONE) {
996 // the mds doesn't exist, but no file systems are unhappy, so losing it
997 // can't have any effect.
998 continue;
999 }
1000 stopping.insert(gid);
1001 }
1002 set<mds_gid_t> active;
1003 set<mds_gid_t> standby;
1004 for (auto gid : stopping) {
1005 if (fsmap.gid_has_rank(gid)) {
1006 // ignore standby-replay daemons (at this level)
1007 if (!fsmap.is_standby_replay(gid)) {
1008 auto standby = fsmap.get_standby_replay(gid);
1009 if (standby == MDS_GID_NONE ||
1010 stopping.count(standby)) {
1011 // no standby-replay, or we're also stopping the standby-replay
1012 // for this mds
1013 active.insert(gid);
1014 }
1015 }
7c673cae 1016 } else {
11fdf7f2
TL
1017 // net loss of a standby
1018 standby.insert(gid);
7c673cae
FG
1019 }
1020 }
11fdf7f2
TL
1021 if (fsmap.get_num_standby() - standby.size() < active.size()) {
1022 r = -EBUSY;
1023 ss << "insufficent standby MDS daemons to stop active gids "
1024 << stringify(active)
1025 << " and/or standby gids " << stringify(standby);;
1026 goto out;
28e407b8 1027 }
11fdf7f2
TL
1028 r = 0;
1029 ss << "should be safe to stop " << ids;
7c673cae
FG
1030 } else if (prefix == "fs dump") {
1031 int64_t epocharg;
1032 epoch_t epoch;
1033
1adf2230 1034 const FSMap *fsmapp = &fsmap;
28e407b8 1035 FSMap dummy;
9f95a23c 1036 if (cmd_getval(cmdmap, "epoch", epocharg)) {
7c673cae
FG
1037 epoch = epocharg;
1038 bufferlist b;
1039 int err = get_version(epoch, b);
1040 if (err == -ENOENT) {
7c673cae 1041 r = -ENOENT;
28e407b8 1042 goto out;
7c673cae 1043 } else {
11fdf7f2
TL
1044 ceph_assert(err == 0);
1045 ceph_assert(b.length());
28e407b8
AA
1046 dummy.decode(b);
1047 fsmapp = &dummy;
7c673cae
FG
1048 }
1049 }
c07f9fc5 1050
28e407b8
AA
1051 stringstream ds;
1052 if (f != NULL) {
1053 f->open_object_section("fsmap");
1054 fsmapp->dump(f.get());
1055 f->close_section();
1056 f->flush(ds);
1057 r = 0;
1058 } else {
1059 fsmapp->print(ds);
1060 r = 0;
7c673cae 1061 }
28e407b8
AA
1062
1063 rdata.append(ds);
1064 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
7c673cae
FG
1065 } else if (prefix == "mds metadata") {
1066 if (!f)
1067 f.reset(Formatter::create("json-pretty"));
1068
1069 string who;
9f95a23c 1070 bool all = !cmd_getval(cmdmap, "who", who);
7c673cae
FG
1071 dout(1) << "all = " << all << dendl;
1072 if (all) {
1073 r = 0;
1074 // Dump all MDSs' metadata
1075 const auto all_info = fsmap.get_mds_info();
1076
1077 f->open_array_section("mds_metadata");
1078 for(const auto &i : all_info) {
1079 const auto &info = i.second;
1080
1081 f->open_object_section("mds");
1082 f->dump_string("name", info.name);
1083 std::ostringstream get_err;
1adf2230 1084 r = dump_metadata(fsmap, info.name, f.get(), get_err);
7c673cae
FG
1085 if (r == -EINVAL || r == -ENOENT) {
1086 // Drop error, list what metadata we do have
1087 dout(1) << get_err.str() << dendl;
1088 r = 0;
1089 } else if (r != 0) {
1090 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1091 << dendl;
1092 ss << get_err.str();
c07f9fc5 1093 f->close_section();
7c673cae
FG
1094 break;
1095 }
1096 f->close_section();
1097 }
1098 f->close_section();
1099 } else {
1100 // Dump a single daemon's metadata
1101 f->open_object_section("mds_metadata");
1adf2230 1102 r = dump_metadata(fsmap, who, f.get(), ss);
7c673cae
FG
1103 f->close_section();
1104 }
1105 f->flush(ds);
31f18b77
FG
1106 } else if (prefix == "mds versions") {
1107 if (!f)
1108 f.reset(Formatter::create("json-pretty"));
1109 count_metadata("ceph_version", f.get());
1110 f->flush(ds);
1111 r = 0;
1112 } else if (prefix == "mds count-metadata") {
1113 if (!f)
1114 f.reset(Formatter::create("json-pretty"));
1115 string field;
9f95a23c 1116 cmd_getval(cmdmap, "property", field);
31f18b77
FG
1117 count_metadata(field, f.get());
1118 f->flush(ds);
1119 r = 0;
7c673cae
FG
1120 } else if (prefix == "mds compat show") {
1121 if (f) {
1122 f->open_object_section("mds_compat");
1123 fsmap.compat.dump(f.get());
1124 f->close_section();
1125 f->flush(ds);
1126 } else {
1127 ds << fsmap.compat;
1128 }
1129 r = 0;
1130 } else if (prefix == "fs get") {
1131 string fs_name;
9f95a23c 1132 cmd_getval(cmdmap, "fs_name", fs_name);
28e407b8 1133 const auto &fs = fsmap.get_filesystem(fs_name);
7c673cae
FG
1134 if (fs == nullptr) {
1135 ss << "filesystem '" << fs_name << "' not found";
1136 r = -ENOENT;
1137 } else {
1138 if (f != nullptr) {
1139 f->open_object_section("filesystem");
1140 fs->dump(f.get());
1141 f->close_section();
1142 f->flush(ds);
1143 r = 0;
1144 } else {
1145 fs->print(ds);
1146 r = 0;
1147 }
1148 }
1149 } else if (prefix == "fs ls") {
1150 if (f) {
1151 f->open_array_section("filesystems");
1adf2230
AA
1152 for (const auto &p : fsmap.filesystems) {
1153 const auto &fs = p.second;
1154 f->open_object_section("filesystem");
1155 {
1156 const MDSMap &mds_map = fs->mds_map;
1157 f->dump_string("name", mds_map.fs_name);
1158 /* Output both the names and IDs of pools, for use by
1159 * humans and machines respectively */
f67539c2 1160 f->dump_string("metadata_pool", mon.osdmon()->osdmap.get_pool_name(
1adf2230
AA
1161 mds_map.metadata_pool));
1162 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1163 f->open_array_section("data_pool_ids");
1164 for (const auto &id : mds_map.data_pools) {
1165 f->dump_int("data_pool_id", id);
1166 }
1167 f->close_section();
7c673cae 1168
1adf2230
AA
1169 f->open_array_section("data_pools");
1170 for (const auto &id : mds_map.data_pools) {
f67539c2 1171 const auto &name = mon.osdmon()->osdmap.get_pool_name(id);
1adf2230 1172 f->dump_string("data_pool", name);
7c673cae
FG
1173 }
1174 f->close_section();
1175 }
1adf2230 1176 f->close_section();
7c673cae
FG
1177 }
1178 f->close_section();
1179 f->flush(ds);
1180 } else {
28e407b8
AA
1181 for (const auto &p : fsmap.filesystems) {
1182 const auto &fs = p.second;
7c673cae 1183 const MDSMap &mds_map = fs->mds_map;
f67539c2 1184 const string &md_pool_name = mon.osdmon()->osdmap.get_pool_name(
7c673cae
FG
1185 mds_map.metadata_pool);
1186
1187 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1188 << md_pool_name << ", data pools: [";
1adf2230 1189 for (const auto &id : mds_map.data_pools) {
f67539c2 1190 const string &pool_name = mon.osdmon()->osdmap.get_pool_name(id);
7c673cae
FG
1191 ds << pool_name << " ";
1192 }
1193 ds << "]" << std::endl;
1194 }
1195
1196 if (fsmap.filesystems.empty()) {
1197 ds << "No filesystems enabled" << std::endl;
1198 }
1199 }
1200 r = 0;
f67539c2
TL
1201 } else if (prefix == "fs feature ls") {
1202 if (f) {
1203 f->open_array_section("cephfs_features");
1204 for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
1205 f->open_object_section("feature");
1206 f->dump_int("index", i);
1207 f->dump_string("name", cephfs_feature_name(i));
1208 f->close_section();
1209 }
1210 f->close_section();
1211 f->flush(ds);
1212 } else {
1213 for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
1214 ds << i << " " << cephfs_feature_name(i) << std::endl;
1215 }
1216 }
1217 r = 0;
7c673cae
FG
1218 }
1219
28e407b8 1220out:
7c673cae
FG
1221 if (r != -1) {
1222 rdata.append(ds);
1223 string rs;
1224 getline(ss, rs);
f67539c2 1225 mon.reply_command(op, r, rs, rdata, get_last_committed());
7c673cae
FG
1226 return true;
1227 } else
1228 return false;
1229}
1230
1adf2230 1231bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
7c673cae 1232{
9f95a23c 1233 const auto& info = fsmap.get_info_gid(gid);
91327a77 1234 dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
7c673cae 1235
f67539c2 1236 ceph_assert(mon.osdmon()->is_writeable());
a8e16298 1237
f67539c2 1238 epoch_t blocklist_epoch = 0;
7c673cae
FG
1239 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1240 utime_t until = ceph_clock_now();
f67539c2
TL
1241 until += g_conf().get_val<double>("mon_mds_blocklist_interval");
1242 blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until);
7c673cae
FG
1243 }
1244
f67539c2 1245 fsmap.erase(gid, blocklist_epoch);
7c673cae
FG
1246 last_beacon.erase(gid);
1247 if (pending_daemon_health.count(gid)) {
1248 pending_daemon_health.erase(gid);
1249 pending_daemon_health_rm.insert(gid);
1250 }
1251
f67539c2 1252 return blocklist_epoch != 0;
7c673cae
FG
1253}
1254
1adf2230 1255mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
7c673cae
FG
1256{
1257 // Try parsing as a role
1258 mds_role_t role;
1259 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1adf2230 1260 int r = fsmap.parse_role(arg, &role, ignore_err);
7c673cae
FG
1261 if (r == 0) {
1262 // See if a GID is assigned to this role
28e407b8 1263 const auto &fs = fsmap.get_filesystem(role.fscid);
11fdf7f2 1264 ceph_assert(fs != nullptr); // parse_role ensures it exists
7c673cae
FG
1265 if (fs->mds_map.is_up(role.rank)) {
1266 dout(10) << __func__ << ": validated rank/GID " << role
1267 << " as a rank" << dendl;
1268 return fs->mds_map.get_mds_info(role.rank).global_id;
1269 }
1270 }
1271
1272 // Try parsing as a gid
1273 std::string err;
1274 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1275 if (!err.empty()) {
1276 // Not a role or a GID, try as a daemon name
28e407b8 1277 const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
7c673cae
FG
1278 if (!mds_info) {
1279 ss << "MDS named '" << arg
1280 << "' does not exist, or is not up";
1281 return MDS_GID_NONE;
1282 }
1283 dout(10) << __func__ << ": resolved MDS name '" << arg
1284 << "' to GID " << mds_info->global_id << dendl;
1285 return mds_info->global_id;
1286 } else {
1287 // Not a role, but parses as a an integer, might be a GID
1288 dout(10) << __func__ << ": treating MDS reference '" << arg
1289 << "' as an integer " << maybe_gid << dendl;
31f18b77 1290
28e407b8 1291 if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
31f18b77 1292 return mds_gid_t(maybe_gid);
7c673cae
FG
1293 }
1294 }
1295
1296 dout(1) << __func__ << ": rank/GID " << arg
1297 << " not a existent rank or GID" << dendl;
1298 return MDS_GID_NONE;
1299}
1300
1adf2230
AA
1301int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
1302 const std::string &arg, MDSMap::mds_info_t *failed_info)
7c673cae 1303{
11fdf7f2 1304 ceph_assert(failed_info != nullptr);
d2e6a577 1305
1adf2230 1306 mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
7c673cae
FG
1307 if (gid == MDS_GID_NONE) {
1308 return 0;
1309 }
f67539c2 1310 if (!mon.osdmon()->is_writeable()) {
7c673cae
FG
1311 return -EAGAIN;
1312 }
d2e6a577
FG
1313
1314 // Take a copy of the info before removing the MDS from the map,
1315 // so that the caller knows which mds (if any) they ended up removing.
1adf2230 1316 *failed_info = fsmap.get_info_gid(gid);
d2e6a577 1317
1adf2230 1318 fail_mds_gid(fsmap, gid);
7c673cae 1319 ss << "failed mds gid " << gid;
f67539c2
TL
1320 ceph_assert(mon.osdmon()->is_writeable());
1321 request_proposal(mon.osdmon());
7c673cae
FG
1322 return 0;
1323}
1324
1325bool MDSMonitor::prepare_command(MonOpRequestRef op)
1326{
1327 op->mark_mdsmon_event(__func__);
9f95a23c 1328 auto m = op->get_req<MMonCommand>();
7c673cae
FG
1329 int r = -EINVAL;
1330 stringstream ss;
1331 bufferlist rdata;
1332
11fdf7f2 1333 cmdmap_t cmdmap;
7c673cae
FG
1334 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1335 string rs = ss.str();
f67539c2 1336 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
7c673cae
FG
1337 return true;
1338 }
1339
1340 string prefix;
9f95a23c 1341 cmd_getval(cmdmap, "prefix", prefix);
7c673cae
FG
1342
1343 /* Refuse access if message not associated with a valid session */
11fdf7f2 1344 MonSession *session = op->get_session();
7c673cae 1345 if (!session) {
f67539c2 1346 mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
7c673cae
FG
1347 return true;
1348 }
1349
28e407b8
AA
1350 auto &pending = get_pending_fsmap_writeable();
1351
c07f9fc5 1352 bool batched_propose = false;
28e407b8 1353 for (const auto &h : handlers) {
f67539c2
TL
1354 r = h->can_handle(prefix, op, pending, cmdmap, ss);
1355 if (r == 1) {
1356 ; // pass, since we got the right handler.
1357 } else if (r == 0) {
1358 continue;
1359 } else {
1360 goto out;
1361 }
c07f9fc5 1362
f67539c2
TL
1363 batched_propose = h->batched_propose();
1364 if (batched_propose) {
1365 paxos.plug();
1366 }
1367 r = h->handle(&mon, pending, op, cmdmap, ss);
1368 if (batched_propose) {
1369 paxos.unplug();
1370 }
1371
1372 if (r == -EAGAIN) {
1373 // message has been enqueued for retry; return.
1374 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1375 return false;
1376 } else {
1377 if (r == 0) {
1378 // On successful updates, print the updated map
1379 print_map(pending);
7c673cae 1380 }
f67539c2
TL
1381 // Successful or not, we're done: respond.
1382 goto out;
7c673cae
FG
1383 }
1384 }
1385
1adf2230 1386 r = filesystem_command(pending, op, prefix, cmdmap, ss);
7c673cae
FG
1387 if (r >= 0) {
1388 goto out;
1389 } else if (r == -EAGAIN) {
1390 // Do not reply, the message has been enqueued for retry
1391 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1392 return false;
1393 } else if (r != -ENOSYS) {
1394 goto out;
1395 }
1396
7c673cae
FG
1397 if (r == -ENOSYS && ss.str().empty()) {
1398 ss << "unrecognized command";
1399 }
1400
1401out:
1402 dout(4) << __func__ << " done, r=" << r << dendl;
1403 /* Compose response */
1404 string rs;
1405 getline(ss, rs);
1406
1407 if (r >= 0) {
1408 // success.. delay reply
1409 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1410 get_last_committed() + 1));
c07f9fc5
FG
1411 if (batched_propose) {
1412 force_immediate_propose();
1413 }
7c673cae
FG
1414 return true;
1415 } else {
1416 // reply immediately
f67539c2 1417 mon.reply_command(op, r, rs, rdata, get_last_committed());
7c673cae
FG
1418 return false;
1419 }
1420}
1421
7c673cae 1422int MDSMonitor::filesystem_command(
1adf2230 1423 FSMap &fsmap,
7c673cae
FG
1424 MonOpRequestRef op,
1425 std::string const &prefix,
11fdf7f2 1426 const cmdmap_t& cmdmap,
7c673cae
FG
1427 std::stringstream &ss)
1428{
1429 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1430 op->mark_mdsmon_event(__func__);
1431 int r = 0;
1432 string whostr;
9f95a23c 1433 cmd_getval(cmdmap, "role", whostr);
7c673cae 1434
11fdf7f2 1435 if (prefix == "mds set_state") {
7c673cae 1436 mds_gid_t gid;
9f95a23c 1437 if (!cmd_getval(cmdmap, "gid", gid)) {
7c673cae 1438 ss << "error parsing 'gid' value '"
11fdf7f2 1439 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
7c673cae
FG
1440 return -EINVAL;
1441 }
1442 MDSMap::DaemonState state;
9f95a23c 1443 if (!cmd_getval(cmdmap, "state", state)) {
7c673cae 1444 ss << "error parsing 'state' string value '"
11fdf7f2 1445 << cmd_vartype_stringify(cmdmap.at("state")) << "'";
7c673cae
FG
1446 return -EINVAL;
1447 }
f67539c2 1448 if (fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
11fdf7f2
TL
1449 fsmap.modify_daemon(gid, [state](auto& info) {
1450 info.state = state;
7c673cae
FG
1451 });
1452 ss << "set mds gid " << gid << " to state " << state << " "
1453 << ceph_mds_state_name(state);
1454 return 0;
1455 }
1456 } else if (prefix == "mds fail") {
1457 string who;
9f95a23c 1458 cmd_getval(cmdmap, "role_or_gid", who);
d2e6a577
FG
1459
1460 MDSMap::mds_info_t failed_info;
f67539c2
TL
1461 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1462 if (gid == MDS_GID_NONE) {
1463 ss << "MDS named '" << who << "' does not exist, is not up or you "
1464 << "lack the permission to see.";
1465 return 0;
1466 }
1467 if(!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
1468 ss << "MDS named '" << who << "' does not exist, is not up or you "
1469 << "lack the permission to see.";
1470 return -EINVAL;
1471 }
1472 string_view fs_name = fsmap.fs_name_from_gid(gid);
1473 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1474 ss << "Permission denied.";
1475 return -EPERM;
1476 }
1477
1adf2230 1478 r = fail_mds(fsmap, ss, who, &failed_info);
7c673cae 1479 if (r < 0 && r == -EAGAIN) {
f67539c2 1480 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
7c673cae 1481 return -EAGAIN; // don't propose yet; wait for message to be retried
d2e6a577
FG
1482 } else if (r == 0) {
1483 // Only log if we really did something (not when was already gone)
1484 if (failed_info.global_id != MDS_GID_NONE) {
f67539c2 1485 mon.clog->info() << failed_info.human_name() << " marked failed by "
d2e6a577
FG
1486 << op->get_session()->entity_name;
1487 }
7c673cae
FG
1488 }
1489 } else if (prefix == "mds rm") {
1490 mds_gid_t gid;
9f95a23c 1491 if (!cmd_getval(cmdmap, "gid", gid)) {
7c673cae 1492 ss << "error parsing 'gid' value '"
11fdf7f2 1493 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
7c673cae
FG
1494 return -EINVAL;
1495 }
f67539c2 1496 if (!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
11fdf7f2 1497 ss << "mds gid " << gid << " does not exist";
f67539c2
TL
1498 return 0;
1499 }
1500 string_view fs_name = fsmap.fs_name_from_gid(gid);
1501 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1502 ss << "Permission denied.";
1503 return -EPERM;
1504 }
1505 const auto &info = fsmap.get_info_gid(gid);
1506 MDSMap::DaemonState state = info.state;
1507 if (state > 0) {
1508 ss << "cannot remove active mds." << info.name
1509 << " rank " << info.rank;
1510 return -EBUSY;
7c673cae 1511 } else {
f67539c2
TL
1512 fsmap.erase(gid, {});
1513 ss << "removed mds gid " << gid;
1514 return 0;
7c673cae
FG
1515 }
1516 } else if (prefix == "mds rmfailed") {
11fdf7f2 1517 bool confirm = false;
9f95a23c 1518 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
11fdf7f2 1519 if (!confirm) {
7c673cae
FG
1520 ss << "WARNING: this can make your filesystem inaccessible! "
1521 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1522 return -EPERM;
1523 }
1524
1525 std::string role_str;
9f95a23c 1526 cmd_getval(cmdmap, "role", role_str);
7c673cae 1527 mds_role_t role;
f67539c2
TL
1528 const auto fs_names = op->get_session()->get_allowed_fs_names();
1529 int r = fsmap.parse_role(role_str, &role, ss, fs_names);
7c673cae
FG
1530 if (r < 0) {
1531 ss << "invalid role '" << role_str << "'";
1532 return -EINVAL;
1533 }
f67539c2
TL
1534 string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
1535 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1536 ss << "Permission denied.";
1537 return -EPERM;
1538 }
7c673cae 1539
1adf2230 1540 fsmap.modify_filesystem(
7c673cae
FG
1541 role.fscid,
1542 [role](std::shared_ptr<Filesystem> fs)
1543 {
1544 fs->mds_map.failed.erase(role.rank);
1545 });
1546
1547 ss << "removed failed mds." << role;
1548 return 0;
1549 } else if (prefix == "mds compat rm_compat") {
1550 int64_t f;
9f95a23c 1551 if (!cmd_getval(cmdmap, "feature", f)) {
7c673cae 1552 ss << "error parsing feature value '"
11fdf7f2 1553 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
7c673cae
FG
1554 return -EINVAL;
1555 }
1adf2230 1556 if (fsmap.compat.compat.contains(f)) {
7c673cae 1557 ss << "removing compat feature " << f;
1adf2230 1558 CompatSet modified = fsmap.compat;
7c673cae 1559 modified.compat.remove(f);
1adf2230 1560 fsmap.update_compat(modified);
7c673cae 1561 } else {
1adf2230 1562 ss << "compat feature " << f << " not present in " << fsmap.compat;
7c673cae
FG
1563 }
1564 r = 0;
1565 } else if (prefix == "mds compat rm_incompat") {
1566 int64_t f;
9f95a23c 1567 if (!cmd_getval(cmdmap, "feature", f)) {
7c673cae 1568 ss << "error parsing feature value '"
11fdf7f2 1569 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
7c673cae
FG
1570 return -EINVAL;
1571 }
1adf2230 1572 if (fsmap.compat.incompat.contains(f)) {
7c673cae 1573 ss << "removing incompat feature " << f;
1adf2230 1574 CompatSet modified = fsmap.compat;
7c673cae 1575 modified.incompat.remove(f);
1adf2230 1576 fsmap.update_compat(modified);
7c673cae 1577 } else {
1adf2230 1578 ss << "incompat feature " << f << " not present in " << fsmap.compat;
7c673cae
FG
1579 }
1580 r = 0;
1581 } else if (prefix == "mds repaired") {
1582 std::string role_str;
9f95a23c 1583 cmd_getval(cmdmap, "role", role_str);
7c673cae 1584 mds_role_t role;
f67539c2
TL
1585 const auto fs_names = op->get_session()->get_allowed_fs_names();
1586 r = fsmap.parse_role(role_str, &role, ss, fs_names);
7c673cae
FG
1587 if (r < 0) {
1588 return r;
1589 }
f67539c2
TL
1590 string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
1591 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1592 ss << "Permission denied.";
1593 return -EPERM;
1594 }
7c673cae 1595
1adf2230 1596 bool modified = fsmap.undamaged(role.fscid, role.rank);
7c673cae 1597 if (modified) {
494da23a 1598 ss << "repaired: restoring rank " << role;
7c673cae 1599 } else {
494da23a 1600 ss << "nothing to do: rank is not damaged";
7c673cae
FG
1601 }
1602
1603 r = 0;
11fdf7f2
TL
1604 } else if (prefix == "mds freeze") {
1605 std::string who;
9f95a23c 1606 cmd_getval(cmdmap, "role_or_gid", who);
11fdf7f2
TL
1607 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1608 if (gid == MDS_GID_NONE) {
7c673cae
FG
1609 return -EINVAL;
1610 }
1611
f67539c2
TL
1612 string_view fs_name = fsmap.fs_name_from_gid(gid);
1613 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1614 ss << "Permission denied.";
1615 return -EPERM;
1616 }
1617
11fdf7f2 1618 bool freeze = false;
7c673cae 1619 {
11fdf7f2 1620 std::string str;
9f95a23c 1621 cmd_getval(cmdmap, "val", str);
11fdf7f2
TL
1622 if ((r = parse_bool(str, &freeze, ss)) != 0) {
1623 return r;
1624 }
1625 }
7c673cae 1626
11fdf7f2
TL
1627 auto f = [freeze,gid,&ss](auto& info) {
1628 if (freeze) {
1629 ss << "freezing mds." << gid;
1630 info.freeze();
1631 } else {
1632 ss << "unfreezing mds." << gid;
1633 info.unfreeze();
1634 }
1635 };
1636 fsmap.modify_daemon(gid, f);
7c673cae
FG
1637 r = 0;
1638 } else {
1639 return -ENOSYS;
1640 }
1641
1642 return r;
1643}
1644
7c673cae
FG
1645void MDSMonitor::check_subs()
1646{
7c673cae
FG
1647 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1648 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1649 // filesystems. Build a list of all the types we service
1650 // subscriptions for.
9f95a23c
TL
1651
1652 std::vector<std::string> types = {
1653 "fsmap",
1654 "fsmap.user",
1655 "mdsmap",
1656 };
1657
28e407b8
AA
1658 for (const auto &p : get_fsmap().filesystems) {
1659 const auto &fscid = p.first;
9f95a23c
TL
1660 CachedStackStringStream cos;
1661 *cos << "mdsmap." << fscid;
1662 types.push_back(std::string(cos->strv()));
7c673cae
FG
1663 }
1664
1665 for (const auto &type : types) {
f67539c2 1666 auto& subs = mon.session_map.subs;
9f95a23c
TL
1667 auto subs_it = subs.find(type);
1668 if (subs_it == subs.end())
7c673cae 1669 continue;
9f95a23c
TL
1670 auto sub_it = subs_it->second->begin();
1671 while (!sub_it.end()) {
1672 auto sub = *sub_it;
1673 ++sub_it; // N.B. check_sub may remove sub!
7c673cae
FG
1674 check_sub(sub);
1675 }
1676 }
1677}
1678
1679
1680void MDSMonitor::check_sub(Subscription *sub)
1681{
1682 dout(20) << __func__ << ": " << sub->type << dendl;
1683
f67539c2
TL
1684 // to use const qualifier filter fsmap beforehand
1685 FSMap _fsmap_copy = get_fsmap();
1686 _fsmap_copy.filter(sub->session->get_allowed_fs_names());
1687 const auto& fsmap = _fsmap_copy;
1688 if (sub->next > fsmap.get_epoch()) {
1689 return;
1690 }
28e407b8 1691
7c673cae 1692 if (sub->type == "fsmap") {
f67539c2
TL
1693 sub->session->con->send_message(new MFSMap(mon.monmap->fsid, fsmap));
1694 if (sub->onetime) {
1695 mon.session_map.remove_sub(sub);
1696 } else {
1697 sub->next = fsmap.get_epoch() + 1;
7c673cae
FG
1698 }
1699 } else if (sub->type == "fsmap.user") {
f67539c2
TL
1700 FSMapUser fsmap_u;
1701 fsmap_u.epoch = fsmap.get_epoch();
1702 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1703 for (const auto &p : fsmap.filesystems) {
1704 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
1705 fs_info.cid = p.second->fscid;
1706 fs_info.name = p.second->mds_map.fs_name;
1707 }
1708 sub->session->con->send_message(new MFSMapUser(mon.monmap->fsid, fsmap_u));
1709 if (sub->onetime) {
1710 mon.session_map.remove_sub(sub);
1711 } else {
1712 sub->next = fsmap.get_epoch() + 1;
7c673cae
FG
1713 }
1714 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
11fdf7f2 1715 const bool is_mds = sub->session->name.is_mds();
7c673cae
FG
1716 mds_gid_t mds_gid = MDS_GID_NONE;
1717 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1718 if (is_mds) {
1719 // What (if any) namespace are you assigned to?
1720 auto mds_info = fsmap.get_mds_info();
1adf2230 1721 for (const auto &p : mds_info) {
11fdf7f2 1722 if (p.second.addrs == sub->session->addrs) {
1adf2230 1723 mds_gid = p.first;
7c673cae
FG
1724 fscid = fsmap.mds_roles.at(mds_gid);
1725 }
1726 }
1727 } else {
1728 // You're a client. Did you request a particular
1729 // namespace?
11fdf7f2 1730 if (sub->type.compare(0, 7, "mdsmap.") == 0) {
7c673cae
FG
1731 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1732 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1733 std::string err;
1734 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1735 if (!err.empty()) {
1736 // Client asked for a non-existent namespace, send them nothing
1737 dout(1) << "Invalid client subscription '" << sub->type
1738 << "'" << dendl;
1739 return;
1740 }
7c673cae
FG
1741 } else {
1742 // Unqualified request for "mdsmap": give it the one marked
1743 // for use by legacy clients.
1744 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1745 fscid = fsmap.legacy_client_fscid;
1746 } else {
1747 dout(1) << "Client subscribed for legacy filesystem but "
1748 "none is configured" << dendl;
1749 return;
1750 }
1751 }
b3b6e05e
TL
1752 if (!fsmap.filesystem_exists(fscid)) {
1753 // Client asked for a non-existent namespace, send them nothing
1754 // TODO: something more graceful for when a client has a filesystem
1755 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1756 // flag to MMDSMap?
1757 dout(1) << "Client subscribed to non-existent namespace '" <<
1758 fscid << "'" << dendl;
1759 return;
1760 }
7c673cae
FG
1761 }
1762 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
1763
1764 // Work out the effective latest epoch
28e407b8 1765 const MDSMap *mds_map = nullptr;
7c673cae
FG
1766 MDSMap null_map;
1767 null_map.compat = fsmap.compat;
1768 if (fscid == FS_CLUSTER_ID_NONE) {
1769 // For a client, we should have already dropped out
11fdf7f2 1770 ceph_assert(is_mds);
7c673cae 1771
28e407b8
AA
1772 auto it = fsmap.standby_daemons.find(mds_gid);
1773 if (it != fsmap.standby_daemons.end()) {
7c673cae 1774 // For an MDS, we need to feed it an MDSMap with its own state in
28e407b8
AA
1775 null_map.mds_info[mds_gid] = it->second;
1776 null_map.epoch = fsmap.standby_epochs.at(mds_gid);
7c673cae
FG
1777 } else {
1778 null_map.epoch = fsmap.epoch;
1779 }
1780 mds_map = &null_map;
1781 } else {
1782 // Check the effective epoch
28e407b8 1783 mds_map = &fsmap.get_filesystem(fscid)->mds_map;
7c673cae
FG
1784 }
1785
11fdf7f2 1786 ceph_assert(mds_map != nullptr);
7c673cae
FG
1787 dout(10) << __func__ << " selected MDS map epoch " <<
1788 mds_map->epoch << " for namespace " << fscid << " for subscriber "
11fdf7f2 1789 << sub->session->name << " who wants epoch " << sub->next << dendl;
7c673cae
FG
1790
1791 if (sub->next > mds_map->epoch) {
1792 return;
1793 }
f67539c2
TL
1794 auto msg = make_message<MMDSMap>(mon.monmap->fsid, *mds_map,
1795 mds_map->fs_name);
7c673cae 1796
11fdf7f2 1797 sub->session->con->send_message(msg.detach());
7c673cae 1798 if (sub->onetime) {
f67539c2 1799 mon.session_map.remove_sub(sub);
7c673cae
FG
1800 } else {
1801 sub->next = mds_map->get_epoch() + 1;
1802 }
1803 }
1804}
1805
1806
1807void MDSMonitor::update_metadata(mds_gid_t gid,
1808 const map<string, string>& metadata)
1809{
1810 if (metadata.empty()) {
1811 return;
1812 }
1813 pending_metadata[gid] = metadata;
1814
f67539c2 1815 MonitorDBStore::TransactionRef t = paxos.get_pending_transaction();
7c673cae 1816 bufferlist bl;
11fdf7f2 1817 encode(pending_metadata, bl);
7c673cae 1818 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
f67539c2 1819 paxos.trigger_propose();
7c673cae
FG
1820}
1821
1adf2230 1822void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
7c673cae
FG
1823{
1824 bool update = false;
1adf2230
AA
1825 for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) {
1826 if (!fsmap.gid_exists(it->first)) {
1827 it = pending_metadata.erase(it);
7c673cae
FG
1828 update = true;
1829 } else {
1adf2230 1830 ++it;
7c673cae
FG
1831 }
1832 }
1833 if (!update)
1834 return;
1835 bufferlist bl;
11fdf7f2 1836 encode(pending_metadata, bl);
7c673cae
FG
1837 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1838}
1839
1840int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1841{
1842 bufferlist bl;
f67539c2 1843 int r = mon.store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
7c673cae 1844 if (r) {
11fdf7f2 1845 dout(5) << "Unable to load 'last_metadata'" << dendl;
7c673cae
FG
1846 return r;
1847 }
1848
11fdf7f2
TL
1849 auto it = bl.cbegin();
1850 ceph::decode(m, it);
7c673cae
FG
1851 return 0;
1852}
1853
1adf2230 1854void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out)
31f18b77 1855{
31f18b77
FG
1856 map<mds_gid_t,Metadata> meta;
1857 load_metadata(meta);
1858 for (auto& p : meta) {
1859 auto q = p.second.find(field);
1860 if (q == p.second.end()) {
c07f9fc5 1861 (*out)["unknown"]++;
31f18b77 1862 } else {
c07f9fc5 1863 (*out)[q->second]++;
31f18b77
FG
1864 }
1865 }
c07f9fc5
FG
1866}
1867
1adf2230 1868void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
c07f9fc5
FG
1869{
1870 map<string,int> by_val;
1871 count_metadata(field, &by_val);
31f18b77
FG
1872 f->open_object_section(field.c_str());
1873 for (auto& p : by_val) {
1874 f->dump_int(p.first.c_str(), p.second);
1875 }
1876 f->close_section();
1877}
1878
f67539c2
TL
1879void MDSMonitor::get_versions(std::map<string, list<string> > &versions)
1880{
1881 map<mds_gid_t,Metadata> meta;
1882 load_metadata(meta);
1883 const auto &fsmap = get_fsmap();
1884 std::map<mds_gid_t, mds_info_t> map = fsmap.get_mds_info();
1885 dout(10) << __func__ << " mds meta=" << meta << dendl;
1886 for (auto& p : meta) {
1887 auto q = p.second.find("ceph_version_short");
1888 if (q == p.second.end()) continue;
1889 versions[q->second].push_back(string("mds.") + map[p.first].name);
1890 }
1891}
1892
1adf2230
AA
1893int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
1894 Formatter *f, ostream& err)
7c673cae 1895{
11fdf7f2 1896 ceph_assert(f);
7c673cae 1897
1adf2230 1898 mds_gid_t gid = gid_from_arg(fsmap, who, err);
7c673cae
FG
1899 if (gid == MDS_GID_NONE) {
1900 return -EINVAL;
1901 }
1902
1903 map<mds_gid_t, Metadata> metadata;
1904 if (int r = load_metadata(metadata)) {
1905 err << "Unable to load 'last_metadata'";
1906 return r;
1907 }
1908
1909 if (!metadata.count(gid)) {
1910 return -ENOENT;
1911 }
1912 const Metadata& m = metadata[gid];
1913 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1914 f->dump_string(p->first.c_str(), p->second);
1915 }
1916 return 0;
1917}
1918
1919int MDSMonitor::print_nodes(Formatter *f)
1920{
11fdf7f2 1921 ceph_assert(f);
7c673cae 1922
1adf2230
AA
1923 const auto &fsmap = get_fsmap();
1924
7c673cae
FG
1925 map<mds_gid_t, Metadata> metadata;
1926 if (int r = load_metadata(metadata)) {
1927 return r;
1928 }
1929
11fdf7f2 1930 map<string, list<string> > mdses; // hostname => mds
1adf2230
AA
1931 for (const auto &p : metadata) {
1932 const mds_gid_t& gid = p.first;
1933 const Metadata& m = p.second;
7c673cae
FG
1934 Metadata::const_iterator hostname = m.find("hostname");
1935 if (hostname == m.end()) {
1936 // not likely though
1937 continue;
1938 }
1adf2230 1939 if (!fsmap.gid_exists(gid)) {
7c673cae
FG
1940 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1941 continue;
1942 }
1adf2230 1943 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
11fdf7f2 1944 mdses[hostname->second].push_back(mds_info.name);
7c673cae
FG
1945 }
1946
1947 dump_services(f, mdses, "mds");
1948 return 0;
1949}
1950
1951/**
1952 * If a cluster is undersized (with respect to max_mds), then
11fdf7f2
TL
1953 * attempt to find daemons to grow it. If the cluster is oversized
1954 * (with respect to max_mds) then shrink it by stopping its highest rank.
7c673cae 1955 */
11fdf7f2 1956bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
7c673cae 1957{
11fdf7f2
TL
1958 auto &current_mds_map = get_fsmap().get_filesystem(fscid)->mds_map;
1959 auto&& fs = fsmap.get_filesystem(fscid);
1adf2230 1960 auto &mds_map = fs->mds_map;
7c673cae 1961
1adf2230
AA
1962 int in = mds_map.get_num_in_mds();
1963 int max = mds_map.get_max_mds();
1964
1965 dout(20) << __func__ << " in " << in << " max " << max << dendl;
1966
11fdf7f2
TL
1967 /* Check that both the current epoch mds_map is resizeable as well as the
1968 * current batch of changes in pending. This is important if an MDS is
1969 * becoming active in the next epoch.
1970 */
1971 if (!current_mds_map.is_resizeable() ||
1972 !mds_map.is_resizeable()) {
1973 dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
1974 return false;
1975 }
1976
1977 if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
7c673cae 1978 mds_rank_t mds = mds_rank_t(0);
1adf2230 1979 while (mds_map.is_in(mds)) {
7c673cae
FG
1980 mds++;
1981 }
9f95a23c
TL
1982 auto info = fsmap.find_replacement_for({fscid, mds});
1983 if (!info) {
1adf2230 1984 return false;
7c673cae
FG
1985 }
1986
9f95a23c 1987 dout(1) << "assigned standby " << info->addrs
7c673cae 1988 << " as mds." << mds << dendl;
f67539c2 1989 mon.clog->info() << info->human_name() << " assigned to "
1adf2230
AA
1990 "filesystem " << mds_map.fs_name << " as rank "
1991 << mds << " (now has " << mds_map.get_num_in_mds() + 1
d2e6a577 1992 << " ranks)";
9f95a23c 1993 fsmap.promote(info->global_id, *fs, mds);
1adf2230 1994 return true;
11fdf7f2
TL
1995 } else if (in > max) {
1996 mds_rank_t target = in - 1;
1997 const auto &info = mds_map.get_info(target);
1998 if (mds_map.is_active(target)) {
1999 dout(1) << "stopping " << target << dendl;
f67539c2 2000 mon.clog->info() << "stopping " << info.human_name();
11fdf7f2
TL
2001 auto f = [](auto& info) {
2002 info.state = MDSMap::STATE_STOPPING;
2003 };
2004 fsmap.modify_daemon(info.global_id, f);
2005 return true;
2006 } else {
2007 dout(20) << "skipping stop of " << target << dendl;
2008 return false;
2009 }
7c673cae
FG
2010 }
2011
1adf2230 2012 return false;
7c673cae
FG
2013}
2014
2015
2016/**
9f95a23c 2017 * Fail a daemon and replace it with a suitable standby.
7c673cae 2018 */
9f95a23c 2019bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose)
7c673cae 2020{
11fdf7f2 2021 ceph_assert(osd_propose != nullptr);
7c673cae 2022
1adf2230 2023 const auto fscid = fsmap.mds_roles.at(gid);
9f95a23c
TL
2024 const auto& info = fsmap.get_info_gid(gid);
2025 const auto rank = info.rank;
2026 const auto state = info.state;
2027
2028 if (info.is_frozen()) {
2029 return false;
2030 } else if (state == MDSMap::STATE_STANDBY_REPLAY ||
2031 state == MDSMap::STATE_STANDBY) {
2032 dout(1) << " failing and removing standby " << gid << " " << info.addrs
2033 << " mds." << rank
2034 << "." << info.inc << " " << ceph_mds_state_name(state)
2035 << dendl;
2036 *osd_propose |= fail_mds_gid(fsmap, gid);
2037 return true;
2038 } else if (rank >= 0 && rep_info) {
2039 auto fs = fsmap.filesystems.at(fscid);
2040 if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2041 return false;
2042 }
2043 // are we in?
2044 // and is there a non-laggy standby that can take over for us?
2045 dout(1) << " replacing " << gid << " " << info.addrs
2046 << " mds." << rank << "." << info.inc
2047 << " " << ceph_mds_state_name(state)
2048 << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs
2049 << dendl;
2050
f67539c2 2051 mon.clog->warn() << "Replacing " << info.human_name()
9f95a23c
TL
2052 << " as rank " << rank
2053 << " with standby " << rep_info->human_name();
2054
2055 // Remove the old one
2056 *osd_propose |= fail_mds_gid(fsmap, gid);
2057
2058 // Promote the replacement
2059 fsmap.promote(rep_info->global_id, *fs, rank);
2060
2061 return true;
2062 }
2063 return false;
2064}
2065
2066bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap)
2067{
2068 bool do_propose = false;
2069 const auto now = mono_clock::now();
f67539c2 2070 const bool osdmap_writeable = mon.osdmon()->is_writeable();
9f95a23c
TL
2071 const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace");
2072 const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
2073
2074 if (mono_clock::is_zero(last_tick)) {
2075 last_tick = now;
2076 }
2077
2078 {
2079 auto since_last = std::chrono::duration<double>(now-last_tick);
2080
2081 if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) {
2082 // This case handles either local slowness (calls being delayed
2083 // for whatever reason) or cluster election slowness (a long gap
2084 // between calls while an election happened)
2085 dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
2086 "(slow election?) of " << since_last.count() << " seconds" << dendl;
2087 for (auto& p : last_beacon) {
2088 p.second.stamp = now;
2089 }
2090 }
2091 }
2092
2093 // make sure last_beacon is fully populated
2094 for (auto& p : fsmap.mds_roles) {
2095 auto& gid = p.first;
2096 last_beacon.emplace(std::piecewise_construct,
2097 std::forward_as_tuple(gid),
2098 std::forward_as_tuple(now, 0));
2099 }
7c673cae 2100
31f18b77 2101 // We will only take decisive action (replacing/removing a daemon)
9f95a23c 2102 // if we have some indication that some other daemon(s) are successfully
31f18b77 2103 // getting beacons through recently.
1adf2230 2104 mono_time latest_beacon = mono_clock::zero();
9f95a23c 2105 for (const auto& p : last_beacon) {
1adf2230 2106 latest_beacon = std::max(p.second.stamp, latest_beacon);
31f18b77 2107 }
f67539c2 2108 auto since = std::chrono::duration<double>(now-latest_beacon);
1adf2230 2109 const bool may_replace = since.count() <
11fdf7f2 2110 std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5);
31f18b77 2111
9f95a23c
TL
2112 // check beacon timestamps
2113 std::vector<mds_gid_t> to_remove;
2114 for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
2115 auto& [gid, beacon_info] = *it;
f67539c2 2116 auto since_last = std::chrono::duration<double>(now-beacon_info.stamp);
9f95a23c
TL
2117
2118 if (!fsmap.gid_exists(gid)) {
2119 // gid no longer exists, remove from tracked beacons
2120 it = last_beacon.erase(it);
2121 continue;
2122 }
7c673cae 2123
9f95a23c
TL
2124 if (since_last.count() >= g_conf()->mds_beacon_grace) {
2125 auto& info = fsmap.get_info_gid(gid);
2126 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2127 << " (gid: " << gid << " addr: " << info.addrs
2128 << " state: " << ceph_mds_state_name(info.state) << ")"
2129 << " since " << since_last.count() << dendl;
f67539c2 2130 // If the OSDMap is writeable, we can blocklist things, so we can
9f95a23c
TL
2131 // try failing any laggy MDS daemons. Consider each one for failure.
2132 if (!info.laggy()) {
2133 dout(1) << " marking " << gid << " " << info.addrs
2134 << " mds." << info.rank << "." << info.inc
2135 << " " << ceph_mds_state_name(info.state)
2136 << " laggy" << dendl;
2137 fsmap.modify_daemon(info.global_id, [](auto& info) {
2138 info.laggy_since = ceph_clock_now();
2139 });
2140 do_propose = true;
2141 }
2142 if (osdmap_writeable && may_replace) {
2143 to_remove.push_back(gid); // drop_mds may invalidate iterator
2144 }
2145 }
31f18b77 2146
9f95a23c
TL
2147 ++it;
2148 }
7c673cae 2149
9f95a23c 2150 for (const auto& gid : to_remove) {
f6b5b4d7 2151 auto info = fsmap.get_info_gid(gid);
9f95a23c
TL
2152 const mds_info_t* rep_info = nullptr;
2153 if (info.rank >= 0) {
f67539c2 2154 auto fscid = fsmap.fscid_from_gid(gid);
9f95a23c
TL
2155 rep_info = fsmap.find_replacement_for({fscid, info.rank});
2156 }
2157 bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap);
2158 if (dropped) {
f67539c2 2159 mon.clog->info() << "MDS " << info.human_name()
9f95a23c
TL
2160 << " is removed because it is dead or otherwise unavailable.";
2161 do_propose = true;
2162 }
2163 }
7c673cae 2164
9f95a23c
TL
2165 if (osdmap_writeable) {
2166 for (auto& [fscid, fs] : fsmap.filesystems) {
2167 if (!fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
2168 fs->mds_map.is_resizeable()) {
2169 // Check if a rank or standby-replay should be replaced with a stronger
2170 // affinity standby. This looks at ranks and standby-replay:
2171 for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
2172 const auto join_fscid = info.join_fscid;
2173 if (join_fscid == fscid)
2174 continue;
2175 const auto rank = info.rank;
2176 const auto state = info.state;
2177 const mds_info_t* rep_info = nullptr;
2178 if (state == MDSMap::STATE_STANDBY_REPLAY) {
2179 rep_info = fsmap.get_available_standby(fscid);
2180 } else if (state == MDSMap::STATE_ACTIVE) {
2181 rep_info = fsmap.find_replacement_for({fscid, rank});
2182 } else {
2183 /* N.B. !is_degraded() */
2184 ceph_abort_msg("invalid state in MDSMap");
2185 }
2186 if (!rep_info) {
2187 break;
2188 }
2189 bool better_affinity = false;
2190 if (join_fscid == FS_CLUSTER_ID_NONE) {
2191 better_affinity = (rep_info->join_fscid == fscid);
2192 } else {
2193 better_affinity = (rep_info->join_fscid == fscid) ||
2194 (rep_info->join_fscid == FS_CLUSTER_ID_NONE);
2195 }
2196 if (better_affinity) {
2197 if (state == MDSMap::STATE_STANDBY_REPLAY) {
f67539c2 2198 mon.clog->info() << "Dropping low affinity standby-replay "
9f95a23c
TL
2199 << info.human_name()
2200 << " in favor of higher affinity standby.";
2201 *propose_osdmap |= fail_mds_gid(fsmap, gid);
2202 /* Now let maybe_promote_standby do the promotion. */
2203 } else {
f67539c2 2204 mon.clog->info() << "Dropping low affinity active "
9f95a23c
TL
2205 << info.human_name()
2206 << " in favor of higher affinity standby.";
2207 do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap);
2208 }
2209 break; /* don't replace more than one per tick per fs */
2210 }
2211 }
2212 }
2213 }
7c673cae 2214 }
9f95a23c 2215 return do_propose;
7c673cae
FG
2216}
2217
11fdf7f2 2218bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
7c673cae 2219{
11fdf7f2
TL
2220 if (fs.mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2221 return false;
2222 }
7c673cae
FG
2223
2224 bool do_propose = false;
2225
2226 // have a standby take over?
2227 set<mds_rank_t> failed;
11fdf7f2
TL
2228 fs.mds_map.get_failed_mds_set(failed);
2229 for (const auto& rank : failed) {
9f95a23c
TL
2230 auto info = fsmap.find_replacement_for({fs.fscid, rank});
2231 if (info) {
2232 dout(1) << " taking over failed mds." << rank << " with " << info->global_id
2233 << "/" << info->name << " " << info->addrs << dendl;
f67539c2 2234 mon.clog->info() << "Standby " << info->human_name()
11fdf7f2
TL
2235 << " assigned to filesystem " << fs.mds_map.fs_name
2236 << " as rank " << rank;
2237
9f95a23c 2238 fsmap.promote(info->global_id, fs, rank);
11fdf7f2 2239 do_propose = true;
7c673cae 2240 }
11fdf7f2
TL
2241 }
2242
f67539c2 2243 if (fs.mds_map.is_resizeable() && fs.mds_map.allows_standby_replay()) {
7c673cae 2244 // There were no failures to replace, so try using any available standbys
a8e16298
TL
2245 // as standby-replay daemons. Don't do this when the cluster is degraded
2246 // as a standby-replay daemon may try to read a journal being migrated.
11fdf7f2 2247 for (;;) {
9f95a23c
TL
2248 auto info = fsmap.get_available_standby(fs.fscid);
2249 if (!info) break;
2250 dout(20) << "standby available mds." << info->global_id << dendl;
11fdf7f2
TL
2251 bool changed = false;
2252 for (const auto& rank : fs.mds_map.in) {
9f95a23c 2253 dout(20) << "examining " << rank << dendl;
11fdf7f2 2254 if (fs.mds_map.is_followable(rank)) {
9f95a23c 2255 dout(1) << " setting mds." << info->global_id
11fdf7f2 2256 << " to follow mds rank " << rank << dendl;
9f95a23c 2257 fsmap.assign_standby_replay(info->global_id, fs.fscid, rank);
11fdf7f2
TL
2258 do_propose = true;
2259 changed = true;
2260 break;
7c673cae 2261 }
7c673cae 2262 }
11fdf7f2 2263 if (!changed) break;
7c673cae
FG
2264 }
2265 }
2266
2267 return do_propose;
2268}
2269
2270void MDSMonitor::tick()
2271{
1adf2230 2272 if (!is_active() || !is_leader()) return;
28e407b8
AA
2273
2274 auto &pending = get_pending_fsmap_writeable();
7c673cae 2275
28e407b8 2276 bool do_propose = false;
9f95a23c 2277 bool propose_osdmap = false;
7c673cae 2278
28e407b8 2279 do_propose |= pending.check_health();
7c673cae 2280
9f95a23c
TL
2281 /* Check health and affinity of ranks */
2282 do_propose |= check_health(pending, &propose_osdmap);
7c673cae 2283
9f95a23c
TL
2284 /* Resize the cluster according to max_mds. */
2285 for (auto& p : pending.filesystems) {
2286 do_propose |= maybe_resize_cluster(pending, p.second->fscid);
7c673cae
FG
2287 }
2288
9f95a23c
TL
2289 /* Replace any failed ranks. */
2290 for (auto& p : pending.filesystems) {
2291 do_propose |= maybe_promote_standby(pending, *p.second);
7c673cae
FG
2292 }
2293
c07f9fc5 2294 if (propose_osdmap) {
f67539c2 2295 request_proposal(mon.osdmon());
c07f9fc5 2296 }
7c673cae 2297
7c673cae
FG
2298 if (do_propose) {
2299 propose_pending();
2300 }
9f95a23c
TL
2301
2302 last_tick = mono_clock::now();
7c673cae
FG
2303}
2304
f67539c2 2305MDSMonitor::MDSMonitor(Monitor &mn, Paxos &p, string service_name)
7c673cae
FG
2306 : PaxosService(mn, p, service_name)
2307{
f67539c2 2308 handlers = FileSystemCommandHandler::load(&p);
7c673cae
FG
2309}
2310
2311void MDSMonitor::on_restart()
2312{
2313 // Clear out the leader-specific state.
1adf2230 2314 last_tick = mono_clock::now();
7c673cae
FG
2315 last_beacon.clear();
2316}
2317