]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/MDSMonitor.cc
update ceph source to reef 18.2.1
[ceph.git] / ceph / src / mon / MDSMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
11fdf7f2 15#include <regex>
7c673cae
FG
16#include <sstream>
17#include <boost/utility.hpp>
18
19#include "MDSMonitor.h"
20#include "FSCommands.h"
21#include "Monitor.h"
22#include "MonitorDBStore.h"
23#include "OSDMonitor.h"
7c673cae
FG
24
25#include "common/strtol.h"
26#include "common/perf_counters.h"
27#include "common/config.h"
28#include "common/cmdparse.h"
29#include "messages/MMDSMap.h"
30#include "messages/MFSMap.h"
31#include "messages/MFSMapUser.h"
32#include "messages/MMDSLoadTargets.h"
33#include "messages/MMonCommand.h"
34#include "messages/MGenericMessage.h"
35
11fdf7f2 36#include "include/ceph_assert.h"
7c673cae
FG
37#include "include/str_list.h"
38#include "include/stringify.h"
39#include "mds/mdstypes.h"
40#include "Session.h"
41
f67539c2
TL
42using namespace TOPNSPC::common;
43
44using std::dec;
45using std::hex;
46using std::list;
47using std::map;
48using std::make_pair;
49using std::ostream;
50using std::ostringstream;
51using std::pair;
52using std::set;
53using std::string;
54using std::string_view;
55using std::stringstream;
56using std::to_string;
57using std::vector;
58
59using ceph::bufferlist;
60using ceph::decode;
61using ceph::encode;
62using ceph::ErasureCodeInterfaceRef;
63using ceph::ErasureCodeProfile;
64using ceph::Formatter;
65using ceph::JSONFormatter;
66using ceph::make_message;
67using ceph::mono_clock;
68using ceph::mono_time;
69
7c673cae
FG
70#define dout_subsys ceph_subsys_mon
71#undef dout_prefix
28e407b8 72#define dout_prefix _prefix(_dout, mon, get_fsmap())
f67539c2
TL
73static ostream& _prefix(std::ostream *_dout, Monitor &mon, const FSMap& fsmap) {
74 return *_dout << "mon." << mon.name << "@" << mon.rank
75 << "(" << mon.get_state_name()
7c673cae
FG
76 << ").mds e" << fsmap.get_epoch() << " ";
77}
78
3efd9988
FG
79static const string MDS_METADATA_PREFIX("mds_metadata");
80static const string MDS_HEALTH_PREFIX("mds_health");
81
82
7c673cae
FG
83/*
84 * Specialized implementation of cmd_getval to allow us to parse
85 * out strongly-typedef'd types
86 */
9f95a23c
TL
87namespace TOPNSPC::common {
88template<> bool cmd_getval(const cmdmap_t& cmdmap,
20effc67 89 std::string_view k, mds_gid_t &val)
7c673cae 90{
9f95a23c 91 return cmd_getval(cmdmap, k, (int64_t&)val);
7c673cae
FG
92}
93
9f95a23c 94template<> bool cmd_getval(const cmdmap_t& cmdmap,
20effc67 95 std::string_view k, mds_rank_t &val)
7c673cae 96{
9f95a23c 97 return cmd_getval(cmdmap, k, (int64_t&)val);
7c673cae
FG
98}
99
9f95a23c 100template<> bool cmd_getval(const cmdmap_t& cmdmap,
20effc67 101 std::string_view k, MDSMap::DaemonState &val)
7c673cae 102{
9f95a23c
TL
103 return cmd_getval(cmdmap, k, (int64_t&)val);
104}
7c673cae 105}
7c673cae
FG
106// my methods
107
11fdf7f2
TL
// Logs the given FSMap at debug level dblV (a compile-time template
// argument): writes a "print_map" header line, then lets the map print
// itself onto the same log entry.
// NOTE(review): the three statements rely on the dout/dendl macro pair
// exposing `_dout` between them -- keep them together and in this order.
108template <int dblV>
109void MDSMonitor::print_map(const FSMap& m)
7c673cae 110{
11fdf7f2 111 dout(dblV) << "print_map\n";
7c673cae
FG
112 m.print(*_dout);
113 *_dout << dendl;
114}
115
116// service methods
117void MDSMonitor::create_initial()
118{
119 dout(10) << "create_initial" << dendl;
120}
121
11fdf7f2 122void MDSMonitor::get_store_prefixes(std::set<string>& s) const
3efd9988
FG
123{
124 s.insert(service_name);
125 s.insert(MDS_METADATA_PREFIX);
126 s.insert(MDS_HEALTH_PREFIX);
127}
7c673cae
FG
128
129void MDSMonitor::update_from_paxos(bool *need_bootstrap)
130{
131 version_t version = get_last_committed();
28e407b8 132 if (version == get_fsmap().epoch)
7c673cae
FG
133 return;
134
135 dout(10) << __func__ << " version " << version
28e407b8 136 << ", my e " << get_fsmap().epoch << dendl;
11fdf7f2 137 ceph_assert(version > get_fsmap().epoch);
7c673cae 138
224ce89b
WB
139 load_health();
140
7c673cae
FG
141 // read and decode
142 bufferlist fsmap_bl;
143 fsmap_bl.clear();
144 int err = get_version(version, fsmap_bl);
11fdf7f2 145 ceph_assert(err == 0);
7c673cae 146
11fdf7f2 147 ceph_assert(fsmap_bl.length() > 0);
7c673cae 148 dout(10) << __func__ << " got " << version << dendl;
522d829b
TL
149 try {
150 PaxosFSMap::decode(fsmap_bl);
151 } catch (const ceph::buffer::malformed_input& e) {
152 derr << "unable to decode FSMap: " << e.what() << dendl;
153 throw;
154 }
7c673cae
FG
155
156 // new map
91327a77 157 dout(0) << "new map" << dendl;
11fdf7f2
TL
158 print_map<0>(get_fsmap());
159 if (!g_conf()->mon_mds_skip_sanity) {
28e407b8 160 get_fsmap().sanity();
7c673cae
FG
161 }
162
163 check_subs();
7c673cae
FG
164}
165
166void MDSMonitor::init()
167{
168 (void)load_metadata(pending_metadata);
169}
170
171void MDSMonitor::create_pending()
172{
28e407b8 173 auto &fsmap = PaxosFSMap::create_pending();
7c673cae 174
f67539c2
TL
175 if (mon.osdmon()->is_readable()) {
176 const auto &osdmap = mon.osdmon()->osdmap;
28e407b8 177 fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
3efd9988
FG
178 }
179
28e407b8 180 dout(10) << "create_pending e" << fsmap.epoch << dendl;
7c673cae
FG
181}
182
183void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
184{
28e407b8
AA
185 auto &pending = get_pending_fsmap_writeable();
186 auto &epoch = pending.epoch;
7c673cae 187
28e407b8 188 dout(10) << "encode_pending e" << epoch << dendl;
7c673cae
FG
189
190 // print map iff 'debug mon = 30' or higher
11fdf7f2
TL
191 print_map<30>(pending);
192 if (!g_conf()->mon_mds_skip_sanity) {
a4b75251 193 pending.sanity(true);
7c673cae
FG
194 }
195
196 // Set 'modified' on maps modified this epoch
28e407b8
AA
197 for (auto &p : pending.filesystems) {
198 if (p.second->mds_map.epoch == epoch) {
199 p.second->mds_map.modified = ceph_clock_now();
7c673cae
FG
200 }
201 }
202
203 // apply to paxos
11fdf7f2 204 ceph_assert(get_last_committed() + 1 == pending.epoch);
28e407b8 205 bufferlist pending_bl;
f67539c2 206 pending.encode(pending_bl, mon.get_quorum_con_features());
7c673cae
FG
207
208 /* put everything in the transaction */
28e407b8
AA
209 put_version(t, pending.epoch, pending_bl);
210 put_last_committed(t, pending.epoch);
7c673cae
FG
211
212 // Encode MDSHealth data
213 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
214 i != pending_daemon_health.end(); ++i) {
215 bufferlist bl;
216 i->second.encode(bl);
217 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
218 }
219
220 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
221 i != pending_daemon_health_rm.end(); ++i) {
222 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
223 }
224 pending_daemon_health_rm.clear();
1adf2230 225 remove_from_metadata(pending, t);
224ce89b
WB
226
227 // health
228 health_check_map_t new_checks;
28e407b8 229 const auto &info_map = pending.get_mds_info();
224ce89b
WB
230 for (const auto &i : info_map) {
231 const auto &gid = i.first;
232 const auto &info = i.second;
233 if (pending_daemon_health_rm.count(gid)) {
234 continue;
235 }
236 MDSHealth health;
237 auto p = pending_daemon_health.find(gid);
238 if (p != pending_daemon_health.end()) {
239 health = p->second;
240 } else {
241 bufferlist bl;
f67539c2 242 mon.store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
224ce89b
WB
243 if (!bl.length()) {
244 derr << "Missing health data for MDS " << gid << dendl;
245 continue;
246 }
11fdf7f2 247 auto bl_i = bl.cbegin();
224ce89b
WB
248 health.decode(bl_i);
249 }
250 for (const auto &metric : health.metrics) {
2a845540
TL
251 if (metric.type == MDS_HEALTH_DUMMY) {
252 continue;
253 }
9f95a23c 254 const auto rank = info.rank;
224ce89b
WB
255 health_check_t *check = &new_checks.get_or_add(
256 mds_metric_name(metric.type),
257 metric.sev,
9f95a23c
TL
258 mds_metric_summary(metric.type),
259 1);
224ce89b 260 ostringstream ss;
f91f0fd5 261 ss << "mds." << info.name << "(mds." << rank << "): " << metric.message;
28e407b8
AA
262 bool first = true;
263 for (auto &p : metric.metadata) {
264 if (first) {
265 ss << " ";
266 } else {
224ce89b 267 ss << ", ";
28e407b8
AA
268 }
269 ss << p.first << ": " << p.second;
270 first = false;
224ce89b
WB
271 }
272 check->detail.push_back(ss.str());
273 }
274 }
28e407b8 275 pending.get_health_checks(&new_checks);
224ce89b 276 for (auto& p : new_checks.checks) {
11fdf7f2 277 p.second.summary = std::regex_replace(
224ce89b 278 p.second.summary,
11fdf7f2 279 std::regex("%num%"),
224ce89b 280 stringify(p.second.detail.size()));
11fdf7f2 281 p.second.summary = std::regex_replace(
224ce89b 282 p.second.summary,
11fdf7f2 283 std::regex("%plurals%"),
224ce89b 284 p.second.detail.size() > 1 ? "s" : "");
11fdf7f2 285 p.second.summary = std::regex_replace(
224ce89b 286 p.second.summary,
11fdf7f2 287 std::regex("%isorare%"),
224ce89b 288 p.second.detail.size() > 1 ? "are" : "is");
11fdf7f2 289 p.second.summary = std::regex_replace(
181888fb 290 p.second.summary,
11fdf7f2 291 std::regex("%hasorhave%"),
181888fb 292 p.second.detail.size() > 1 ? "have" : "has");
224ce89b
WB
293 }
294 encode_health(new_checks, t);
7c673cae
FG
295}
296
11fdf7f2 297version_t MDSMonitor::get_trim_to() const
7c673cae
FG
298{
299 version_t floor = 0;
11fdf7f2 300 if (g_conf()->mon_mds_force_trim_to > 0 &&
522d829b 301 g_conf()->mon_mds_force_trim_to <= (int)get_last_committed()) {
11fdf7f2 302 floor = g_conf()->mon_mds_force_trim_to;
7c673cae
FG
303 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
304 << floor << dendl;
305 }
306
11fdf7f2 307 unsigned max = g_conf()->mon_max_mdsmap_epochs;
7c673cae
FG
308 version_t last = get_last_committed();
309
522d829b
TL
310 if (last - get_first_committed() > max && floor < last - max) {
311 floor = last-max;
312 }
313
314 dout(20) << __func__ << " = " << floor << dendl;
7c673cae
FG
315 return floor;
316}
317
7c673cae
FG
318bool MDSMonitor::preprocess_query(MonOpRequestRef op)
319{
320 op->mark_mdsmon_event(__func__);
9f95a23c 321 auto m = op->get_req<PaxosServiceMessage>();
11fdf7f2
TL
322 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source()
323 << " " << m->get_orig_source_addrs() << dendl;
7c673cae
FG
324
325 switch (m->get_type()) {
326
327 case MSG_MDS_BEACON:
328 return preprocess_beacon(op);
329
330 case MSG_MON_COMMAND:
f64942e4
AA
331 try {
332 return preprocess_command(op);
11fdf7f2 333 } catch (const bad_cmd_get& e) {
f64942e4 334 bufferlist bl;
f67539c2 335 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
f64942e4
AA
336 return true;
337 }
7c673cae
FG
338
339 case MSG_MDS_OFFLOAD_TARGETS:
340 return preprocess_offload_targets(op);
341
342 default:
343 ceph_abort();
344 return true;
345 }
346}
347
348void MDSMonitor::_note_beacon(MMDSBeacon *m)
349{
350 mds_gid_t gid = mds_gid_t(m->get_global_id());
351 version_t seq = m->get_seq();
352
91327a77 353 dout(5) << "_note_beacon " << *m << " noting time" << dendl;
1adf2230
AA
354 auto &beacon = last_beacon[gid];
355 beacon.stamp = mono_clock::now();
356 beacon.seq = seq;
7c673cae
FG
357}
358
// First-pass handling of an MDS beacon (MMDSBeacon) against the committed
// FSMap.
//
// Returns true when the beacon is fully dealt with here: a reply was sent
// (label `reply`, or the null MDSMap sent to an unknown non-booting gid) or
// the beacon was dropped with mon.no_reply() (label `ignore`).
// Returns false on every path that needs more than a read-only answer: this
// monitor is not the leader, the gid has not booted yet, the reported state
// is STATE_DNE, the daemon is marked laggy, or its join filesystem, state or
// health metrics differ from what is recorded.  Presumably the caller then
// forwards the op or runs the update path -- confirm against PaxosService.
359bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
360{
361 op->mark_mdsmon_event(__func__);
9f95a23c 362 auto m = op->get_req<MMDSBeacon>();
7c673cae
FG
363 MDSMap::DaemonState state = m->get_state();
364 mds_gid_t gid = m->get_global_id();
365 version_t seq = m->get_seq();
366 MDSMap::mds_info_t info;
// Stays 0 until computed further down; the `reply` label asserts it is > 0.
367 epoch_t effective_epoch = 0;
368
1adf2230 369 const auto &fsmap = get_fsmap();
28e407b8 370
7c673cae 371 // check privileges, ignore if fails
11fdf7f2
TL
372 MonSession *session = op->get_session();
373 if (!session)
374 goto ignore;
7c673cae
FG
375 if (!session->is_capable("mds", MON_CAP_X)) {
376 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
377 << session->caps << dendl;
378 goto ignore;
379 }
380
f67539c2
TL
// Beacons carrying a different fsid than this monitor's monmap are dropped.
381 if (m->get_fsid() != mon.monmap->fsid) {
382 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon.monmap->fsid << dendl;
7c673cae
FG
383 goto ignore;
384 }
385
91327a77 386 dout(5) << "preprocess_beacon " << *m
11fdf7f2
TL
387 << " from " << m->get_orig_source()
388 << " " << m->get_orig_source_addrs()
7c673cae
FG
389 << " " << m->get_compat()
390 << dendl;
391
392 // make sure the address has a port
393 if (m->get_orig_source_addr().get_port() == 0) {
394 dout(1) << " ignoring boot message without a port" << dendl;
395 goto ignore;
396 }
397
7c673cae 398 // fw to leader?
28e407b8 399 if (!is_leader())
7c673cae
FG
400 return false;
401
402 // booted, but not in map?
28e407b8 403 if (!fsmap.gid_exists(gid)) {
7c673cae
FG
404 if (state != MDSMap::STATE_BOOT) {
405 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
406 << ceph_mds_state_name(state) << ")" << dendl;
407
1adf2230
AA
408 /* We can't send an MDSMap this MDS was a part of because we no longer
409 * know which FS it was part of. Nor does this matter. Sending an empty
410 * MDSMap is sufficient for getting the MDS to respawn.
411 */
// NOTE(review): this `m` shadows the beacon `m` declared at function scope.
522d829b 412 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
f67539c2 413 mon.send_reply(op, m.detach());
7c673cae
FG
414 return true;
415 } else {
05a536ef
TL
416 /* check if we've already recorded its entry in pending */
417 const auto& pending = get_pending_fsmap();
418 if (pending.gid_exists(gid)) {
419 /* MDS is already booted. */
420 goto ignore;
421 } else {
422 return false; // not booted yet.
423 }
7c673cae
FG
424 }
425 }
426 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
28e407b8 427 info = fsmap.get_info_gid(gid);
7c673cae 428
f91f0fd5
TL
429 if (state == MDSMap::STATE_DNE) {
430 return false;
431 }
432
7c673cae
FG
433 // old seq?
434 if (info.state_seq > seq) {
435 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
436 goto ignore;
437 }
438
439 // Work out the latest epoch that this daemon should have seen
440 {
// Daemons with no filesystem use their standby epoch; others use the epoch
// of their filesystem's MDSMap.
28e407b8 441 fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
7c673cae 442 if (fscid == FS_CLUSTER_ID_NONE) {
28e407b8 443 effective_epoch = fsmap.standby_epochs.at(gid);
7c673cae 444 } else {
28e407b8 445 effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
7c673cae
FG
446 }
447 if (effective_epoch != m->get_last_epoch_seen()) {
448 dout(10) << "mds_beacon " << *m
449 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
450 goto reply;
451 }
452 }
453
454 if (info.laggy()) {
455 _note_beacon(m);
456 return false; // no longer laggy, need to update map.
457 }
458 if (state == MDSMap::STATE_BOOT) {
459 // ignore, already booted.
460 goto ignore;
461 }
9f95a23c
TL
462
463 // did the join_fscid change
464 if (m->get_fs().size()) {
// A filesystem name that does not resolve leaves fscid at FS_CLUSTER_ID_NONE.
465 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
466 auto f = fsmap.get_filesystem(m->get_fs());
467 if (f) {
468 fscid = f->fscid;
469 }
470 if (info.join_fscid != fscid) {
471 dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
472 << " (" << m->get_fs() << ")" << dendl;
473 _note_beacon(m);
474 return false;
475 }
476 } else {
477 if (info.join_fscid != FS_CLUSTER_ID_NONE) {
478 dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
479 _note_beacon(m);
480 return false;
481 }
482 }
483
7c673cae
FG
484 // is there a state change here?
485 if (info.state != state) {
7c673cae
FG
486 _note_beacon(m);
487 return false;
488 }
489
490 // Comparing known daemon health with m->get_health()
491 // and return false (i.e. require proposal) if they
492 // do not match, to update our stored
// NOTE(review): operator[] default-inserts an entry for gid into
// pending_daemon_health when none exists yet.
493 if (!(pending_daemon_health[gid] == m->get_health())) {
91327a77 494 dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
7c673cae
FG
495 _note_beacon(m);
496 return false;
497 }
498
499 reply:
500 // note time and reply
11fdf7f2 501 ceph_assert(effective_epoch > 0);
7c673cae 502 _note_beacon(m);
11fdf7f2 503 {
// The reply echoes the beacon's gid, name, state and seq, with the epoch
// computed above.
f67539c2 504 auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
11fdf7f2
TL
505 m->get_global_id(), m->get_name(), effective_epoch,
506 state, seq, CEPH_FEATURES_SUPPORTED_DEFAULT);
f67539c2 507 mon.send_reply(op, beacon.detach());
11fdf7f2 508 }
7c673cae
FG
509 return true;
510
511 ignore:
512 // I won't reply this beacon, drop it.
f67539c2 513 mon.no_reply(op);
7c673cae
FG
514 return true;
515}
516
517bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
518{
519 op->mark_mdsmon_event(__func__);
9f95a23c 520 auto m = op->get_req<MMDSLoadTargets>();
7c673cae 521 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
28e407b8 522
1adf2230 523 const auto &fsmap = get_fsmap();
7c673cae
FG
524
525 // check privileges, ignore message if fails
11fdf7f2 526 MonSession *session = op->get_session();
7c673cae 527 if (!session)
1adf2230 528 goto ignore;
7c673cae
FG
529 if (!session->is_capable("mds", MON_CAP_X)) {
530 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
531 << session->caps << dendl;
1adf2230 532 goto ignore;
7c673cae
FG
533 }
534
535 if (fsmap.gid_exists(m->global_id) &&
536 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
1adf2230 537 goto ignore;
7c673cae
FG
538
539 return false;
540
1adf2230 541 ignore:
f67539c2 542 mon.no_reply(op);
7c673cae
FG
543 return true;
544}
545
546
547bool MDSMonitor::prepare_update(MonOpRequestRef op)
548{
549 op->mark_mdsmon_event(__func__);
9f95a23c 550 auto m = op->get_req<PaxosServiceMessage>();
7c673cae
FG
551 dout(7) << "prepare_update " << *m << dendl;
552
553 switch (m->get_type()) {
554
555 case MSG_MDS_BEACON:
556 return prepare_beacon(op);
557
558 case MSG_MON_COMMAND:
f64942e4
AA
559 try {
560 return prepare_command(op);
11fdf7f2 561 } catch (const bad_cmd_get& e) {
f64942e4 562 bufferlist bl;
f67539c2 563 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
05a536ef 564 return false; /* nothing to propose */
f64942e4 565 }
7c673cae
FG
566
567 case MSG_MDS_OFFLOAD_TARGETS:
568 return prepare_offload_targets(op);
569
570 default:
571 ceph_abort();
572 }
573
05a536ef 574 return false; /* nothing to propose! */
7c673cae
FG
575}
576
// Update-path handling of an MDS beacon against the writeable pending FSMap.
//
//  - Always: logs health metric types that appeared or were cleared since
//    the last stored health for this gid, then stores the new health in
//    pending_daemon_health.
//  - STATE_BOOT: when mds_enforce_unique_name is set, fails any daemon in
//    the pending map with the same name; inserts the gid as STATE_STANDBY if
//    it has no entry in pending.mds_roles; starts its beacon timer and
//    updates its metadata.
//  - Any other state: checks that the gid is still in the pending map, that
//    its compat set is unchanged and that the requested transition is
//    allowed, then applies the STOPPED / DAMAGED / ordinary state change.
//
// Returns false only on the paths that queue the op to wait for a writeable
// OSDMap.  Every other path returns true, normally after registering a
// completion with wait_for_finished_proposal(); the compat-mismatch path
// returns true without registering one.
//
// Labels: `evict` fails the gid and then jumps to `null`, which replies with
// a null MDSMap once the proposal finishes.
577bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
578{
579 op->mark_mdsmon_event(__func__);
9f95a23c 580 auto m = op->get_req<MMDSBeacon>();
7c673cae 581 // -- this is an update --
11fdf7f2
TL
582 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source()
583 << " " << m->get_orig_source_addrs() << dendl;
584 entity_addrvec_t addrs = m->get_orig_source_addrs();
7c673cae
FG
585 mds_gid_t gid = m->get_global_id();
586 MDSMap::DaemonState state = m->get_state();
587 version_t seq = m->get_seq();
588
28e407b8
AA
589 auto &pending = get_pending_fsmap_writeable();
590
91327a77 591 dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
7c673cae
FG
592
593 // Calculate deltas of health metrics created and removed
594 // Do this by type rather than MDSHealthMetric equality, because messages can
595 // change a lot when they include e.g. a number of items.
// NOTE(review): operator[] default-inserts an entry for gid if none exists;
// old_health refers to that entry until it is overwritten further down.
596 const auto &old_health = pending_daemon_health[gid].metrics;
597 const auto &new_health = m->get_health().metrics;
598
599 std::set<mds_metric_t> old_types;
600 for (const auto &i : old_health) {
601 old_types.insert(i.type);
602 }
603
604 std::set<mds_metric_t> new_types;
605 for (const auto &i : new_health) {
2a845540
TL
606 if (i.type == MDS_HEALTH_DUMMY) {
607 continue;
608 }
7c673cae
FG
609 new_types.insert(i.type);
610 }
611
// Metric types present now but absent before are logged at debug level 10.
612 for (const auto &new_metric: new_health) {
2a845540
TL
613 if (new_metric.type == MDS_HEALTH_DUMMY) {
614 continue;
615 }
7c673cae 616 if (old_types.count(new_metric.type) == 0) {
11fdf7f2 617 dout(10) << "MDS health message (" << m->get_orig_source()
28e407b8 618 << "): " << new_metric.sev << " " << new_metric.message << dendl;
7c673cae
FG
619 }
620 }
621
622 // Log the disappearance of health messages at INFO
623 for (const auto &old_metric : old_health) {
624 if (new_types.count(old_metric.type) == 0) {
f67539c2 625 mon.clog->info() << "MDS health message cleared ("
11fdf7f2 626 << m->get_orig_source() << "): " << old_metric.message;
7c673cae
FG
627 }
628 }
629
630 // Store health
631 pending_daemon_health[gid] = m->get_health();
632
522d829b 633 const auto& cs = m->get_compat();
7c673cae
FG
634 if (state == MDSMap::STATE_BOOT) {
635 // zap previous instance of this name?
11fdf7f2 636 if (g_conf()->mds_enforce_unique_name) {
7c673cae 637 bool failed_mds = false;
// Loops until no daemon with this name remains in the pending map.
28e407b8 638 while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
f67539c2
TL
639 if (!mon.osdmon()->is_writeable()) {
640 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
7c673cae
FG
641 return false;
642 }
522d829b 643 const auto& existing_info = pending.get_info_gid(existing);
f67539c2 644 mon.clog->info() << existing_info.human_name() << " restarted";
1adf2230 645 fail_mds_gid(pending, existing);
7c673cae
FG
646 failed_mds = true;
647 }
648 if (failed_mds) {
f67539c2
TL
649 ceph_assert(mon.osdmon()->is_writeable());
650 request_proposal(mon.osdmon());
7c673cae
FG
651 }
652 }
653
654 // Add this daemon to the map
28e407b8 655 if (pending.mds_roles.count(gid) == 0) {
7c673cae
FG
656 MDSMap::mds_info_t new_info;
657 new_info.global_id = gid;
658 new_info.name = m->get_name();
11fdf7f2 659 new_info.addrs = addrs;
7c673cae
FG
660 new_info.mds_features = m->get_mds_features();
661 new_info.state = MDSMap::STATE_STANDBY;
662 new_info.state_seq = seq;
522d829b 663 new_info.compat = cs;
9f95a23c
TL
664 if (m->get_fs().size()) {
// A filesystem name that does not resolve leaves join_fscid at
// FS_CLUSTER_ID_NONE.
665 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
666 auto f = pending.get_filesystem(m->get_fs());
667 if (f) {
668 fscid = f->fscid;
669 }
670 new_info.join_fscid = fscid;
671 }
522d829b 672 pending.insert(new_info);
7c673cae
FG
673 }
674
7c673cae 675 // initialize the beacon timer
1adf2230
AA
676 auto &beacon = last_beacon[gid];
677 beacon.stamp = mono_clock::now();
678 beacon.seq = seq;
7c673cae 679
7c673cae
FG
680 update_metadata(m->get_global_id(), m->get_sys_info());
681 } else {
682 // state update
91327a77
AA
683
684 if (!pending.gid_exists(gid)) {
685 /* gid has been removed from pending, send null map */
686 dout(5) << "mds_beacon " << *m << " is not in fsmap (state "
687 << ceph_mds_state_name(state) << ")" << dendl;
688
689 /* We can't send an MDSMap this MDS was a part of because we no longer
690 * know which FS it was part of. Nor does this matter. Sending an empty
691 * MDSMap is sufficient for getting the MDS to respawn.
692 */
a4b75251 693 goto null;
91327a77
AA
694 }
695
// `info` is a reference into the pending map.
// NOTE(review): presumably it must not be used after the gid is removed
// (pending.stop / fail_mds_gid) -- confirm.
11fdf7f2 696 const auto& info = pending.get_info_gid(gid);
522d829b
TL
697
698 // did the reported compat change? That's illegal!
699 if (cs.compare(info.compat) != 0) {
700 if (!mon.osdmon()->is_writeable()) {
701 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
702 return false;
703 }
704 mon.clog->warn() << info.human_name() << " compat changed unexpectedly";
705 fail_mds_gid(pending, gid);
706 request_proposal(mon.osdmon());
// NOTE(review): this path returns without registering a proposal completion
// or sending any reply for `op`.
707 return true;
708 }
709
2a845540
TL
710 if (state == MDSMap::STATE_DNE) {
711 dout(1) << __func__ << ": DNE from " << info << dendl;
a4b75251 712 goto evict;
2a845540
TL
713 }
714
715 // legal state change?
716 if ((info.state == MDSMap::STATE_STANDBY && state != info.state) ||
717 (info.state == MDSMap::STATE_STANDBY_REPLAY && state != info.state && state != MDSMap::STATE_DAMAGED)) {
718 // Standby daemons should never modify their own state.
719 // Except that standby-replay can indicate the rank is damaged due to failure to replay.
720 // Reject any attempts to do so.
721 derr << "standby " << gid << " attempted to change state to "
722 << ceph_mds_state_name(state) << ", rejecting" << dendl;
a4b75251 723 goto evict;
2a845540
TL
724 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
725 !MDSMap::state_transition_valid(info.state, state)) {
726 // Validate state transitions for daemons that hold a rank
727 derr << "daemon " << gid << " (rank " << info.rank << ") "
728 << "reported invalid state transition "
729 << ceph_mds_state_name(info.state) << " -> "
730 << ceph_mds_state_name(state) << dendl;
a4b75251 731 goto evict;
7c673cae
FG
732 }
733
734 if (info.laggy()) {
11fdf7f2
TL
735 dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl;
736 pending.modify_daemon(info.global_id, [](auto& info)
7c673cae 737 {
11fdf7f2 738 info.clear_laggy();
7c673cae
FG
739 }
740 );
741 }
9f95a23c 742
91327a77 743 dout(5) << "prepare_beacon mds." << info.rank
7c673cae
FG
744 << " " << ceph_mds_state_name(info.state)
745 << " -> " << ceph_mds_state_name(state)
7c673cae 746 << dendl;
9f95a23c
TL
747
// Record the daemon's requested join filesystem; an empty or unresolved
// name stores FS_CLUSTER_ID_NONE.
748 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
749 if (m->get_fs().size()) {
750 auto f = pending.get_filesystem(m->get_fs());
751 if (f) {
752 fscid = f->fscid;
753 }
754 }
755 pending.modify_daemon(gid, [fscid](auto& info) {
756 info.join_fscid = fscid;
757 });
758
7c673cae 759 if (state == MDSMap::STATE_STOPPED) {
28e407b8
AA
// NOTE(review): this `fscid` shadows the join-fs `fscid` declared above; here
// it is the filesystem id recorded in pending.mds_roles for this gid.
760 const auto fscid = pending.mds_roles.at(gid);
761 const auto &fs = pending.get_filesystem(fscid);
181888fb 762
f67539c2 763 mon.clog->info() << info.human_name() << " finished "
11fdf7f2 764 << "stopping rank " << info.rank << " in filesystem "
d2e6a577 765 << fs->mds_map.fs_name << " (now has "
181888fb 766 << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
d2e6a577 767
28e407b8 768 auto erased = pending.stop(gid);
7c673cae
FG
769 erased.push_back(gid);
770
// Forget beacon timing and health for every gid removed by the stop,
// including this one.
9f95a23c 771 for (const auto& erased_gid : erased) {
7c673cae
FG
772 last_beacon.erase(erased_gid);
773 if (pending_daemon_health.count(erased_gid)) {
774 pending_daemon_health.erase(erased_gid);
775 pending_daemon_health_rm.insert(erased_gid);
776 }
777 }
778 } else if (state == MDSMap::STATE_DAMAGED) {
f67539c2 779 if (!mon.osdmon()->is_writeable()) {
91327a77 780 dout(1) << __func__ << ": DAMAGED from rank " << info.rank
f67539c2
TL
781 << " waiting for osdmon writeable to blocklist it" << dendl;
782 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
7c673cae
FG
783 return false;
784 }
785
a4b75251
TL
786 auto rank = info.rank;
787
7c673cae
FG
788 // Record this MDS rank as damaged, so that other daemons
789 // won't try to run it.
a4b75251
TL
790 dout(0) << __func__ << ": marking rank " << rank << " damaged" << dendl;
791
792 auto fs = pending.get_filesystem(gid);
793 auto rankgid = fs->mds_map.get_gid(rank);
// Plain `auto`: rankinfo is a copy, so its addrs can still be read after the
// map is modified below.
794 auto rankinfo = pending.get_info_gid(rankgid);
795 auto followergid = fs->mds_map.get_standby_replay(rank);
796
// The reporter must be either the rank holder or its standby-replay follower.
797 ceph_assert(gid == rankgid || gid == followergid);
7c673cae
FG
798
799 utime_t until = ceph_clock_now();
f67539c2 800 until += g_conf().get_val<double>("mon_mds_blocklist_interval");
a4b75251
TL
801 const auto blocklist_epoch = mon.osdmon()->blocklist(rankinfo.addrs, until);
802 if (followergid != MDS_GID_NONE) {
803 fail_mds_gid(pending, followergid);
804 last_beacon.erase(followergid);
7c673cae 805 }
f67539c2 806 request_proposal(mon.osdmon());
aee94f69 807 force_immediate_propose();
a4b75251
TL
808 pending.damaged(rankgid, blocklist_epoch);
809 last_beacon.erase(rankgid);
7c673cae 810
a4b75251 811 /* MDS expects beacon reply back */
7c673cae 812 } else {
b32b8144 813 if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
28e407b8
AA
814 const auto &fscid = pending.mds_roles.at(gid);
815 const auto &fs = pending.get_filesystem(fscid);
f67539c2 816 mon.clog->info() << info.human_name() << " is now active in "
d2e6a577
FG
817 << "filesystem " << fs->mds_map.fs_name << " as rank "
818 << info.rank;
819 }
b32b8144
FG
820
821 // Made it through special cases and validations, record the
822 // daemon's reported state to the FSMap.
11fdf7f2
TL
823 pending.modify_daemon(gid, [state, seq](auto& info) {
824 info.state = state;
825 info.state_seq = seq;
b32b8144 826 });
7c673cae
FG
827 }
828 }
829
91327a77 830 dout(5) << "prepare_beacon pending map now:" << dendl;
28e407b8 831 print_map(pending);
7c673cae 832
// Completion: r >= 0 replies via _updated(), -ECANCELED drops the op, any
// other result re-dispatches it.
9f95a23c 833 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
7c673cae
FG
834 if (r >= 0)
835 _updated(op); // success
836 else if (r == -ECANCELED) {
f67539c2 837 mon.no_reply(op);
7c673cae
FG
838 } else {
839 dispatch(op); // try again
840 }
841 }));
842
a4b75251
TL
843 return true;
844
845evict:
// Fail the offending gid (needs a writeable OSDMap), then reply with a null
// MDSMap through the `null` label.
846 if (!mon.osdmon()->is_writeable()) {
847 dout(1) << __func__ << ": waiting for writeable OSDMap to evict" << dendl;
848 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
849 return false;
850 }
851
852 fail_mds_gid(pending, gid);
853 request_proposal(mon.osdmon());
854 dout(5) << __func__ << ": pending map now:" << dendl;
855 print_map(pending);
856
857 goto null;
858
859null:
// NOTE(review): unlike the completion registered above, -ECANCELED is not
// special-cased here; every negative result re-dispatches the op.
860 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
861 if (r >= 0) {
862 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
863 mon.send_reply(op, m.detach());
864 } else {
865 dispatch(op); // try again
866 }
867 }));
868
7c673cae
FG
869 return true;
870}
871
872bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
873{
28e407b8 874 auto &pending = get_pending_fsmap_writeable();
05a536ef 875 bool propose = false;
28e407b8 876
7c673cae 877 op->mark_mdsmon_event(__func__);
9f95a23c 878 auto m = op->get_req<MMDSLoadTargets>();
7c673cae 879 mds_gid_t gid = m->global_id;
28e407b8 880 if (pending.gid_has_rank(gid)) {
7c673cae 881 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
28e407b8 882 pending.update_export_targets(gid, m->targets);
05a536ef 883 propose = true;
7c673cae
FG
884 } else {
885 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
886 }
f67539c2 887 mon.no_reply(op);
05a536ef 888 return propose;
7c673cae
FG
889}
890
891bool MDSMonitor::should_propose(double& delay)
892{
893 // delegate to PaxosService to assess whether we should propose
894 return PaxosService::should_propose(delay);
895}
896
897void MDSMonitor::_updated(MonOpRequestRef op)
898{
28e407b8 899 const auto &fsmap = get_fsmap();
7c673cae 900 op->mark_mdsmon_event(__func__);
9f95a23c 901 auto m = op->get_req<MMDSBeacon>();
7c673cae 902 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
f67539c2 903 mon.clog->debug() << m->get_orig_source() << " "
11fdf7f2
TL
904 << m->get_orig_source_addrs() << " "
905 << ceph_mds_state_name(m->get_state());
7c673cae
FG
906
907 if (m->get_state() == MDSMap::STATE_STOPPED) {
908 // send the map manually (they're out of the map, so they won't get it automatic)
522d829b 909 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
f67539c2 910 mon.send_reply(op, m.detach());
7c673cae 911 } else {
f67539c2 912 auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
11fdf7f2
TL
913 m->get_global_id(), m->get_name(), fsmap.get_epoch(),
914 m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT);
f67539c2 915 mon.send_reply(op, beacon.detach());
7c673cae
FG
916 }
917}
918
919void MDSMonitor::on_active()
920{
921 tick();
7c673cae 922
28e407b8 923 if (is_leader()) {
f67539c2 924 mon.clog->debug() << "fsmap " << get_fsmap();
224ce89b 925 }
7c673cae
FG
926}
927
7c673cae
FG
928void MDSMonitor::dump_info(Formatter *f)
929{
930 f->open_object_section("fsmap");
28e407b8 931 get_fsmap().dump(f);
7c673cae
FG
932 f->close_section();
933
934 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
935 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
936}
937
938bool MDSMonitor::preprocess_command(MonOpRequestRef op)
939{
940 op->mark_mdsmon_event(__func__);
9f95a23c 941 auto m = op->get_req<MMonCommand>();
7c673cae
FG
942 int r = -1;
943 bufferlist rdata;
944 stringstream ss, ds;
945
11fdf7f2 946 cmdmap_t cmdmap;
7c673cae
FG
947 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
948 // ss has reason for failure
949 string rs = ss.str();
f67539c2 950 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
7c673cae
FG
951 return true;
952 }
953
954 string prefix;
9f95a23c 955 cmd_getval(cmdmap, "prefix", prefix);
20effc67 956 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
1adf2230 957 std::unique_ptr<Formatter> f(Formatter::create(format));
7c673cae 958
11fdf7f2 959 MonSession *session = op->get_session();
7c673cae 960 if (!session) {
f67539c2 961 mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
7c673cae
FG
962 return true;
963 }
964
f67539c2
TL
965 // to use const qualifier filter fsmap beforehand
966 FSMap _fsmap_copy = get_fsmap();
967 _fsmap_copy.filter(session->get_allowed_fs_names());
968 const auto& fsmap = _fsmap_copy;
969
7c673cae
FG
970 if (prefix == "mds stat") {
971 if (f) {
972 f->open_object_section("mds_stat");
973 dump_info(f.get());
974 f->close_section();
975 f->flush(ds);
976 } else {
977 ds << fsmap;
978 }
979 r = 0;
11fdf7f2
TL
980 } else if (prefix == "mds ok-to-stop") {
981 vector<string> ids;
9f95a23c 982 if (!cmd_getval(cmdmap, "ids", ids)) {
11fdf7f2
TL
983 r = -EINVAL;
984 ss << "must specify mds id";
985 goto out;
986 }
987 if (fsmap.is_any_degraded()) {
988 ss << "one or more filesystems is currently degraded";
989 r = -EBUSY;
990 goto out;
991 }
992 set<mds_gid_t> stopping;
993 for (auto& id : ids) {
994 ostringstream ess;
995 mds_gid_t gid = gid_from_arg(fsmap, id, ess);
996 if (gid == MDS_GID_NONE) {
997 // the mds doesn't exist, but no file systems are unhappy, so losing it
998 // can't have any effect.
999 continue;
1000 }
1001 stopping.insert(gid);
1002 }
1003 set<mds_gid_t> active;
1004 set<mds_gid_t> standby;
1005 for (auto gid : stopping) {
1006 if (fsmap.gid_has_rank(gid)) {
1007 // ignore standby-replay daemons (at this level)
1008 if (!fsmap.is_standby_replay(gid)) {
1009 auto standby = fsmap.get_standby_replay(gid);
1010 if (standby == MDS_GID_NONE ||
1011 stopping.count(standby)) {
1012 // no standby-replay, or we're also stopping the standby-replay
1013 // for this mds
1014 active.insert(gid);
1015 }
1016 }
7c673cae 1017 } else {
11fdf7f2
TL
1018 // net loss of a standby
1019 standby.insert(gid);
7c673cae
FG
1020 }
1021 }
11fdf7f2
TL
1022 if (fsmap.get_num_standby() - standby.size() < active.size()) {
1023 r = -EBUSY;
1024 ss << "insufficent standby MDS daemons to stop active gids "
1025 << stringify(active)
1026 << " and/or standby gids " << stringify(standby);;
1027 goto out;
28e407b8 1028 }
11fdf7f2
TL
1029 r = 0;
1030 ss << "should be safe to stop " << ids;
7c673cae
FG
1031 } else if (prefix == "fs dump") {
1032 int64_t epocharg;
1033 epoch_t epoch;
1034
1adf2230 1035 const FSMap *fsmapp = &fsmap;
28e407b8 1036 FSMap dummy;
9f95a23c 1037 if (cmd_getval(cmdmap, "epoch", epocharg)) {
7c673cae
FG
1038 epoch = epocharg;
1039 bufferlist b;
1040 int err = get_version(epoch, b);
1041 if (err == -ENOENT) {
7c673cae 1042 r = -ENOENT;
28e407b8 1043 goto out;
7c673cae 1044 } else {
11fdf7f2
TL
1045 ceph_assert(err == 0);
1046 ceph_assert(b.length());
28e407b8
AA
1047 dummy.decode(b);
1048 fsmapp = &dummy;
7c673cae
FG
1049 }
1050 }
c07f9fc5 1051
28e407b8
AA
1052 stringstream ds;
1053 if (f != NULL) {
1054 f->open_object_section("fsmap");
1055 fsmapp->dump(f.get());
1056 f->close_section();
1057 f->flush(ds);
1058 r = 0;
1059 } else {
1060 fsmapp->print(ds);
1061 r = 0;
7c673cae 1062 }
28e407b8
AA
1063
1064 rdata.append(ds);
1065 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
7c673cae
FG
1066 } else if (prefix == "mds metadata") {
1067 if (!f)
1068 f.reset(Formatter::create("json-pretty"));
1069
1070 string who;
9f95a23c 1071 bool all = !cmd_getval(cmdmap, "who", who);
7c673cae
FG
1072 dout(1) << "all = " << all << dendl;
1073 if (all) {
1074 r = 0;
1075 // Dump all MDSs' metadata
1076 const auto all_info = fsmap.get_mds_info();
1077
1078 f->open_array_section("mds_metadata");
1079 for(const auto &i : all_info) {
1080 const auto &info = i.second;
1081
1082 f->open_object_section("mds");
1083 f->dump_string("name", info.name);
1084 std::ostringstream get_err;
1adf2230 1085 r = dump_metadata(fsmap, info.name, f.get(), get_err);
7c673cae
FG
1086 if (r == -EINVAL || r == -ENOENT) {
1087 // Drop error, list what metadata we do have
1088 dout(1) << get_err.str() << dendl;
1089 r = 0;
1090 } else if (r != 0) {
1091 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1092 << dendl;
1093 ss << get_err.str();
c07f9fc5 1094 f->close_section();
7c673cae
FG
1095 break;
1096 }
1097 f->close_section();
1098 }
1099 f->close_section();
1100 } else {
1101 // Dump a single daemon's metadata
1102 f->open_object_section("mds_metadata");
1adf2230 1103 r = dump_metadata(fsmap, who, f.get(), ss);
7c673cae
FG
1104 f->close_section();
1105 }
1106 f->flush(ds);
31f18b77
FG
1107 } else if (prefix == "mds versions") {
1108 if (!f)
1109 f.reset(Formatter::create("json-pretty"));
1110 count_metadata("ceph_version", f.get());
1111 f->flush(ds);
1112 r = 0;
1113 } else if (prefix == "mds count-metadata") {
1114 if (!f)
1115 f.reset(Formatter::create("json-pretty"));
1116 string field;
9f95a23c 1117 cmd_getval(cmdmap, "property", field);
31f18b77
FG
1118 count_metadata(field, f.get());
1119 f->flush(ds);
1120 r = 0;
522d829b
TL
1121 } else if (prefix == "fs compat show") {
1122 string fs_name;
1123 cmd_getval(cmdmap, "fs_name", fs_name);
1124 const auto &fs = fsmap.get_filesystem(fs_name);
1125 if (fs == nullptr) {
1126 ss << "filesystem '" << fs_name << "' not found";
1127 r = -ENOENT;
1128 goto out;
1129 }
1130
1131 if (f) {
1132 f->open_object_section("mds_compat");
1133 fs->mds_map.compat.dump(f.get());
1134 f->close_section();
1135 f->flush(ds);
1136 } else {
1137 ds << fs->mds_map.compat;
1138 }
1139 r = 0;
7c673cae
FG
1140 } else if (prefix == "mds compat show") {
1141 if (f) {
1142 f->open_object_section("mds_compat");
522d829b 1143 fsmap.default_compat.dump(f.get());
7c673cae
FG
1144 f->close_section();
1145 f->flush(ds);
1146 } else {
522d829b 1147 ds << fsmap.default_compat;
7c673cae
FG
1148 }
1149 r = 0;
1150 } else if (prefix == "fs get") {
1151 string fs_name;
9f95a23c 1152 cmd_getval(cmdmap, "fs_name", fs_name);
28e407b8 1153 const auto &fs = fsmap.get_filesystem(fs_name);
7c673cae
FG
1154 if (fs == nullptr) {
1155 ss << "filesystem '" << fs_name << "' not found";
1156 r = -ENOENT;
1157 } else {
1158 if (f != nullptr) {
1159 f->open_object_section("filesystem");
1160 fs->dump(f.get());
1161 f->close_section();
1162 f->flush(ds);
1163 r = 0;
1164 } else {
1165 fs->print(ds);
1166 r = 0;
1167 }
1168 }
1169 } else if (prefix == "fs ls") {
1170 if (f) {
1171 f->open_array_section("filesystems");
1adf2230
AA
1172 for (const auto &p : fsmap.filesystems) {
1173 const auto &fs = p.second;
1174 f->open_object_section("filesystem");
1175 {
1176 const MDSMap &mds_map = fs->mds_map;
1177 f->dump_string("name", mds_map.fs_name);
1178 /* Output both the names and IDs of pools, for use by
1179 * humans and machines respectively */
f67539c2 1180 f->dump_string("metadata_pool", mon.osdmon()->osdmap.get_pool_name(
1adf2230
AA
1181 mds_map.metadata_pool));
1182 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1183 f->open_array_section("data_pool_ids");
1184 for (const auto &id : mds_map.data_pools) {
1185 f->dump_int("data_pool_id", id);
1186 }
1187 f->close_section();
7c673cae 1188
1adf2230
AA
1189 f->open_array_section("data_pools");
1190 for (const auto &id : mds_map.data_pools) {
f67539c2 1191 const auto &name = mon.osdmon()->osdmap.get_pool_name(id);
1adf2230 1192 f->dump_string("data_pool", name);
7c673cae
FG
1193 }
1194 f->close_section();
1195 }
1adf2230 1196 f->close_section();
7c673cae
FG
1197 }
1198 f->close_section();
1199 f->flush(ds);
1200 } else {
28e407b8
AA
1201 for (const auto &p : fsmap.filesystems) {
1202 const auto &fs = p.second;
7c673cae 1203 const MDSMap &mds_map = fs->mds_map;
f67539c2 1204 const string &md_pool_name = mon.osdmon()->osdmap.get_pool_name(
7c673cae
FG
1205 mds_map.metadata_pool);
1206
1207 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1208 << md_pool_name << ", data pools: [";
1adf2230 1209 for (const auto &id : mds_map.data_pools) {
f67539c2 1210 const string &pool_name = mon.osdmon()->osdmap.get_pool_name(id);
7c673cae
FG
1211 ds << pool_name << " ";
1212 }
1213 ds << "]" << std::endl;
1214 }
1215
1216 if (fsmap.filesystems.empty()) {
1217 ds << "No filesystems enabled" << std::endl;
1218 }
1219 }
1220 r = 0;
f67539c2
TL
1221 } else if (prefix == "fs feature ls") {
1222 if (f) {
1223 f->open_array_section("cephfs_features");
1224 for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
1225 f->open_object_section("feature");
1226 f->dump_int("index", i);
1227 f->dump_string("name", cephfs_feature_name(i));
1228 f->close_section();
1229 }
1230 f->close_section();
1231 f->flush(ds);
1232 } else {
1233 for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
1234 ds << i << " " << cephfs_feature_name(i) << std::endl;
1235 }
1236 }
1237 r = 0;
20effc67
TL
1238 } else if (prefix == "fs lsflags") {
1239 string fs_name;
1240 cmd_getval(cmdmap, "fs_name", fs_name);
1241 const auto &fs = fsmap.get_filesystem(fs_name);
1242 if (!fs) {
1243 ss << "filesystem '" << fs_name << "' not found";
1244 r = -ENOENT;
1245 } else {
1246 const MDSMap &mds_map = fs->mds_map;
1247 if (f) {
1248 mds_map.dump_flags_state(f.get());
1249 f->flush(ds);
1250 }
1251 else {
1252 mds_map.print_flags(ds);
1253 }
1254 r = 0;
1255 }
7c673cae
FG
1256 }
1257
28e407b8 1258out:
7c673cae
FG
1259 if (r != -1) {
1260 rdata.append(ds);
1261 string rs;
1262 getline(ss, rs);
f67539c2 1263 mon.reply_command(op, r, rs, rdata, get_last_committed());
7c673cae
FG
1264 return true;
1265 } else
1266 return false;
1267}
1268
1adf2230 1269bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
7c673cae 1270{
9f95a23c 1271 const auto& info = fsmap.get_info_gid(gid);
91327a77 1272 dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
7c673cae 1273
f67539c2 1274 ceph_assert(mon.osdmon()->is_writeable());
a8e16298 1275
f67539c2 1276 epoch_t blocklist_epoch = 0;
7c673cae
FG
1277 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1278 utime_t until = ceph_clock_now();
f67539c2
TL
1279 until += g_conf().get_val<double>("mon_mds_blocklist_interval");
1280 blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until);
aee94f69
TL
1281 /* do not delay when we are evicting an MDS */
1282 force_immediate_propose();
7c673cae
FG
1283 }
1284
f67539c2 1285 fsmap.erase(gid, blocklist_epoch);
7c673cae
FG
1286 last_beacon.erase(gid);
1287 if (pending_daemon_health.count(gid)) {
1288 pending_daemon_health.erase(gid);
1289 pending_daemon_health_rm.insert(gid);
1290 }
1291
f67539c2 1292 return blocklist_epoch != 0;
7c673cae
FG
1293}
1294
1adf2230 1295mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
7c673cae
FG
1296{
1297 // Try parsing as a role
1298 mds_role_t role;
1299 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1adf2230 1300 int r = fsmap.parse_role(arg, &role, ignore_err);
7c673cae
FG
1301 if (r == 0) {
1302 // See if a GID is assigned to this role
28e407b8 1303 const auto &fs = fsmap.get_filesystem(role.fscid);
11fdf7f2 1304 ceph_assert(fs != nullptr); // parse_role ensures it exists
7c673cae
FG
1305 if (fs->mds_map.is_up(role.rank)) {
1306 dout(10) << __func__ << ": validated rank/GID " << role
1307 << " as a rank" << dendl;
1308 return fs->mds_map.get_mds_info(role.rank).global_id;
1309 }
1310 }
1311
1312 // Try parsing as a gid
1313 std::string err;
1314 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1315 if (!err.empty()) {
1316 // Not a role or a GID, try as a daemon name
28e407b8 1317 const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
7c673cae
FG
1318 if (!mds_info) {
1319 ss << "MDS named '" << arg
1320 << "' does not exist, or is not up";
1321 return MDS_GID_NONE;
1322 }
1323 dout(10) << __func__ << ": resolved MDS name '" << arg
1324 << "' to GID " << mds_info->global_id << dendl;
1325 return mds_info->global_id;
1326 } else {
1327 // Not a role, but parses as a an integer, might be a GID
1328 dout(10) << __func__ << ": treating MDS reference '" << arg
1329 << "' as an integer " << maybe_gid << dendl;
31f18b77 1330
28e407b8 1331 if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
31f18b77 1332 return mds_gid_t(maybe_gid);
7c673cae
FG
1333 }
1334 }
1335
1336 dout(1) << __func__ << ": rank/GID " << arg
1337 << " not a existent rank or GID" << dendl;
1338 return MDS_GID_NONE;
1339}
1340
1adf2230
AA
1341int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
1342 const std::string &arg, MDSMap::mds_info_t *failed_info)
7c673cae 1343{
11fdf7f2 1344 ceph_assert(failed_info != nullptr);
d2e6a577 1345
1adf2230 1346 mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
7c673cae
FG
1347 if (gid == MDS_GID_NONE) {
1348 return 0;
1349 }
f67539c2 1350 if (!mon.osdmon()->is_writeable()) {
7c673cae
FG
1351 return -EAGAIN;
1352 }
d2e6a577
FG
1353
1354 // Take a copy of the info before removing the MDS from the map,
1355 // so that the caller knows which mds (if any) they ended up removing.
1adf2230 1356 *failed_info = fsmap.get_info_gid(gid);
d2e6a577 1357
1adf2230 1358 fail_mds_gid(fsmap, gid);
7c673cae 1359 ss << "failed mds gid " << gid;
f67539c2
TL
1360 ceph_assert(mon.osdmon()->is_writeable());
1361 request_proposal(mon.osdmon());
7c673cae
FG
1362 return 0;
1363}
1364
1365bool MDSMonitor::prepare_command(MonOpRequestRef op)
1366{
1367 op->mark_mdsmon_event(__func__);
9f95a23c 1368 auto m = op->get_req<MMonCommand>();
7c673cae
FG
1369 int r = -EINVAL;
1370 stringstream ss;
1371 bufferlist rdata;
1372
11fdf7f2 1373 cmdmap_t cmdmap;
7c673cae
FG
1374 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1375 string rs = ss.str();
f67539c2 1376 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
05a536ef 1377 return false;
7c673cae
FG
1378 }
1379
1380 string prefix;
9f95a23c 1381 cmd_getval(cmdmap, "prefix", prefix);
7c673cae
FG
1382
1383 /* Refuse access if message not associated with a valid session */
11fdf7f2 1384 MonSession *session = op->get_session();
7c673cae 1385 if (!session) {
f67539c2 1386 mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
05a536ef 1387 return false;
7c673cae
FG
1388 }
1389
28e407b8
AA
1390 auto &pending = get_pending_fsmap_writeable();
1391
c07f9fc5 1392 bool batched_propose = false;
28e407b8 1393 for (const auto &h : handlers) {
f67539c2
TL
1394 r = h->can_handle(prefix, op, pending, cmdmap, ss);
1395 if (r == 1) {
1396 ; // pass, since we got the right handler.
1397 } else if (r == 0) {
1398 continue;
1399 } else {
1400 goto out;
1401 }
c07f9fc5 1402
f67539c2
TL
1403 batched_propose = h->batched_propose();
1404 if (batched_propose) {
1405 paxos.plug();
1406 }
1407 r = h->handle(&mon, pending, op, cmdmap, ss);
1408 if (batched_propose) {
1409 paxos.unplug();
1410 }
1411
1412 if (r == -EAGAIN) {
1413 // message has been enqueued for retry; return.
1414 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1415 return false;
1416 } else {
1417 if (r == 0) {
1418 // On successful updates, print the updated map
1419 print_map(pending);
7c673cae 1420 }
f67539c2
TL
1421 // Successful or not, we're done: respond.
1422 goto out;
7c673cae
FG
1423 }
1424 }
1425
1adf2230 1426 r = filesystem_command(pending, op, prefix, cmdmap, ss);
7c673cae
FG
1427 if (r >= 0) {
1428 goto out;
1429 } else if (r == -EAGAIN) {
1430 // Do not reply, the message has been enqueued for retry
1431 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1432 return false;
1433 } else if (r != -ENOSYS) {
1434 goto out;
1435 }
1436
7c673cae
FG
1437 if (r == -ENOSYS && ss.str().empty()) {
1438 ss << "unrecognized command";
1439 }
1440
1441out:
1442 dout(4) << __func__ << " done, r=" << r << dendl;
1443 /* Compose response */
aee94f69 1444 string rs = ss.str();
7c673cae
FG
1445
1446 if (r >= 0) {
1447 // success.. delay reply
1448 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1449 get_last_committed() + 1));
c07f9fc5
FG
1450 if (batched_propose) {
1451 force_immediate_propose();
1452 }
7c673cae
FG
1453 return true;
1454 } else {
1455 // reply immediately
f67539c2 1456 mon.reply_command(op, r, rs, rdata, get_last_committed());
7c673cae
FG
1457 return false;
1458 }
1459}
1460
7c673cae 1461int MDSMonitor::filesystem_command(
1adf2230 1462 FSMap &fsmap,
7c673cae
FG
1463 MonOpRequestRef op,
1464 std::string const &prefix,
11fdf7f2 1465 const cmdmap_t& cmdmap,
7c673cae
FG
1466 std::stringstream &ss)
1467{
1468 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1469 op->mark_mdsmon_event(__func__);
1470 int r = 0;
1471 string whostr;
9f95a23c 1472 cmd_getval(cmdmap, "role", whostr);
7c673cae 1473
11fdf7f2 1474 if (prefix == "mds set_state") {
7c673cae 1475 mds_gid_t gid;
9f95a23c 1476 if (!cmd_getval(cmdmap, "gid", gid)) {
7c673cae 1477 ss << "error parsing 'gid' value '"
11fdf7f2 1478 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
7c673cae
FG
1479 return -EINVAL;
1480 }
1481 MDSMap::DaemonState state;
9f95a23c 1482 if (!cmd_getval(cmdmap, "state", state)) {
7c673cae 1483 ss << "error parsing 'state' string value '"
11fdf7f2 1484 << cmd_vartype_stringify(cmdmap.at("state")) << "'";
7c673cae
FG
1485 return -EINVAL;
1486 }
f67539c2 1487 if (fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
11fdf7f2
TL
1488 fsmap.modify_daemon(gid, [state](auto& info) {
1489 info.state = state;
7c673cae
FG
1490 });
1491 ss << "set mds gid " << gid << " to state " << state << " "
1492 << ceph_mds_state_name(state);
1493 return 0;
1494 }
1495 } else if (prefix == "mds fail") {
1496 string who;
9f95a23c 1497 cmd_getval(cmdmap, "role_or_gid", who);
d2e6a577
FG
1498
1499 MDSMap::mds_info_t failed_info;
f67539c2
TL
1500 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1501 if (gid == MDS_GID_NONE) {
1502 ss << "MDS named '" << who << "' does not exist, is not up or you "
1503 << "lack the permission to see.";
1504 return 0;
1505 }
1506 if(!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
1507 ss << "MDS named '" << who << "' does not exist, is not up or you "
1508 << "lack the permission to see.";
1509 return -EINVAL;
1510 }
1511 string_view fs_name = fsmap.fs_name_from_gid(gid);
1512 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1513 ss << "Permission denied.";
1514 return -EPERM;
1515 }
1516
1adf2230 1517 r = fail_mds(fsmap, ss, who, &failed_info);
7c673cae 1518 if (r < 0 && r == -EAGAIN) {
f67539c2 1519 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
7c673cae 1520 return -EAGAIN; // don't propose yet; wait for message to be retried
d2e6a577
FG
1521 } else if (r == 0) {
1522 // Only log if we really did something (not when was already gone)
1523 if (failed_info.global_id != MDS_GID_NONE) {
f67539c2 1524 mon.clog->info() << failed_info.human_name() << " marked failed by "
d2e6a577
FG
1525 << op->get_session()->entity_name;
1526 }
7c673cae
FG
1527 }
1528 } else if (prefix == "mds rm") {
1529 mds_gid_t gid;
9f95a23c 1530 if (!cmd_getval(cmdmap, "gid", gid)) {
7c673cae 1531 ss << "error parsing 'gid' value '"
11fdf7f2 1532 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
7c673cae
FG
1533 return -EINVAL;
1534 }
f67539c2 1535 if (!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
11fdf7f2 1536 ss << "mds gid " << gid << " does not exist";
f67539c2
TL
1537 return 0;
1538 }
1539 string_view fs_name = fsmap.fs_name_from_gid(gid);
1540 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1541 ss << "Permission denied.";
1542 return -EPERM;
1543 }
1544 const auto &info = fsmap.get_info_gid(gid);
1545 MDSMap::DaemonState state = info.state;
1546 if (state > 0) {
1547 ss << "cannot remove active mds." << info.name
1548 << " rank " << info.rank;
1549 return -EBUSY;
7c673cae 1550 } else {
f67539c2
TL
1551 fsmap.erase(gid, {});
1552 ss << "removed mds gid " << gid;
1553 return 0;
7c673cae
FG
1554 }
1555 } else if (prefix == "mds rmfailed") {
11fdf7f2 1556 bool confirm = false;
9f95a23c 1557 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
11fdf7f2 1558 if (!confirm) {
7c673cae
FG
1559 ss << "WARNING: this can make your filesystem inaccessible! "
1560 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1561 return -EPERM;
1562 }
1563
1564 std::string role_str;
9f95a23c 1565 cmd_getval(cmdmap, "role", role_str);
7c673cae 1566 mds_role_t role;
f67539c2
TL
1567 const auto fs_names = op->get_session()->get_allowed_fs_names();
1568 int r = fsmap.parse_role(role_str, &role, ss, fs_names);
7c673cae
FG
1569 if (r < 0) {
1570 ss << "invalid role '" << role_str << "'";
1571 return -EINVAL;
1572 }
f67539c2
TL
1573 string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
1574 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1575 ss << "Permission denied.";
1576 return -EPERM;
1577 }
7c673cae 1578
1adf2230 1579 fsmap.modify_filesystem(
7c673cae
FG
1580 role.fscid,
1581 [role](std::shared_ptr<Filesystem> fs)
1582 {
1583 fs->mds_map.failed.erase(role.rank);
1584 });
1585
1586 ss << "removed failed mds." << role;
1587 return 0;
522d829b 1588 /* TODO: convert to fs commands to update defaults */
7c673cae
FG
1589 } else if (prefix == "mds compat rm_compat") {
1590 int64_t f;
9f95a23c 1591 if (!cmd_getval(cmdmap, "feature", f)) {
7c673cae 1592 ss << "error parsing feature value '"
11fdf7f2 1593 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
7c673cae
FG
1594 return -EINVAL;
1595 }
522d829b 1596 if (fsmap.default_compat.compat.contains(f)) {
7c673cae 1597 ss << "removing compat feature " << f;
522d829b 1598 fsmap.default_compat.compat.remove(f);
7c673cae 1599 } else {
522d829b 1600 ss << "compat feature " << f << " not present in " << fsmap.default_compat;
7c673cae
FG
1601 }
1602 r = 0;
1603 } else if (prefix == "mds compat rm_incompat") {
1604 int64_t f;
9f95a23c 1605 if (!cmd_getval(cmdmap, "feature", f)) {
7c673cae 1606 ss << "error parsing feature value '"
11fdf7f2 1607 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
7c673cae
FG
1608 return -EINVAL;
1609 }
522d829b 1610 if (fsmap.default_compat.incompat.contains(f)) {
7c673cae 1611 ss << "removing incompat feature " << f;
522d829b 1612 fsmap.default_compat.incompat.remove(f);
7c673cae 1613 } else {
522d829b 1614 ss << "incompat feature " << f << " not present in " << fsmap.default_compat;
7c673cae
FG
1615 }
1616 r = 0;
1617 } else if (prefix == "mds repaired") {
1618 std::string role_str;
9f95a23c 1619 cmd_getval(cmdmap, "role", role_str);
7c673cae 1620 mds_role_t role;
f67539c2
TL
1621 const auto fs_names = op->get_session()->get_allowed_fs_names();
1622 r = fsmap.parse_role(role_str, &role, ss, fs_names);
7c673cae
FG
1623 if (r < 0) {
1624 return r;
1625 }
f67539c2
TL
1626 string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
1627 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1628 ss << "Permission denied.";
1629 return -EPERM;
1630 }
7c673cae 1631
1adf2230 1632 bool modified = fsmap.undamaged(role.fscid, role.rank);
7c673cae 1633 if (modified) {
494da23a 1634 ss << "repaired: restoring rank " << role;
7c673cae 1635 } else {
494da23a 1636 ss << "nothing to do: rank is not damaged";
7c673cae
FG
1637 }
1638
1639 r = 0;
11fdf7f2
TL
1640 } else if (prefix == "mds freeze") {
1641 std::string who;
9f95a23c 1642 cmd_getval(cmdmap, "role_or_gid", who);
11fdf7f2
TL
1643 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1644 if (gid == MDS_GID_NONE) {
7c673cae
FG
1645 return -EINVAL;
1646 }
1647
f67539c2
TL
1648 string_view fs_name = fsmap.fs_name_from_gid(gid);
1649 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1650 ss << "Permission denied.";
1651 return -EPERM;
1652 }
1653
11fdf7f2 1654 bool freeze = false;
7c673cae 1655 {
11fdf7f2 1656 std::string str;
9f95a23c 1657 cmd_getval(cmdmap, "val", str);
11fdf7f2
TL
1658 if ((r = parse_bool(str, &freeze, ss)) != 0) {
1659 return r;
1660 }
1661 }
7c673cae 1662
11fdf7f2
TL
1663 auto f = [freeze,gid,&ss](auto& info) {
1664 if (freeze) {
1665 ss << "freezing mds." << gid;
1666 info.freeze();
1667 } else {
1668 ss << "unfreezing mds." << gid;
1669 info.unfreeze();
1670 }
1671 };
1672 fsmap.modify_daemon(gid, f);
7c673cae
FG
1673 r = 0;
1674 } else {
1675 return -ENOSYS;
1676 }
1677
1678 return r;
1679}
1680
7c673cae
FG
1681void MDSMonitor::check_subs()
1682{
7c673cae
FG
1683 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1684 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1685 // filesystems. Build a list of all the types we service
1686 // subscriptions for.
9f95a23c
TL
1687
1688 std::vector<std::string> types = {
1689 "fsmap",
1690 "fsmap.user",
1691 "mdsmap",
1692 };
1693
28e407b8
AA
1694 for (const auto &p : get_fsmap().filesystems) {
1695 const auto &fscid = p.first;
9f95a23c
TL
1696 CachedStackStringStream cos;
1697 *cos << "mdsmap." << fscid;
1698 types.push_back(std::string(cos->strv()));
7c673cae
FG
1699 }
1700
1701 for (const auto &type : types) {
f67539c2 1702 auto& subs = mon.session_map.subs;
9f95a23c
TL
1703 auto subs_it = subs.find(type);
1704 if (subs_it == subs.end())
7c673cae 1705 continue;
9f95a23c
TL
1706 auto sub_it = subs_it->second->begin();
1707 while (!sub_it.end()) {
1708 auto sub = *sub_it;
1709 ++sub_it; // N.B. check_sub may remove sub!
7c673cae
FG
1710 check_sub(sub);
1711 }
1712 }
1713}
1714
1715
1716void MDSMonitor::check_sub(Subscription *sub)
1717{
1718 dout(20) << __func__ << ": " << sub->type << dendl;
1719
f67539c2
TL
1720 // to use const qualifier filter fsmap beforehand
1721 FSMap _fsmap_copy = get_fsmap();
1722 _fsmap_copy.filter(sub->session->get_allowed_fs_names());
1723 const auto& fsmap = _fsmap_copy;
1724 if (sub->next > fsmap.get_epoch()) {
1725 return;
1726 }
28e407b8 1727
7c673cae 1728 if (sub->type == "fsmap") {
f67539c2
TL
1729 sub->session->con->send_message(new MFSMap(mon.monmap->fsid, fsmap));
1730 if (sub->onetime) {
1731 mon.session_map.remove_sub(sub);
1732 } else {
1733 sub->next = fsmap.get_epoch() + 1;
7c673cae
FG
1734 }
1735 } else if (sub->type == "fsmap.user") {
f67539c2
TL
1736 FSMapUser fsmap_u;
1737 fsmap_u.epoch = fsmap.get_epoch();
1738 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1739 for (const auto &p : fsmap.filesystems) {
1740 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
1741 fs_info.cid = p.second->fscid;
1742 fs_info.name = p.second->mds_map.fs_name;
1743 }
1744 sub->session->con->send_message(new MFSMapUser(mon.monmap->fsid, fsmap_u));
1745 if (sub->onetime) {
1746 mon.session_map.remove_sub(sub);
1747 } else {
1748 sub->next = fsmap.get_epoch() + 1;
7c673cae
FG
1749 }
1750 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
11fdf7f2 1751 const bool is_mds = sub->session->name.is_mds();
7c673cae
FG
1752 mds_gid_t mds_gid = MDS_GID_NONE;
1753 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1754 if (is_mds) {
1755 // What (if any) namespace are you assigned to?
1756 auto mds_info = fsmap.get_mds_info();
1adf2230 1757 for (const auto &p : mds_info) {
11fdf7f2 1758 if (p.second.addrs == sub->session->addrs) {
1adf2230 1759 mds_gid = p.first;
7c673cae
FG
1760 fscid = fsmap.mds_roles.at(mds_gid);
1761 }
1762 }
1763 } else {
1764 // You're a client. Did you request a particular
1765 // namespace?
11fdf7f2 1766 if (sub->type.compare(0, 7, "mdsmap.") == 0) {
7c673cae
FG
1767 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1768 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1769 std::string err;
1770 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1771 if (!err.empty()) {
1772 // Client asked for a non-existent namespace, send them nothing
1773 dout(1) << "Invalid client subscription '" << sub->type
1774 << "'" << dendl;
1775 return;
1776 }
7c673cae
FG
1777 } else {
1778 // Unqualified request for "mdsmap": give it the one marked
1779 // for use by legacy clients.
1780 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1781 fscid = fsmap.legacy_client_fscid;
1782 } else {
1783 dout(1) << "Client subscribed for legacy filesystem but "
1784 "none is configured" << dendl;
1785 return;
1786 }
1787 }
b3b6e05e
TL
1788 if (!fsmap.filesystem_exists(fscid)) {
1789 // Client asked for a non-existent namespace, send them nothing
1790 // TODO: something more graceful for when a client has a filesystem
1791 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1792 // flag to MMDSMap?
1793 dout(1) << "Client subscribed to non-existent namespace '" <<
1794 fscid << "'" << dendl;
1795 return;
1796 }
7c673cae 1797 }
20effc67 1798 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid=" << fscid << dendl;
7c673cae
FG
1799
1800 // Work out the effective latest epoch
28e407b8 1801 const MDSMap *mds_map = nullptr;
522d829b 1802 MDSMap null_map = MDSMap::create_null_mdsmap();
7c673cae
FG
1803 if (fscid == FS_CLUSTER_ID_NONE) {
1804 // For a client, we should have already dropped out
11fdf7f2 1805 ceph_assert(is_mds);
7c673cae 1806
28e407b8
AA
1807 auto it = fsmap.standby_daemons.find(mds_gid);
1808 if (it != fsmap.standby_daemons.end()) {
7c673cae 1809 // For an MDS, we need to feed it an MDSMap with its own state in
28e407b8
AA
1810 null_map.mds_info[mds_gid] = it->second;
1811 null_map.epoch = fsmap.standby_epochs.at(mds_gid);
7c673cae
FG
1812 } else {
1813 null_map.epoch = fsmap.epoch;
1814 }
1815 mds_map = &null_map;
1816 } else {
1817 // Check the effective epoch
28e407b8 1818 mds_map = &fsmap.get_filesystem(fscid)->mds_map;
7c673cae
FG
1819 }
1820
11fdf7f2 1821 ceph_assert(mds_map != nullptr);
7c673cae
FG
1822 dout(10) << __func__ << " selected MDS map epoch " <<
1823 mds_map->epoch << " for namespace " << fscid << " for subscriber "
11fdf7f2 1824 << sub->session->name << " who wants epoch " << sub->next << dendl;
7c673cae
FG
1825
1826 if (sub->next > mds_map->epoch) {
1827 return;
1828 }
20effc67 1829 auto msg = make_message<MMDSMap>(mon.monmap->fsid, *mds_map);
7c673cae 1830
11fdf7f2 1831 sub->session->con->send_message(msg.detach());
7c673cae 1832 if (sub->onetime) {
f67539c2 1833 mon.session_map.remove_sub(sub);
7c673cae
FG
1834 } else {
1835 sub->next = mds_map->get_epoch() + 1;
1836 }
1837 }
1838}
1839
1840
1841void MDSMonitor::update_metadata(mds_gid_t gid,
1842 const map<string, string>& metadata)
1843{
05a536ef 1844 dout(20) << __func__ << ": mds." << gid << ": " << metadata << dendl;
7c673cae 1845 if (metadata.empty()) {
05a536ef 1846 dout(5) << __func__ << ": mds." << gid << ": no metadata!" << dendl;
7c673cae
FG
1847 return;
1848 }
1849 pending_metadata[gid] = metadata;
1850
f67539c2 1851 MonitorDBStore::TransactionRef t = paxos.get_pending_transaction();
7c673cae 1852 bufferlist bl;
11fdf7f2 1853 encode(pending_metadata, bl);
7c673cae 1854 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
7c673cae
FG
1855}
1856
1adf2230 1857void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
7c673cae
FG
1858{
1859 bool update = false;
1adf2230
AA
1860 for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) {
1861 if (!fsmap.gid_exists(it->first)) {
1862 it = pending_metadata.erase(it);
7c673cae
FG
1863 update = true;
1864 } else {
1adf2230 1865 ++it;
7c673cae
FG
1866 }
1867 }
1868 if (!update)
1869 return;
1870 bufferlist bl;
11fdf7f2 1871 encode(pending_metadata, bl);
7c673cae
FG
1872 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1873}
1874
1875int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1876{
1877 bufferlist bl;
f67539c2 1878 int r = mon.store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
7c673cae 1879 if (r) {
11fdf7f2 1880 dout(5) << "Unable to load 'last_metadata'" << dendl;
7c673cae
FG
1881 return r;
1882 }
1883
11fdf7f2
TL
1884 auto it = bl.cbegin();
1885 ceph::decode(m, it);
7c673cae
FG
1886 return 0;
1887}
1888
1adf2230 1889void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out)
31f18b77 1890{
31f18b77
FG
1891 map<mds_gid_t,Metadata> meta;
1892 load_metadata(meta);
1893 for (auto& p : meta) {
1894 auto q = p.second.find(field);
1895 if (q == p.second.end()) {
c07f9fc5 1896 (*out)["unknown"]++;
31f18b77 1897 } else {
c07f9fc5 1898 (*out)[q->second]++;
31f18b77
FG
1899 }
1900 }
c07f9fc5
FG
1901}
1902
1adf2230 1903void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
c07f9fc5
FG
1904{
1905 map<string,int> by_val;
1906 count_metadata(field, &by_val);
31f18b77
FG
1907 f->open_object_section(field.c_str());
1908 for (auto& p : by_val) {
1909 f->dump_int(p.first.c_str(), p.second);
1910 }
1911 f->close_section();
1912}
1913
f67539c2
TL
1914void MDSMonitor::get_versions(std::map<string, list<string> > &versions)
1915{
1916 map<mds_gid_t,Metadata> meta;
1917 load_metadata(meta);
1918 const auto &fsmap = get_fsmap();
1919 std::map<mds_gid_t, mds_info_t> map = fsmap.get_mds_info();
1920 dout(10) << __func__ << " mds meta=" << meta << dendl;
1921 for (auto& p : meta) {
1922 auto q = p.second.find("ceph_version_short");
1923 if (q == p.second.end()) continue;
1924 versions[q->second].push_back(string("mds.") + map[p.first].name);
1925 }
1926}
1927
1adf2230
AA
1928int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
1929 Formatter *f, ostream& err)
7c673cae 1930{
11fdf7f2 1931 ceph_assert(f);
7c673cae 1932
1adf2230 1933 mds_gid_t gid = gid_from_arg(fsmap, who, err);
7c673cae
FG
1934 if (gid == MDS_GID_NONE) {
1935 return -EINVAL;
1936 }
1937
1938 map<mds_gid_t, Metadata> metadata;
1939 if (int r = load_metadata(metadata)) {
1940 err << "Unable to load 'last_metadata'";
1941 return r;
1942 }
1943
1944 if (!metadata.count(gid)) {
1945 return -ENOENT;
1946 }
1947 const Metadata& m = metadata[gid];
1948 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1949 f->dump_string(p->first.c_str(), p->second);
1950 }
1951 return 0;
1952}
1953
1954int MDSMonitor::print_nodes(Formatter *f)
1955{
11fdf7f2 1956 ceph_assert(f);
7c673cae 1957
1adf2230
AA
1958 const auto &fsmap = get_fsmap();
1959
7c673cae
FG
1960 map<mds_gid_t, Metadata> metadata;
1961 if (int r = load_metadata(metadata)) {
1962 return r;
1963 }
1964
11fdf7f2 1965 map<string, list<string> > mdses; // hostname => mds
1adf2230
AA
1966 for (const auto &p : metadata) {
1967 const mds_gid_t& gid = p.first;
1968 const Metadata& m = p.second;
7c673cae
FG
1969 Metadata::const_iterator hostname = m.find("hostname");
1970 if (hostname == m.end()) {
1971 // not likely though
1972 continue;
1973 }
1adf2230 1974 if (!fsmap.gid_exists(gid)) {
7c673cae
FG
1975 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1976 continue;
1977 }
1adf2230 1978 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
11fdf7f2 1979 mdses[hostname->second].push_back(mds_info.name);
7c673cae
FG
1980 }
1981
1982 dump_services(f, mdses, "mds");
1983 return 0;
1984}
1985
1986/**
1987 * If a cluster is undersized (with respect to max_mds), then
11fdf7f2
TL
1988 * attempt to find daemons to grow it. If the cluster is oversized
1989 * (with respect to max_mds) then shrink it by stopping its highest rank.
7c673cae 1990 */
11fdf7f2 1991bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
7c673cae 1992{
11fdf7f2 1993 auto&& fs = fsmap.get_filesystem(fscid);
1adf2230 1994 auto &mds_map = fs->mds_map;
7c673cae 1995
1adf2230
AA
1996 int in = mds_map.get_num_in_mds();
1997 int max = mds_map.get_max_mds();
1998
1999 dout(20) << __func__ << " in " << in << " max " << max << dendl;
2000
11fdf7f2
TL
2001 /* Check that both the current epoch mds_map is resizeable as well as the
2002 * current batch of changes in pending. This is important if an MDS is
2003 * becoming active in the next epoch.
2004 */
05a536ef
TL
2005 if (!get_fsmap().filesystem_exists(fscid) ||
2006 !get_fsmap().get_filesystem(fscid)->mds_map.is_resizeable() ||
11fdf7f2
TL
2007 !mds_map.is_resizeable()) {
2008 dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
2009 return false;
2010 }
2011
2012 if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
7c673cae 2013 mds_rank_t mds = mds_rank_t(0);
1adf2230 2014 while (mds_map.is_in(mds)) {
7c673cae
FG
2015 mds++;
2016 }
9f95a23c
TL
2017 auto info = fsmap.find_replacement_for({fscid, mds});
2018 if (!info) {
1adf2230 2019 return false;
7c673cae
FG
2020 }
2021
9f95a23c 2022 dout(1) << "assigned standby " << info->addrs
7c673cae 2023 << " as mds." << mds << dendl;
f67539c2 2024 mon.clog->info() << info->human_name() << " assigned to "
1adf2230
AA
2025 "filesystem " << mds_map.fs_name << " as rank "
2026 << mds << " (now has " << mds_map.get_num_in_mds() + 1
d2e6a577 2027 << " ranks)";
9f95a23c 2028 fsmap.promote(info->global_id, *fs, mds);
1adf2230 2029 return true;
11fdf7f2
TL
2030 } else if (in > max) {
2031 mds_rank_t target = in - 1;
2032 const auto &info = mds_map.get_info(target);
2033 if (mds_map.is_active(target)) {
2034 dout(1) << "stopping " << target << dendl;
f67539c2 2035 mon.clog->info() << "stopping " << info.human_name();
11fdf7f2
TL
2036 auto f = [](auto& info) {
2037 info.state = MDSMap::STATE_STOPPING;
2038 };
2039 fsmap.modify_daemon(info.global_id, f);
2040 return true;
2041 } else {
2042 dout(20) << "skipping stop of " << target << dendl;
2043 return false;
2044 }
7c673cae
FG
2045 }
2046
1adf2230 2047 return false;
7c673cae
FG
2048}
2049
2050
2051/**
9f95a23c 2052 * Fail a daemon and replace it with a suitable standby.
7c673cae 2053 */
9f95a23c 2054bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose)
7c673cae 2055{
11fdf7f2 2056 ceph_assert(osd_propose != nullptr);
7c673cae 2057
1adf2230 2058 const auto fscid = fsmap.mds_roles.at(gid);
9f95a23c
TL
2059 const auto& info = fsmap.get_info_gid(gid);
2060 const auto rank = info.rank;
2061 const auto state = info.state;
2062
2063 if (info.is_frozen()) {
2064 return false;
2065 } else if (state == MDSMap::STATE_STANDBY_REPLAY ||
2066 state == MDSMap::STATE_STANDBY) {
2067 dout(1) << " failing and removing standby " << gid << " " << info.addrs
2068 << " mds." << rank
2069 << "." << info.inc << " " << ceph_mds_state_name(state)
2070 << dendl;
2071 *osd_propose |= fail_mds_gid(fsmap, gid);
2072 return true;
2073 } else if (rank >= 0 && rep_info) {
2074 auto fs = fsmap.filesystems.at(fscid);
2075 if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2076 return false;
2077 }
2078 // are we in?
2079 // and is there a non-laggy standby that can take over for us?
2080 dout(1) << " replacing " << gid << " " << info.addrs
2081 << " mds." << rank << "." << info.inc
2082 << " " << ceph_mds_state_name(state)
2083 << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs
2084 << dendl;
2085
f67539c2 2086 mon.clog->warn() << "Replacing " << info.human_name()
9f95a23c
TL
2087 << " as rank " << rank
2088 << " with standby " << rep_info->human_name();
2089
2090 // Remove the old one
2091 *osd_propose |= fail_mds_gid(fsmap, gid);
2092
2093 // Promote the replacement
2094 fsmap.promote(rep_info->global_id, *fs, rank);
2095
2096 return true;
2097 }
2098 return false;
2099}
2100
2101bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap)
2102{
2103 bool do_propose = false;
2104 const auto now = mono_clock::now();
f67539c2 2105 const bool osdmap_writeable = mon.osdmon()->is_writeable();
9f95a23c
TL
2106 const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace");
2107 const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
2108
2109 if (mono_clock::is_zero(last_tick)) {
2110 last_tick = now;
2111 }
2112
2113 {
2114 auto since_last = std::chrono::duration<double>(now-last_tick);
2115
2116 if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) {
2117 // This case handles either local slowness (calls being delayed
2118 // for whatever reason) or cluster election slowness (a long gap
2119 // between calls while an election happened)
2120 dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
2121 "(slow election?) of " << since_last.count() << " seconds" << dendl;
2122 for (auto& p : last_beacon) {
2123 p.second.stamp = now;
2124 }
2125 }
2126 }
2127
2128 // make sure last_beacon is fully populated
2129 for (auto& p : fsmap.mds_roles) {
2130 auto& gid = p.first;
2131 last_beacon.emplace(std::piecewise_construct,
2132 std::forward_as_tuple(gid),
2133 std::forward_as_tuple(now, 0));
2134 }
7c673cae 2135
31f18b77 2136 // We will only take decisive action (replacing/removing a daemon)
9f95a23c 2137 // if we have some indication that some other daemon(s) are successfully
31f18b77 2138 // getting beacons through recently.
1adf2230 2139 mono_time latest_beacon = mono_clock::zero();
9f95a23c 2140 for (const auto& p : last_beacon) {
1adf2230 2141 latest_beacon = std::max(p.second.stamp, latest_beacon);
31f18b77 2142 }
f67539c2 2143 auto since = std::chrono::duration<double>(now-latest_beacon);
1adf2230 2144 const bool may_replace = since.count() <
11fdf7f2 2145 std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5);
31f18b77 2146
9f95a23c
TL
2147 // check beacon timestamps
2148 std::vector<mds_gid_t> to_remove;
20effc67
TL
2149 const bool mon_down = mon.is_mon_down();
2150 const auto mds_beacon_mon_down_grace =
2151 g_conf().get_val<std::chrono::seconds>("mds_beacon_mon_down_grace");
2152 const auto quorum_age = std::chrono::seconds(mon.quorum_age());
2153 const bool new_quorum = quorum_age < mds_beacon_mon_down_grace;
9f95a23c
TL
2154 for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
2155 auto& [gid, beacon_info] = *it;
f67539c2 2156 auto since_last = std::chrono::duration<double>(now-beacon_info.stamp);
9f95a23c
TL
2157
2158 if (!fsmap.gid_exists(gid)) {
2159 // gid no longer exists, remove from tracked beacons
2160 it = last_beacon.erase(it);
2161 continue;
2162 }
7c673cae 2163
9f95a23c
TL
2164 if (since_last.count() >= g_conf()->mds_beacon_grace) {
2165 auto& info = fsmap.get_info_gid(gid);
2166 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2167 << " (gid: " << gid << " addr: " << info.addrs
2168 << " state: " << ceph_mds_state_name(info.state) << ")"
2169 << " since " << since_last.count() << dendl;
20effc67
TL
2170 if ((mon_down || new_quorum) && since_last < mds_beacon_mon_down_grace) {
2171 /* The MDS may be sending beacons to a monitor not yet in quorum or
2172 * temporarily partitioned. Hold off on removal for a little longer...
2173 */
2174 dout(10) << "deferring removal for mds_beacon_mon_down_grace during MON_DOWN" << dendl;
2175 ++it;
2176 continue;
2177 }
f67539c2 2178 // If the OSDMap is writeable, we can blocklist things, so we can
9f95a23c
TL
2179 // try failing any laggy MDS daemons. Consider each one for failure.
2180 if (!info.laggy()) {
2181 dout(1) << " marking " << gid << " " << info.addrs
2182 << " mds." << info.rank << "." << info.inc
2183 << " " << ceph_mds_state_name(info.state)
2184 << " laggy" << dendl;
2185 fsmap.modify_daemon(info.global_id, [](auto& info) {
2186 info.laggy_since = ceph_clock_now();
2187 });
2188 do_propose = true;
2189 }
2190 if (osdmap_writeable && may_replace) {
2191 to_remove.push_back(gid); // drop_mds may invalidate iterator
2192 }
2193 }
31f18b77 2194
9f95a23c
TL
2195 ++it;
2196 }
7c673cae 2197
9f95a23c 2198 for (const auto& gid : to_remove) {
f6b5b4d7 2199 auto info = fsmap.get_info_gid(gid);
9f95a23c
TL
2200 const mds_info_t* rep_info = nullptr;
2201 if (info.rank >= 0) {
f67539c2 2202 auto fscid = fsmap.fscid_from_gid(gid);
9f95a23c
TL
2203 rep_info = fsmap.find_replacement_for({fscid, info.rank});
2204 }
2205 bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap);
2206 if (dropped) {
f67539c2 2207 mon.clog->info() << "MDS " << info.human_name()
9f95a23c
TL
2208 << " is removed because it is dead or otherwise unavailable.";
2209 do_propose = true;
2210 }
2211 }
7c673cae 2212
9f95a23c
TL
2213 if (osdmap_writeable) {
2214 for (auto& [fscid, fs] : fsmap.filesystems) {
2215 if (!fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
2216 fs->mds_map.is_resizeable()) {
2217 // Check if a rank or standby-replay should be replaced with a stronger
2218 // affinity standby. This looks at ranks and standby-replay:
2219 for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
2220 const auto join_fscid = info.join_fscid;
2221 if (join_fscid == fscid)
2222 continue;
2223 const auto rank = info.rank;
2224 const auto state = info.state;
2225 const mds_info_t* rep_info = nullptr;
2226 if (state == MDSMap::STATE_STANDBY_REPLAY) {
522d829b 2227 rep_info = fsmap.get_available_standby(*fs);
9f95a23c
TL
2228 } else if (state == MDSMap::STATE_ACTIVE) {
2229 rep_info = fsmap.find_replacement_for({fscid, rank});
2230 } else {
2231 /* N.B. !is_degraded() */
2232 ceph_abort_msg("invalid state in MDSMap");
2233 }
2234 if (!rep_info) {
2235 break;
2236 }
2237 bool better_affinity = false;
2238 if (join_fscid == FS_CLUSTER_ID_NONE) {
2239 better_affinity = (rep_info->join_fscid == fscid);
2240 } else {
2241 better_affinity = (rep_info->join_fscid == fscid) ||
2242 (rep_info->join_fscid == FS_CLUSTER_ID_NONE);
2243 }
2244 if (better_affinity) {
2245 if (state == MDSMap::STATE_STANDBY_REPLAY) {
f67539c2 2246 mon.clog->info() << "Dropping low affinity standby-replay "
9f95a23c
TL
2247 << info.human_name()
2248 << " in favor of higher affinity standby.";
2249 *propose_osdmap |= fail_mds_gid(fsmap, gid);
2250 /* Now let maybe_promote_standby do the promotion. */
2251 } else {
f67539c2 2252 mon.clog->info() << "Dropping low affinity active "
9f95a23c
TL
2253 << info.human_name()
2254 << " in favor of higher affinity standby.";
2255 do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap);
2256 }
2257 break; /* don't replace more than one per tick per fs */
2258 }
2259 }
2260 }
2261 }
7c673cae 2262 }
9f95a23c 2263 return do_propose;
7c673cae
FG
2264}
2265
11fdf7f2 2266bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
7c673cae 2267{
11fdf7f2
TL
2268 if (fs.mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2269 return false;
2270 }
7c673cae
FG
2271
2272 bool do_propose = false;
2273
2274 // have a standby take over?
2275 set<mds_rank_t> failed;
11fdf7f2
TL
2276 fs.mds_map.get_failed_mds_set(failed);
2277 for (const auto& rank : failed) {
9f95a23c
TL
2278 auto info = fsmap.find_replacement_for({fs.fscid, rank});
2279 if (info) {
2280 dout(1) << " taking over failed mds." << rank << " with " << info->global_id
2281 << "/" << info->name << " " << info->addrs << dendl;
f67539c2 2282 mon.clog->info() << "Standby " << info->human_name()
11fdf7f2
TL
2283 << " assigned to filesystem " << fs.mds_map.fs_name
2284 << " as rank " << rank;
2285
9f95a23c 2286 fsmap.promote(info->global_id, fs, rank);
11fdf7f2 2287 do_propose = true;
7c673cae 2288 }
11fdf7f2
TL
2289 }
2290
f67539c2 2291 if (fs.mds_map.is_resizeable() && fs.mds_map.allows_standby_replay()) {
7c673cae 2292 // There were no failures to replace, so try using any available standbys
a8e16298
TL
2293 // as standby-replay daemons. Don't do this when the cluster is degraded
2294 // as a standby-replay daemon may try to read a journal being migrated.
11fdf7f2 2295 for (;;) {
522d829b 2296 auto info = fsmap.get_available_standby(fs);
9f95a23c
TL
2297 if (!info) break;
2298 dout(20) << "standby available mds." << info->global_id << dendl;
11fdf7f2
TL
2299 bool changed = false;
2300 for (const auto& rank : fs.mds_map.in) {
9f95a23c 2301 dout(20) << "examining " << rank << dendl;
11fdf7f2 2302 if (fs.mds_map.is_followable(rank)) {
9f95a23c 2303 dout(1) << " setting mds." << info->global_id
11fdf7f2 2304 << " to follow mds rank " << rank << dendl;
9f95a23c 2305 fsmap.assign_standby_replay(info->global_id, fs.fscid, rank);
11fdf7f2
TL
2306 do_propose = true;
2307 changed = true;
2308 break;
7c673cae 2309 }
7c673cae 2310 }
11fdf7f2 2311 if (!changed) break;
7c673cae
FG
2312 }
2313 }
2314
2315 return do_propose;
2316}
2317
2318void MDSMonitor::tick()
2319{
1adf2230 2320 if (!is_active() || !is_leader()) return;
28e407b8
AA
2321
2322 auto &pending = get_pending_fsmap_writeable();
7c673cae 2323
28e407b8 2324 bool do_propose = false;
9f95a23c 2325 bool propose_osdmap = false;
7c673cae 2326
522d829b
TL
2327 if (check_fsmap_struct_version) {
2328 /* Allow time for trimming otherwise PaxosService::is_writeable will always
2329 * be false.
2330 */
2331
2332 auto now = clock::now();
2333 auto elapsed = now - last_fsmap_struct_flush;
2334 if (elapsed > std::chrono::seconds(30)) {
2335 FSMap fsmap;
2336 bufferlist bl;
2337 auto v = get_first_committed();
2338 int err = get_version(v, bl);
2339 if (err) {
2340 derr << "could not get version " << v << dendl;
2341 ceph_abort();
2342 }
a4b75251
TL
2343 try {
2344 fsmap.decode(bl);
2345 } catch (const ceph::buffer::malformed_input& e) {
2346 dout(5) << "flushing old fsmap struct because unable to decode FSMap: " << e.what() << dendl;
2347 }
2348 /* N.B. FSMap::is_struct_old is also true for undecoded (failed to decode) FSMap */
522d829b
TL
2349 if (fsmap.is_struct_old()) {
2350 dout(5) << "fsmap struct is too old; proposing to flush out old versions" << dendl;
2351 do_propose = true;
2352 last_fsmap_struct_flush = now;
2353 } else {
2354 dout(20) << "struct is recent" << dendl;
2355 check_fsmap_struct_version = false;
2356 }
2357 }
2358 }
2359
28e407b8 2360 do_propose |= pending.check_health();
7c673cae 2361
9f95a23c
TL
2362 /* Check health and affinity of ranks */
2363 do_propose |= check_health(pending, &propose_osdmap);
7c673cae 2364
9f95a23c
TL
2365 /* Resize the cluster according to max_mds. */
2366 for (auto& p : pending.filesystems) {
2367 do_propose |= maybe_resize_cluster(pending, p.second->fscid);
7c673cae
FG
2368 }
2369
9f95a23c
TL
2370 /* Replace any failed ranks. */
2371 for (auto& p : pending.filesystems) {
2372 do_propose |= maybe_promote_standby(pending, *p.second);
7c673cae
FG
2373 }
2374
c07f9fc5 2375 if (propose_osdmap) {
f67539c2 2376 request_proposal(mon.osdmon());
c07f9fc5 2377 }
7c673cae 2378
7c673cae
FG
2379 if (do_propose) {
2380 propose_pending();
2381 }
9f95a23c
TL
2382
2383 last_tick = mono_clock::now();
7c673cae
FG
2384}
2385
f67539c2 2386MDSMonitor::MDSMonitor(Monitor &mn, Paxos &p, string service_name)
7c673cae
FG
2387 : PaxosService(mn, p, service_name)
2388{
f67539c2 2389 handlers = FileSystemCommandHandler::load(&p);
7c673cae
FG
2390}
2391
2392void MDSMonitor::on_restart()
2393{
2394 // Clear out the leader-specific state.
1adf2230 2395 last_tick = mono_clock::now();
7c673cae
FG
2396 last_beacon.clear();
2397}
2398