]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/MDSMonitor.cc
import ceph 16.2.7
[ceph.git] / ceph / src / mon / MDSMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
11fdf7f2 15#include <regex>
7c673cae
FG
16#include <sstream>
17#include <boost/utility.hpp>
18
19#include "MDSMonitor.h"
20#include "FSCommands.h"
21#include "Monitor.h"
22#include "MonitorDBStore.h"
23#include "OSDMonitor.h"
7c673cae
FG
24
25#include "common/strtol.h"
26#include "common/perf_counters.h"
27#include "common/config.h"
28#include "common/cmdparse.h"
29#include "messages/MMDSMap.h"
30#include "messages/MFSMap.h"
31#include "messages/MFSMapUser.h"
32#include "messages/MMDSLoadTargets.h"
33#include "messages/MMonCommand.h"
34#include "messages/MGenericMessage.h"
35
11fdf7f2 36#include "include/ceph_assert.h"
7c673cae
FG
37#include "include/str_list.h"
38#include "include/stringify.h"
39#include "mds/mdstypes.h"
40#include "Session.h"
41
f67539c2
TL
42using namespace TOPNSPC::common;
43
44using std::dec;
45using std::hex;
46using std::list;
47using std::map;
48using std::make_pair;
49using std::ostream;
50using std::ostringstream;
51using std::pair;
52using std::set;
53using std::string;
54using std::string_view;
55using std::stringstream;
56using std::to_string;
57using std::vector;
58
59using ceph::bufferlist;
60using ceph::decode;
61using ceph::encode;
62using ceph::ErasureCodeInterfaceRef;
63using ceph::ErasureCodeProfile;
64using ceph::Formatter;
65using ceph::JSONFormatter;
66using ceph::make_message;
67using ceph::mono_clock;
68using ceph::mono_time;
69
7c673cae
FG
70#define dout_subsys ceph_subsys_mon
71#undef dout_prefix
28e407b8 72#define dout_prefix _prefix(_dout, mon, get_fsmap())
f67539c2
TL
73static ostream& _prefix(std::ostream *_dout, Monitor &mon, const FSMap& fsmap) {
74 return *_dout << "mon." << mon.name << "@" << mon.rank
75 << "(" << mon.get_state_name()
7c673cae
FG
76 << ").mds e" << fsmap.get_epoch() << " ";
77}
78
3efd9988
FG
79static const string MDS_METADATA_PREFIX("mds_metadata");
80static const string MDS_HEALTH_PREFIX("mds_health");
81
82
7c673cae
FG
83/*
84 * Specialized implementation of cmd_getval to allow us to parse
85 * out strongly-typedef'd types
86 */
9f95a23c
TL
87namespace TOPNSPC::common {
88template<> bool cmd_getval(const cmdmap_t& cmdmap,
31f18b77 89 const std::string& k, mds_gid_t &val)
7c673cae 90{
9f95a23c 91 return cmd_getval(cmdmap, k, (int64_t&)val);
7c673cae
FG
92}
93
9f95a23c 94template<> bool cmd_getval(const cmdmap_t& cmdmap,
31f18b77 95 const std::string& k, mds_rank_t &val)
7c673cae 96{
9f95a23c 97 return cmd_getval(cmdmap, k, (int64_t&)val);
7c673cae
FG
98}
99
9f95a23c 100template<> bool cmd_getval(const cmdmap_t& cmdmap,
31f18b77 101 const std::string& k, MDSMap::DaemonState &val)
7c673cae 102{
9f95a23c
TL
103 return cmd_getval(cmdmap, k, (int64_t&)val);
104}
7c673cae 105}
7c673cae
FG
106// my methods
107
11fdf7f2
TL
108template <int dblV>
109void MDSMonitor::print_map(const FSMap& m)
7c673cae 110{
11fdf7f2 111 dout(dblV) << "print_map\n";
7c673cae
FG
112 m.print(*_dout);
113 *_dout << dendl;
114}
115
116// service methods
117void MDSMonitor::create_initial()
118{
119 dout(10) << "create_initial" << dendl;
120}
121
11fdf7f2 122void MDSMonitor::get_store_prefixes(std::set<string>& s) const
3efd9988
FG
123{
124 s.insert(service_name);
125 s.insert(MDS_METADATA_PREFIX);
126 s.insert(MDS_HEALTH_PREFIX);
127}
7c673cae
FG
128
129void MDSMonitor::update_from_paxos(bool *need_bootstrap)
130{
131 version_t version = get_last_committed();
28e407b8 132 if (version == get_fsmap().epoch)
7c673cae
FG
133 return;
134
135 dout(10) << __func__ << " version " << version
28e407b8 136 << ", my e " << get_fsmap().epoch << dendl;
11fdf7f2 137 ceph_assert(version > get_fsmap().epoch);
7c673cae 138
224ce89b
WB
139 load_health();
140
7c673cae
FG
141 // read and decode
142 bufferlist fsmap_bl;
143 fsmap_bl.clear();
144 int err = get_version(version, fsmap_bl);
11fdf7f2 145 ceph_assert(err == 0);
7c673cae 146
11fdf7f2 147 ceph_assert(fsmap_bl.length() > 0);
7c673cae 148 dout(10) << __func__ << " got " << version << dendl;
522d829b
TL
149 try {
150 PaxosFSMap::decode(fsmap_bl);
151 } catch (const ceph::buffer::malformed_input& e) {
152 derr << "unable to decode FSMap: " << e.what() << dendl;
153 throw;
154 }
7c673cae
FG
155
156 // new map
91327a77 157 dout(0) << "new map" << dendl;
11fdf7f2
TL
158 print_map<0>(get_fsmap());
159 if (!g_conf()->mon_mds_skip_sanity) {
28e407b8 160 get_fsmap().sanity();
7c673cae
FG
161 }
162
163 check_subs();
7c673cae
FG
164}
165
166void MDSMonitor::init()
167{
168 (void)load_metadata(pending_metadata);
169}
170
171void MDSMonitor::create_pending()
172{
28e407b8 173 auto &fsmap = PaxosFSMap::create_pending();
7c673cae 174
f67539c2
TL
175 if (mon.osdmon()->is_readable()) {
176 const auto &osdmap = mon.osdmon()->osdmap;
28e407b8 177 fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
3efd9988
FG
178 }
179
28e407b8 180 dout(10) << "create_pending e" << fsmap.epoch << dendl;
7c673cae
FG
181}
182
183void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
184{
28e407b8
AA
185 auto &pending = get_pending_fsmap_writeable();
186 auto &epoch = pending.epoch;
7c673cae 187
28e407b8 188 dout(10) << "encode_pending e" << epoch << dendl;
7c673cae
FG
189
190 // print map iff 'debug mon = 30' or higher
11fdf7f2
TL
191 print_map<30>(pending);
192 if (!g_conf()->mon_mds_skip_sanity) {
a4b75251 193 pending.sanity(true);
7c673cae
FG
194 }
195
196 // Set 'modified' on maps modified this epoch
28e407b8
AA
197 for (auto &p : pending.filesystems) {
198 if (p.second->mds_map.epoch == epoch) {
199 p.second->mds_map.modified = ceph_clock_now();
7c673cae
FG
200 }
201 }
202
203 // apply to paxos
11fdf7f2 204 ceph_assert(get_last_committed() + 1 == pending.epoch);
28e407b8 205 bufferlist pending_bl;
f67539c2 206 pending.encode(pending_bl, mon.get_quorum_con_features());
7c673cae
FG
207
208 /* put everything in the transaction */
28e407b8
AA
209 put_version(t, pending.epoch, pending_bl);
210 put_last_committed(t, pending.epoch);
7c673cae
FG
211
212 // Encode MDSHealth data
213 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
214 i != pending_daemon_health.end(); ++i) {
215 bufferlist bl;
216 i->second.encode(bl);
217 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
218 }
219
220 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
221 i != pending_daemon_health_rm.end(); ++i) {
222 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
223 }
224 pending_daemon_health_rm.clear();
1adf2230 225 remove_from_metadata(pending, t);
224ce89b
WB
226
227 // health
228 health_check_map_t new_checks;
28e407b8 229 const auto &info_map = pending.get_mds_info();
224ce89b
WB
230 for (const auto &i : info_map) {
231 const auto &gid = i.first;
232 const auto &info = i.second;
233 if (pending_daemon_health_rm.count(gid)) {
234 continue;
235 }
236 MDSHealth health;
237 auto p = pending_daemon_health.find(gid);
238 if (p != pending_daemon_health.end()) {
239 health = p->second;
240 } else {
241 bufferlist bl;
f67539c2 242 mon.store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
224ce89b
WB
243 if (!bl.length()) {
244 derr << "Missing health data for MDS " << gid << dendl;
245 continue;
246 }
11fdf7f2 247 auto bl_i = bl.cbegin();
224ce89b
WB
248 health.decode(bl_i);
249 }
250 for (const auto &metric : health.metrics) {
9f95a23c 251 const auto rank = info.rank;
224ce89b
WB
252 health_check_t *check = &new_checks.get_or_add(
253 mds_metric_name(metric.type),
254 metric.sev,
9f95a23c
TL
255 mds_metric_summary(metric.type),
256 1);
224ce89b 257 ostringstream ss;
f91f0fd5 258 ss << "mds." << info.name << "(mds." << rank << "): " << metric.message;
28e407b8
AA
259 bool first = true;
260 for (auto &p : metric.metadata) {
261 if (first) {
262 ss << " ";
263 } else {
224ce89b 264 ss << ", ";
28e407b8
AA
265 }
266 ss << p.first << ": " << p.second;
267 first = false;
224ce89b
WB
268 }
269 check->detail.push_back(ss.str());
270 }
271 }
28e407b8 272 pending.get_health_checks(&new_checks);
224ce89b 273 for (auto& p : new_checks.checks) {
11fdf7f2 274 p.second.summary = std::regex_replace(
224ce89b 275 p.second.summary,
11fdf7f2 276 std::regex("%num%"),
224ce89b 277 stringify(p.second.detail.size()));
11fdf7f2 278 p.second.summary = std::regex_replace(
224ce89b 279 p.second.summary,
11fdf7f2 280 std::regex("%plurals%"),
224ce89b 281 p.second.detail.size() > 1 ? "s" : "");
11fdf7f2 282 p.second.summary = std::regex_replace(
224ce89b 283 p.second.summary,
11fdf7f2 284 std::regex("%isorare%"),
224ce89b 285 p.second.detail.size() > 1 ? "are" : "is");
11fdf7f2 286 p.second.summary = std::regex_replace(
181888fb 287 p.second.summary,
11fdf7f2 288 std::regex("%hasorhave%"),
181888fb 289 p.second.detail.size() > 1 ? "have" : "has");
224ce89b
WB
290 }
291 encode_health(new_checks, t);
7c673cae
FG
292}
293
11fdf7f2 294version_t MDSMonitor::get_trim_to() const
7c673cae
FG
295{
296 version_t floor = 0;
11fdf7f2 297 if (g_conf()->mon_mds_force_trim_to > 0 &&
522d829b 298 g_conf()->mon_mds_force_trim_to <= (int)get_last_committed()) {
11fdf7f2 299 floor = g_conf()->mon_mds_force_trim_to;
7c673cae
FG
300 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
301 << floor << dendl;
302 }
303
11fdf7f2 304 unsigned max = g_conf()->mon_max_mdsmap_epochs;
7c673cae
FG
305 version_t last = get_last_committed();
306
522d829b
TL
307 if (last - get_first_committed() > max && floor < last - max) {
308 floor = last-max;
309 }
310
311 dout(20) << __func__ << " = " << floor << dendl;
7c673cae
FG
312 return floor;
313}
314
7c673cae
FG
315bool MDSMonitor::preprocess_query(MonOpRequestRef op)
316{
317 op->mark_mdsmon_event(__func__);
9f95a23c 318 auto m = op->get_req<PaxosServiceMessage>();
11fdf7f2
TL
319 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source()
320 << " " << m->get_orig_source_addrs() << dendl;
7c673cae
FG
321
322 switch (m->get_type()) {
323
324 case MSG_MDS_BEACON:
325 return preprocess_beacon(op);
326
327 case MSG_MON_COMMAND:
f64942e4
AA
328 try {
329 return preprocess_command(op);
11fdf7f2 330 } catch (const bad_cmd_get& e) {
f64942e4 331 bufferlist bl;
f67539c2 332 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
f64942e4
AA
333 return true;
334 }
7c673cae
FG
335
336 case MSG_MDS_OFFLOAD_TARGETS:
337 return preprocess_offload_targets(op);
338
339 default:
340 ceph_abort();
341 return true;
342 }
343}
344
345void MDSMonitor::_note_beacon(MMDSBeacon *m)
346{
347 mds_gid_t gid = mds_gid_t(m->get_global_id());
348 version_t seq = m->get_seq();
349
91327a77 350 dout(5) << "_note_beacon " << *m << " noting time" << dendl;
1adf2230
AA
351 auto &beacon = last_beacon[gid];
352 beacon.stamp = mono_clock::now();
353 beacon.seq = seq;
7c673cae
FG
354}
355
356bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
357{
358 op->mark_mdsmon_event(__func__);
9f95a23c 359 auto m = op->get_req<MMDSBeacon>();
7c673cae
FG
360 MDSMap::DaemonState state = m->get_state();
361 mds_gid_t gid = m->get_global_id();
362 version_t seq = m->get_seq();
363 MDSMap::mds_info_t info;
364 epoch_t effective_epoch = 0;
365
1adf2230 366 const auto &fsmap = get_fsmap();
28e407b8 367
7c673cae 368 // check privileges, ignore if fails
11fdf7f2
TL
369 MonSession *session = op->get_session();
370 if (!session)
371 goto ignore;
7c673cae
FG
372 if (!session->is_capable("mds", MON_CAP_X)) {
373 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
374 << session->caps << dendl;
375 goto ignore;
376 }
377
f67539c2
TL
378 if (m->get_fsid() != mon.monmap->fsid) {
379 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon.monmap->fsid << dendl;
7c673cae
FG
380 goto ignore;
381 }
382
91327a77 383 dout(5) << "preprocess_beacon " << *m
11fdf7f2
TL
384 << " from " << m->get_orig_source()
385 << " " << m->get_orig_source_addrs()
7c673cae
FG
386 << " " << m->get_compat()
387 << dendl;
388
389 // make sure the address has a port
390 if (m->get_orig_source_addr().get_port() == 0) {
391 dout(1) << " ignoring boot message without a port" << dendl;
392 goto ignore;
393 }
394
7c673cae 395 // fw to leader?
28e407b8 396 if (!is_leader())
7c673cae
FG
397 return false;
398
399 // booted, but not in map?
28e407b8 400 if (!fsmap.gid_exists(gid)) {
7c673cae
FG
401 if (state != MDSMap::STATE_BOOT) {
402 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
403 << ceph_mds_state_name(state) << ")" << dendl;
404
1adf2230
AA
405 /* We can't send an MDSMap this MDS was a part of because we no longer
406 * know which FS it was part of. Nor does this matter. Sending an empty
407 * MDSMap is sufficient for getting the MDS to respawn.
408 */
522d829b 409 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
f67539c2 410 mon.send_reply(op, m.detach());
7c673cae
FG
411 return true;
412 } else {
413 return false; // not booted yet.
414 }
415 }
416 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
28e407b8 417 info = fsmap.get_info_gid(gid);
7c673cae 418
f91f0fd5
TL
419 if (state == MDSMap::STATE_DNE) {
420 return false;
421 }
422
7c673cae
FG
423 // old seq?
424 if (info.state_seq > seq) {
425 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
426 goto ignore;
427 }
428
429 // Work out the latest epoch that this daemon should have seen
430 {
28e407b8 431 fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
7c673cae 432 if (fscid == FS_CLUSTER_ID_NONE) {
28e407b8 433 effective_epoch = fsmap.standby_epochs.at(gid);
7c673cae 434 } else {
28e407b8 435 effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
7c673cae
FG
436 }
437 if (effective_epoch != m->get_last_epoch_seen()) {
438 dout(10) << "mds_beacon " << *m
439 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
440 goto reply;
441 }
442 }
443
444 if (info.laggy()) {
445 _note_beacon(m);
446 return false; // no longer laggy, need to update map.
447 }
448 if (state == MDSMap::STATE_BOOT) {
449 // ignore, already booted.
450 goto ignore;
451 }
9f95a23c
TL
452
453 // did the join_fscid change
454 if (m->get_fs().size()) {
455 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
456 auto f = fsmap.get_filesystem(m->get_fs());
457 if (f) {
458 fscid = f->fscid;
459 }
460 if (info.join_fscid != fscid) {
461 dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
462 << " (" << m->get_fs() << ")" << dendl;
463 _note_beacon(m);
464 return false;
465 }
466 } else {
467 if (info.join_fscid != FS_CLUSTER_ID_NONE) {
468 dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
469 _note_beacon(m);
470 return false;
471 }
472 }
473
7c673cae
FG
474 // is there a state change here?
475 if (info.state != state) {
7c673cae
FG
476 _note_beacon(m);
477 return false;
478 }
479
480 // Comparing known daemon health with m->get_health()
481 // and return false (i.e. require proposal) if they
482 // do not match, to update our stored
483 if (!(pending_daemon_health[gid] == m->get_health())) {
91327a77 484 dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
7c673cae
FG
485 _note_beacon(m);
486 return false;
487 }
488
489 reply:
490 // note time and reply
11fdf7f2 491 ceph_assert(effective_epoch > 0);
7c673cae 492 _note_beacon(m);
11fdf7f2 493 {
f67539c2 494 auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
11fdf7f2
TL
495 m->get_global_id(), m->get_name(), effective_epoch,
496 state, seq, CEPH_FEATURES_SUPPORTED_DEFAULT);
f67539c2 497 mon.send_reply(op, beacon.detach());
11fdf7f2 498 }
7c673cae
FG
499 return true;
500
501 ignore:
502 // I won't reply this beacon, drop it.
f67539c2 503 mon.no_reply(op);
7c673cae
FG
504 return true;
505}
506
507bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
508{
509 op->mark_mdsmon_event(__func__);
9f95a23c 510 auto m = op->get_req<MMDSLoadTargets>();
7c673cae 511 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
28e407b8 512
1adf2230 513 const auto &fsmap = get_fsmap();
7c673cae
FG
514
515 // check privileges, ignore message if fails
11fdf7f2 516 MonSession *session = op->get_session();
7c673cae 517 if (!session)
1adf2230 518 goto ignore;
7c673cae
FG
519 if (!session->is_capable("mds", MON_CAP_X)) {
520 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
521 << session->caps << dendl;
1adf2230 522 goto ignore;
7c673cae
FG
523 }
524
525 if (fsmap.gid_exists(m->global_id) &&
526 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
1adf2230 527 goto ignore;
7c673cae
FG
528
529 return false;
530
1adf2230 531 ignore:
f67539c2 532 mon.no_reply(op);
7c673cae
FG
533 return true;
534}
535
536
537bool MDSMonitor::prepare_update(MonOpRequestRef op)
538{
539 op->mark_mdsmon_event(__func__);
9f95a23c 540 auto m = op->get_req<PaxosServiceMessage>();
7c673cae
FG
541 dout(7) << "prepare_update " << *m << dendl;
542
543 switch (m->get_type()) {
544
545 case MSG_MDS_BEACON:
546 return prepare_beacon(op);
547
548 case MSG_MON_COMMAND:
f64942e4
AA
549 try {
550 return prepare_command(op);
11fdf7f2 551 } catch (const bad_cmd_get& e) {
f64942e4 552 bufferlist bl;
f67539c2 553 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
f64942e4
AA
554 return true;
555 }
7c673cae
FG
556
557 case MSG_MDS_OFFLOAD_TARGETS:
558 return prepare_offload_targets(op);
559
560 default:
561 ceph_abort();
562 }
563
564 return true;
565}
566
567bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
568{
569 op->mark_mdsmon_event(__func__);
9f95a23c 570 auto m = op->get_req<MMDSBeacon>();
7c673cae 571 // -- this is an update --
11fdf7f2
TL
572 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source()
573 << " " << m->get_orig_source_addrs() << dendl;
574 entity_addrvec_t addrs = m->get_orig_source_addrs();
7c673cae
FG
575 mds_gid_t gid = m->get_global_id();
576 MDSMap::DaemonState state = m->get_state();
577 version_t seq = m->get_seq();
578
28e407b8
AA
579 auto &pending = get_pending_fsmap_writeable();
580
91327a77 581 dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
7c673cae
FG
582
583 // Calculate deltas of health metrics created and removed
584 // Do this by type rather than MDSHealthMetric equality, because messages can
585 // change a lot when they include e.g. a number of items.
586 const auto &old_health = pending_daemon_health[gid].metrics;
587 const auto &new_health = m->get_health().metrics;
588
589 std::set<mds_metric_t> old_types;
590 for (const auto &i : old_health) {
591 old_types.insert(i.type);
592 }
593
594 std::set<mds_metric_t> new_types;
595 for (const auto &i : new_health) {
596 new_types.insert(i.type);
597 }
598
599 for (const auto &new_metric: new_health) {
600 if (old_types.count(new_metric.type) == 0) {
11fdf7f2 601 dout(10) << "MDS health message (" << m->get_orig_source()
28e407b8 602 << "): " << new_metric.sev << " " << new_metric.message << dendl;
7c673cae
FG
603 }
604 }
605
606 // Log the disappearance of health messages at INFO
607 for (const auto &old_metric : old_health) {
608 if (new_types.count(old_metric.type) == 0) {
f67539c2 609 mon.clog->info() << "MDS health message cleared ("
11fdf7f2 610 << m->get_orig_source() << "): " << old_metric.message;
7c673cae
FG
611 }
612 }
613
614 // Store health
615 pending_daemon_health[gid] = m->get_health();
616
522d829b 617 const auto& cs = m->get_compat();
7c673cae
FG
618 if (state == MDSMap::STATE_BOOT) {
619 // zap previous instance of this name?
11fdf7f2 620 if (g_conf()->mds_enforce_unique_name) {
7c673cae 621 bool failed_mds = false;
28e407b8 622 while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
f67539c2
TL
623 if (!mon.osdmon()->is_writeable()) {
624 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
7c673cae
FG
625 return false;
626 }
522d829b 627 const auto& existing_info = pending.get_info_gid(existing);
f67539c2 628 mon.clog->info() << existing_info.human_name() << " restarted";
1adf2230 629 fail_mds_gid(pending, existing);
7c673cae
FG
630 failed_mds = true;
631 }
632 if (failed_mds) {
f67539c2
TL
633 ceph_assert(mon.osdmon()->is_writeable());
634 request_proposal(mon.osdmon());
7c673cae
FG
635 }
636 }
637
638 // Add this daemon to the map
28e407b8 639 if (pending.mds_roles.count(gid) == 0) {
7c673cae
FG
640 MDSMap::mds_info_t new_info;
641 new_info.global_id = gid;
642 new_info.name = m->get_name();
11fdf7f2 643 new_info.addrs = addrs;
7c673cae
FG
644 new_info.mds_features = m->get_mds_features();
645 new_info.state = MDSMap::STATE_STANDBY;
646 new_info.state_seq = seq;
522d829b 647 new_info.compat = cs;
9f95a23c
TL
648 if (m->get_fs().size()) {
649 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
650 auto f = pending.get_filesystem(m->get_fs());
651 if (f) {
652 fscid = f->fscid;
653 }
654 new_info.join_fscid = fscid;
655 }
522d829b 656 pending.insert(new_info);
7c673cae
FG
657 }
658
7c673cae 659 // initialize the beacon timer
1adf2230
AA
660 auto &beacon = last_beacon[gid];
661 beacon.stamp = mono_clock::now();
662 beacon.seq = seq;
7c673cae 663
7c673cae
FG
664 update_metadata(m->get_global_id(), m->get_sys_info());
665 } else {
666 // state update
91327a77
AA
667
668 if (!pending.gid_exists(gid)) {
669 /* gid has been removed from pending, send null map */
670 dout(5) << "mds_beacon " << *m << " is not in fsmap (state "
671 << ceph_mds_state_name(state) << ")" << dendl;
672
673 /* We can't send an MDSMap this MDS was a part of because we no longer
674 * know which FS it was part of. Nor does this matter. Sending an empty
675 * MDSMap is sufficient for getting the MDS to respawn.
676 */
a4b75251 677 goto null;
91327a77
AA
678 }
679
11fdf7f2 680 const auto& info = pending.get_info_gid(gid);
522d829b
TL
681
682 // did the reported compat change? That's illegal!
683 if (cs.compare(info.compat) != 0) {
684 if (!mon.osdmon()->is_writeable()) {
685 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
686 return false;
687 }
688 mon.clog->warn() << info.human_name() << " compat changed unexpectedly";
689 fail_mds_gid(pending, gid);
690 request_proposal(mon.osdmon());
691 return true;
692 }
693
a4b75251
TL
694 // legal state change?
695 if ((info.state == MDSMap::STATE_STANDBY && state > 0) ||
696 (info.state == MDSMap::STATE_STANDBY_REPLAY && state > 0 && state != MDSMap::STATE_DAMAGED)) {
697 /* N.B.: standby-replay can indicate the rank is damaged due to failure to replay */
698 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
699 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
700 goto evict;
701 } else if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
702 && info.rank != MDS_RANK_NONE)
703 {
704 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
705 "held rank " << info.rank << " while requesting state "
706 << ceph_mds_state_name(state) << dendl;
707 goto evict;
708 } else if (info.state == MDSMap::STATE_STOPPING &&
f64942e4
AA
709 state != MDSMap::STATE_STOPPING &&
710 state != MDSMap::STATE_STOPPED) {
7c673cae
FG
711 // we can't transition to any other states from STOPPING
712 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
713 << dendl;
a4b75251 714 goto evict;
7c673cae
FG
715 }
716
717 if (info.laggy()) {
11fdf7f2
TL
718 dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl;
719 pending.modify_daemon(info.global_id, [](auto& info)
7c673cae 720 {
11fdf7f2 721 info.clear_laggy();
7c673cae
FG
722 }
723 );
724 }
9f95a23c 725
91327a77 726 dout(5) << "prepare_beacon mds." << info.rank
7c673cae
FG
727 << " " << ceph_mds_state_name(info.state)
728 << " -> " << ceph_mds_state_name(state)
7c673cae 729 << dendl;
9f95a23c
TL
730
731 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
732 if (m->get_fs().size()) {
733 auto f = pending.get_filesystem(m->get_fs());
734 if (f) {
735 fscid = f->fscid;
736 }
737 }
738 pending.modify_daemon(gid, [fscid](auto& info) {
739 info.join_fscid = fscid;
740 });
741
7c673cae 742 if (state == MDSMap::STATE_STOPPED) {
28e407b8
AA
743 const auto fscid = pending.mds_roles.at(gid);
744 const auto &fs = pending.get_filesystem(fscid);
181888fb 745
f67539c2 746 mon.clog->info() << info.human_name() << " finished "
11fdf7f2 747 << "stopping rank " << info.rank << " in filesystem "
d2e6a577 748 << fs->mds_map.fs_name << " (now has "
181888fb 749 << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
d2e6a577 750
28e407b8 751 auto erased = pending.stop(gid);
7c673cae
FG
752 erased.push_back(gid);
753
9f95a23c 754 for (const auto& erased_gid : erased) {
7c673cae
FG
755 last_beacon.erase(erased_gid);
756 if (pending_daemon_health.count(erased_gid)) {
757 pending_daemon_health.erase(erased_gid);
758 pending_daemon_health_rm.insert(erased_gid);
759 }
760 }
761 } else if (state == MDSMap::STATE_DAMAGED) {
f67539c2 762 if (!mon.osdmon()->is_writeable()) {
91327a77 763 dout(1) << __func__ << ": DAMAGED from rank " << info.rank
f67539c2
TL
764 << " waiting for osdmon writeable to blocklist it" << dendl;
765 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
7c673cae
FG
766 return false;
767 }
768
a4b75251
TL
769 auto rank = info.rank;
770
7c673cae
FG
771 // Record this MDS rank as damaged, so that other daemons
772 // won't try to run it.
a4b75251
TL
773 dout(0) << __func__ << ": marking rank " << rank << " damaged" << dendl;
774
775 auto fs = pending.get_filesystem(gid);
776 auto rankgid = fs->mds_map.get_gid(rank);
777 auto rankinfo = pending.get_info_gid(rankgid);
778 auto followergid = fs->mds_map.get_standby_replay(rank);
779
780 ceph_assert(gid == rankgid || gid == followergid);
7c673cae
FG
781
782 utime_t until = ceph_clock_now();
f67539c2 783 until += g_conf().get_val<double>("mon_mds_blocklist_interval");
a4b75251
TL
784 const auto blocklist_epoch = mon.osdmon()->blocklist(rankinfo.addrs, until);
785 if (followergid != MDS_GID_NONE) {
786 fail_mds_gid(pending, followergid);
787 last_beacon.erase(followergid);
7c673cae 788 }
f67539c2 789 request_proposal(mon.osdmon());
a4b75251
TL
790 pending.damaged(rankgid, blocklist_epoch);
791 last_beacon.erase(rankgid);
7c673cae 792
a4b75251
TL
793 /* MDS expects beacon reply back */
794 } else if (state == MDSMap::STATE_DNE) {
795 dout(1) << __func__ << ": DNE from " << info << dendl;
796 goto evict;
7c673cae
FG
797 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
798 // Standby daemons should never modify their own
799 // state. Reject any attempts to do so.
800 derr << "standby " << gid << " attempted to change state to "
801 << ceph_mds_state_name(state) << ", rejecting" << dendl;
a4b75251 802 goto evict;
7c673cae
FG
803 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
804 !MDSMap::state_transition_valid(info.state, state)) {
805 // Validate state transitions for daemons that hold a rank
806 derr << "daemon " << gid << " (rank " << info.rank << ") "
807 << "reported invalid state transition "
808 << ceph_mds_state_name(info.state) << " -> "
809 << ceph_mds_state_name(state) << dendl;
a4b75251 810 goto evict;
7c673cae 811 } else {
b32b8144 812 if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
28e407b8
AA
813 const auto &fscid = pending.mds_roles.at(gid);
814 const auto &fs = pending.get_filesystem(fscid);
f67539c2 815 mon.clog->info() << info.human_name() << " is now active in "
d2e6a577
FG
816 << "filesystem " << fs->mds_map.fs_name << " as rank "
817 << info.rank;
818 }
b32b8144
FG
819
820 // Made it through special cases and validations, record the
821 // daemon's reported state to the FSMap.
11fdf7f2
TL
822 pending.modify_daemon(gid, [state, seq](auto& info) {
823 info.state = state;
824 info.state_seq = seq;
b32b8144 825 });
7c673cae
FG
826 }
827 }
828
91327a77 829 dout(5) << "prepare_beacon pending map now:" << dendl;
28e407b8 830 print_map(pending);
7c673cae 831
9f95a23c 832 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
7c673cae
FG
833 if (r >= 0)
834 _updated(op); // success
835 else if (r == -ECANCELED) {
f67539c2 836 mon.no_reply(op);
7c673cae
FG
837 } else {
838 dispatch(op); // try again
839 }
840 }));
841
a4b75251
TL
842 return true;
843
844evict:
845 if (!mon.osdmon()->is_writeable()) {
846 dout(1) << __func__ << ": waiting for writeable OSDMap to evict" << dendl;
847 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
848 return false;
849 }
850
851 fail_mds_gid(pending, gid);
852 request_proposal(mon.osdmon());
853 dout(5) << __func__ << ": pending map now:" << dendl;
854 print_map(pending);
855
856 goto null;
857
858null:
859 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
860 if (r >= 0) {
861 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
862 mon.send_reply(op, m.detach());
863 } else {
864 dispatch(op); // try again
865 }
866 }));
867
7c673cae
FG
868 return true;
869}
870
871bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
872{
28e407b8
AA
873 auto &pending = get_pending_fsmap_writeable();
874
7c673cae 875 op->mark_mdsmon_event(__func__);
9f95a23c 876 auto m = op->get_req<MMDSLoadTargets>();
7c673cae 877 mds_gid_t gid = m->global_id;
28e407b8 878 if (pending.gid_has_rank(gid)) {
7c673cae 879 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
28e407b8 880 pending.update_export_targets(gid, m->targets);
7c673cae
FG
881 } else {
882 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
883 }
f67539c2 884 mon.no_reply(op);
7c673cae
FG
885 return true;
886}
887
888bool MDSMonitor::should_propose(double& delay)
889{
890 // delegate to PaxosService to assess whether we should propose
891 return PaxosService::should_propose(delay);
892}
893
894void MDSMonitor::_updated(MonOpRequestRef op)
895{
28e407b8 896 const auto &fsmap = get_fsmap();
7c673cae 897 op->mark_mdsmon_event(__func__);
9f95a23c 898 auto m = op->get_req<MMDSBeacon>();
7c673cae 899 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
f67539c2 900 mon.clog->debug() << m->get_orig_source() << " "
11fdf7f2
TL
901 << m->get_orig_source_addrs() << " "
902 << ceph_mds_state_name(m->get_state());
7c673cae
FG
903
904 if (m->get_state() == MDSMap::STATE_STOPPED) {
905 // send the map manually (they're out of the map, so they won't get it automatic)
522d829b 906 auto m = make_message<MMDSMap>(mon.monmap->fsid, MDSMap::create_null_mdsmap());
f67539c2 907 mon.send_reply(op, m.detach());
7c673cae 908 } else {
f67539c2 909 auto beacon = make_message<MMDSBeacon>(mon.monmap->fsid,
11fdf7f2
TL
910 m->get_global_id(), m->get_name(), fsmap.get_epoch(),
911 m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT);
f67539c2 912 mon.send_reply(op, beacon.detach());
7c673cae
FG
913 }
914}
915
916void MDSMonitor::on_active()
917{
918 tick();
7c673cae 919
28e407b8 920 if (is_leader()) {
f67539c2 921 mon.clog->debug() << "fsmap " << get_fsmap();
224ce89b 922 }
7c673cae
FG
923}
924
7c673cae
FG
925void MDSMonitor::dump_info(Formatter *f)
926{
927 f->open_object_section("fsmap");
28e407b8 928 get_fsmap().dump(f);
7c673cae
FG
929 f->close_section();
930
931 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
932 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
933}
934
935bool MDSMonitor::preprocess_command(MonOpRequestRef op)
936{
937 op->mark_mdsmon_event(__func__);
9f95a23c 938 auto m = op->get_req<MMonCommand>();
7c673cae
FG
939 int r = -1;
940 bufferlist rdata;
941 stringstream ss, ds;
942
11fdf7f2 943 cmdmap_t cmdmap;
7c673cae
FG
944 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
945 // ss has reason for failure
946 string rs = ss.str();
f67539c2 947 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
7c673cae
FG
948 return true;
949 }
950
951 string prefix;
9f95a23c 952 cmd_getval(cmdmap, "prefix", prefix);
7c673cae 953 string format;
9f95a23c 954 cmd_getval(cmdmap, "format", format, string("plain"));
1adf2230 955 std::unique_ptr<Formatter> f(Formatter::create(format));
7c673cae 956
11fdf7f2 957 MonSession *session = op->get_session();
7c673cae 958 if (!session) {
f67539c2 959 mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
7c673cae
FG
960 return true;
961 }
962
f67539c2
TL
963 // to use const qualifier filter fsmap beforehand
964 FSMap _fsmap_copy = get_fsmap();
965 _fsmap_copy.filter(session->get_allowed_fs_names());
966 const auto& fsmap = _fsmap_copy;
967
7c673cae
FG
968 if (prefix == "mds stat") {
969 if (f) {
970 f->open_object_section("mds_stat");
971 dump_info(f.get());
972 f->close_section();
973 f->flush(ds);
974 } else {
975 ds << fsmap;
976 }
977 r = 0;
11fdf7f2
TL
978 } else if (prefix == "mds ok-to-stop") {
979 vector<string> ids;
9f95a23c 980 if (!cmd_getval(cmdmap, "ids", ids)) {
11fdf7f2
TL
981 r = -EINVAL;
982 ss << "must specify mds id";
983 goto out;
984 }
985 if (fsmap.is_any_degraded()) {
986 ss << "one or more filesystems is currently degraded";
987 r = -EBUSY;
988 goto out;
989 }
990 set<mds_gid_t> stopping;
991 for (auto& id : ids) {
992 ostringstream ess;
993 mds_gid_t gid = gid_from_arg(fsmap, id, ess);
994 if (gid == MDS_GID_NONE) {
995 // the mds doesn't exist, but no file systems are unhappy, so losing it
996 // can't have any effect.
997 continue;
998 }
999 stopping.insert(gid);
1000 }
1001 set<mds_gid_t> active;
1002 set<mds_gid_t> standby;
1003 for (auto gid : stopping) {
1004 if (fsmap.gid_has_rank(gid)) {
1005 // ignore standby-replay daemons (at this level)
1006 if (!fsmap.is_standby_replay(gid)) {
1007 auto standby = fsmap.get_standby_replay(gid);
1008 if (standby == MDS_GID_NONE ||
1009 stopping.count(standby)) {
1010 // no standby-replay, or we're also stopping the standby-replay
1011 // for this mds
1012 active.insert(gid);
1013 }
1014 }
7c673cae 1015 } else {
11fdf7f2
TL
1016 // net loss of a standby
1017 standby.insert(gid);
7c673cae
FG
1018 }
1019 }
11fdf7f2
TL
1020 if (fsmap.get_num_standby() - standby.size() < active.size()) {
1021 r = -EBUSY;
1022 ss << "insufficent standby MDS daemons to stop active gids "
1023 << stringify(active)
1024 << " and/or standby gids " << stringify(standby);;
1025 goto out;
28e407b8 1026 }
11fdf7f2
TL
1027 r = 0;
1028 ss << "should be safe to stop " << ids;
7c673cae
FG
1029 } else if (prefix == "fs dump") {
1030 int64_t epocharg;
1031 epoch_t epoch;
1032
1adf2230 1033 const FSMap *fsmapp = &fsmap;
28e407b8 1034 FSMap dummy;
9f95a23c 1035 if (cmd_getval(cmdmap, "epoch", epocharg)) {
7c673cae
FG
1036 epoch = epocharg;
1037 bufferlist b;
1038 int err = get_version(epoch, b);
1039 if (err == -ENOENT) {
7c673cae 1040 r = -ENOENT;
28e407b8 1041 goto out;
7c673cae 1042 } else {
11fdf7f2
TL
1043 ceph_assert(err == 0);
1044 ceph_assert(b.length());
28e407b8
AA
1045 dummy.decode(b);
1046 fsmapp = &dummy;
7c673cae
FG
1047 }
1048 }
c07f9fc5 1049
28e407b8
AA
1050 stringstream ds;
1051 if (f != NULL) {
1052 f->open_object_section("fsmap");
1053 fsmapp->dump(f.get());
1054 f->close_section();
1055 f->flush(ds);
1056 r = 0;
1057 } else {
1058 fsmapp->print(ds);
1059 r = 0;
7c673cae 1060 }
28e407b8
AA
1061
1062 rdata.append(ds);
1063 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
7c673cae
FG
1064 } else if (prefix == "mds metadata") {
1065 if (!f)
1066 f.reset(Formatter::create("json-pretty"));
1067
1068 string who;
9f95a23c 1069 bool all = !cmd_getval(cmdmap, "who", who);
7c673cae
FG
1070 dout(1) << "all = " << all << dendl;
1071 if (all) {
1072 r = 0;
1073 // Dump all MDSs' metadata
1074 const auto all_info = fsmap.get_mds_info();
1075
1076 f->open_array_section("mds_metadata");
1077 for(const auto &i : all_info) {
1078 const auto &info = i.second;
1079
1080 f->open_object_section("mds");
1081 f->dump_string("name", info.name);
1082 std::ostringstream get_err;
1adf2230 1083 r = dump_metadata(fsmap, info.name, f.get(), get_err);
7c673cae
FG
1084 if (r == -EINVAL || r == -ENOENT) {
1085 // Drop error, list what metadata we do have
1086 dout(1) << get_err.str() << dendl;
1087 r = 0;
1088 } else if (r != 0) {
1089 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1090 << dendl;
1091 ss << get_err.str();
c07f9fc5 1092 f->close_section();
7c673cae
FG
1093 break;
1094 }
1095 f->close_section();
1096 }
1097 f->close_section();
1098 } else {
1099 // Dump a single daemon's metadata
1100 f->open_object_section("mds_metadata");
1adf2230 1101 r = dump_metadata(fsmap, who, f.get(), ss);
7c673cae
FG
1102 f->close_section();
1103 }
1104 f->flush(ds);
31f18b77
FG
1105 } else if (prefix == "mds versions") {
1106 if (!f)
1107 f.reset(Formatter::create("json-pretty"));
1108 count_metadata("ceph_version", f.get());
1109 f->flush(ds);
1110 r = 0;
1111 } else if (prefix == "mds count-metadata") {
1112 if (!f)
1113 f.reset(Formatter::create("json-pretty"));
1114 string field;
9f95a23c 1115 cmd_getval(cmdmap, "property", field);
31f18b77
FG
1116 count_metadata(field, f.get());
1117 f->flush(ds);
1118 r = 0;
522d829b
TL
1119 } else if (prefix == "fs compat show") {
1120 string fs_name;
1121 cmd_getval(cmdmap, "fs_name", fs_name);
1122 const auto &fs = fsmap.get_filesystem(fs_name);
1123 if (fs == nullptr) {
1124 ss << "filesystem '" << fs_name << "' not found";
1125 r = -ENOENT;
1126 goto out;
1127 }
1128
1129 if (f) {
1130 f->open_object_section("mds_compat");
1131 fs->mds_map.compat.dump(f.get());
1132 f->close_section();
1133 f->flush(ds);
1134 } else {
1135 ds << fs->mds_map.compat;
1136 }
1137 r = 0;
7c673cae
FG
1138 } else if (prefix == "mds compat show") {
1139 if (f) {
1140 f->open_object_section("mds_compat");
522d829b 1141 fsmap.default_compat.dump(f.get());
7c673cae
FG
1142 f->close_section();
1143 f->flush(ds);
1144 } else {
522d829b 1145 ds << fsmap.default_compat;
7c673cae
FG
1146 }
1147 r = 0;
1148 } else if (prefix == "fs get") {
1149 string fs_name;
9f95a23c 1150 cmd_getval(cmdmap, "fs_name", fs_name);
28e407b8 1151 const auto &fs = fsmap.get_filesystem(fs_name);
7c673cae
FG
1152 if (fs == nullptr) {
1153 ss << "filesystem '" << fs_name << "' not found";
1154 r = -ENOENT;
1155 } else {
1156 if (f != nullptr) {
1157 f->open_object_section("filesystem");
1158 fs->dump(f.get());
1159 f->close_section();
1160 f->flush(ds);
1161 r = 0;
1162 } else {
1163 fs->print(ds);
1164 r = 0;
1165 }
1166 }
1167 } else if (prefix == "fs ls") {
1168 if (f) {
1169 f->open_array_section("filesystems");
1adf2230
AA
1170 for (const auto &p : fsmap.filesystems) {
1171 const auto &fs = p.second;
1172 f->open_object_section("filesystem");
1173 {
1174 const MDSMap &mds_map = fs->mds_map;
1175 f->dump_string("name", mds_map.fs_name);
1176 /* Output both the names and IDs of pools, for use by
1177 * humans and machines respectively */
f67539c2 1178 f->dump_string("metadata_pool", mon.osdmon()->osdmap.get_pool_name(
1adf2230
AA
1179 mds_map.metadata_pool));
1180 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1181 f->open_array_section("data_pool_ids");
1182 for (const auto &id : mds_map.data_pools) {
1183 f->dump_int("data_pool_id", id);
1184 }
1185 f->close_section();
7c673cae 1186
1adf2230
AA
1187 f->open_array_section("data_pools");
1188 for (const auto &id : mds_map.data_pools) {
f67539c2 1189 const auto &name = mon.osdmon()->osdmap.get_pool_name(id);
1adf2230 1190 f->dump_string("data_pool", name);
7c673cae
FG
1191 }
1192 f->close_section();
1193 }
1adf2230 1194 f->close_section();
7c673cae
FG
1195 }
1196 f->close_section();
1197 f->flush(ds);
1198 } else {
28e407b8
AA
1199 for (const auto &p : fsmap.filesystems) {
1200 const auto &fs = p.second;
7c673cae 1201 const MDSMap &mds_map = fs->mds_map;
f67539c2 1202 const string &md_pool_name = mon.osdmon()->osdmap.get_pool_name(
7c673cae
FG
1203 mds_map.metadata_pool);
1204
1205 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1206 << md_pool_name << ", data pools: [";
1adf2230 1207 for (const auto &id : mds_map.data_pools) {
f67539c2 1208 const string &pool_name = mon.osdmon()->osdmap.get_pool_name(id);
7c673cae
FG
1209 ds << pool_name << " ";
1210 }
1211 ds << "]" << std::endl;
1212 }
1213
1214 if (fsmap.filesystems.empty()) {
1215 ds << "No filesystems enabled" << std::endl;
1216 }
1217 }
1218 r = 0;
f67539c2
TL
1219 } else if (prefix == "fs feature ls") {
1220 if (f) {
1221 f->open_array_section("cephfs_features");
1222 for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
1223 f->open_object_section("feature");
1224 f->dump_int("index", i);
1225 f->dump_string("name", cephfs_feature_name(i));
1226 f->close_section();
1227 }
1228 f->close_section();
1229 f->flush(ds);
1230 } else {
1231 for (size_t i = 0; i <= CEPHFS_FEATURE_MAX; ++i) {
1232 ds << i << " " << cephfs_feature_name(i) << std::endl;
1233 }
1234 }
1235 r = 0;
7c673cae
FG
1236 }
1237
28e407b8 1238out:
7c673cae
FG
1239 if (r != -1) {
1240 rdata.append(ds);
1241 string rs;
1242 getline(ss, rs);
f67539c2 1243 mon.reply_command(op, r, rs, rdata, get_last_committed());
7c673cae
FG
1244 return true;
1245 } else
1246 return false;
1247}
1248
1adf2230 1249bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
7c673cae 1250{
9f95a23c 1251 const auto& info = fsmap.get_info_gid(gid);
91327a77 1252 dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
7c673cae 1253
f67539c2 1254 ceph_assert(mon.osdmon()->is_writeable());
a8e16298 1255
f67539c2 1256 epoch_t blocklist_epoch = 0;
7c673cae
FG
1257 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1258 utime_t until = ceph_clock_now();
f67539c2
TL
1259 until += g_conf().get_val<double>("mon_mds_blocklist_interval");
1260 blocklist_epoch = mon.osdmon()->blocklist(info.addrs, until);
7c673cae
FG
1261 }
1262
f67539c2 1263 fsmap.erase(gid, blocklist_epoch);
7c673cae
FG
1264 last_beacon.erase(gid);
1265 if (pending_daemon_health.count(gid)) {
1266 pending_daemon_health.erase(gid);
1267 pending_daemon_health_rm.insert(gid);
1268 }
1269
f67539c2 1270 return blocklist_epoch != 0;
7c673cae
FG
1271}
1272
1adf2230 1273mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
7c673cae
FG
1274{
1275 // Try parsing as a role
1276 mds_role_t role;
1277 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1adf2230 1278 int r = fsmap.parse_role(arg, &role, ignore_err);
7c673cae
FG
1279 if (r == 0) {
1280 // See if a GID is assigned to this role
28e407b8 1281 const auto &fs = fsmap.get_filesystem(role.fscid);
11fdf7f2 1282 ceph_assert(fs != nullptr); // parse_role ensures it exists
7c673cae
FG
1283 if (fs->mds_map.is_up(role.rank)) {
1284 dout(10) << __func__ << ": validated rank/GID " << role
1285 << " as a rank" << dendl;
1286 return fs->mds_map.get_mds_info(role.rank).global_id;
1287 }
1288 }
1289
1290 // Try parsing as a gid
1291 std::string err;
1292 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1293 if (!err.empty()) {
1294 // Not a role or a GID, try as a daemon name
28e407b8 1295 const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
7c673cae
FG
1296 if (!mds_info) {
1297 ss << "MDS named '" << arg
1298 << "' does not exist, or is not up";
1299 return MDS_GID_NONE;
1300 }
1301 dout(10) << __func__ << ": resolved MDS name '" << arg
1302 << "' to GID " << mds_info->global_id << dendl;
1303 return mds_info->global_id;
1304 } else {
1305 // Not a role, but parses as a an integer, might be a GID
1306 dout(10) << __func__ << ": treating MDS reference '" << arg
1307 << "' as an integer " << maybe_gid << dendl;
31f18b77 1308
28e407b8 1309 if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
31f18b77 1310 return mds_gid_t(maybe_gid);
7c673cae
FG
1311 }
1312 }
1313
1314 dout(1) << __func__ << ": rank/GID " << arg
1315 << " not a existent rank or GID" << dendl;
1316 return MDS_GID_NONE;
1317}
1318
1adf2230
AA
1319int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
1320 const std::string &arg, MDSMap::mds_info_t *failed_info)
7c673cae 1321{
11fdf7f2 1322 ceph_assert(failed_info != nullptr);
d2e6a577 1323
1adf2230 1324 mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
7c673cae
FG
1325 if (gid == MDS_GID_NONE) {
1326 return 0;
1327 }
f67539c2 1328 if (!mon.osdmon()->is_writeable()) {
7c673cae
FG
1329 return -EAGAIN;
1330 }
d2e6a577
FG
1331
1332 // Take a copy of the info before removing the MDS from the map,
1333 // so that the caller knows which mds (if any) they ended up removing.
1adf2230 1334 *failed_info = fsmap.get_info_gid(gid);
d2e6a577 1335
1adf2230 1336 fail_mds_gid(fsmap, gid);
7c673cae 1337 ss << "failed mds gid " << gid;
f67539c2
TL
1338 ceph_assert(mon.osdmon()->is_writeable());
1339 request_proposal(mon.osdmon());
7c673cae
FG
1340 return 0;
1341}
1342
1343bool MDSMonitor::prepare_command(MonOpRequestRef op)
1344{
1345 op->mark_mdsmon_event(__func__);
9f95a23c 1346 auto m = op->get_req<MMonCommand>();
7c673cae
FG
1347 int r = -EINVAL;
1348 stringstream ss;
1349 bufferlist rdata;
1350
11fdf7f2 1351 cmdmap_t cmdmap;
7c673cae
FG
1352 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1353 string rs = ss.str();
f67539c2 1354 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
7c673cae
FG
1355 return true;
1356 }
1357
1358 string prefix;
9f95a23c 1359 cmd_getval(cmdmap, "prefix", prefix);
7c673cae
FG
1360
1361 /* Refuse access if message not associated with a valid session */
11fdf7f2 1362 MonSession *session = op->get_session();
7c673cae 1363 if (!session) {
f67539c2 1364 mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
7c673cae
FG
1365 return true;
1366 }
1367
28e407b8
AA
1368 auto &pending = get_pending_fsmap_writeable();
1369
c07f9fc5 1370 bool batched_propose = false;
28e407b8 1371 for (const auto &h : handlers) {
f67539c2
TL
1372 r = h->can_handle(prefix, op, pending, cmdmap, ss);
1373 if (r == 1) {
1374 ; // pass, since we got the right handler.
1375 } else if (r == 0) {
1376 continue;
1377 } else {
1378 goto out;
1379 }
c07f9fc5 1380
f67539c2
TL
1381 batched_propose = h->batched_propose();
1382 if (batched_propose) {
1383 paxos.plug();
1384 }
1385 r = h->handle(&mon, pending, op, cmdmap, ss);
1386 if (batched_propose) {
1387 paxos.unplug();
1388 }
1389
1390 if (r == -EAGAIN) {
1391 // message has been enqueued for retry; return.
1392 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1393 return false;
1394 } else {
1395 if (r == 0) {
1396 // On successful updates, print the updated map
1397 print_map(pending);
7c673cae 1398 }
f67539c2
TL
1399 // Successful or not, we're done: respond.
1400 goto out;
7c673cae
FG
1401 }
1402 }
1403
1adf2230 1404 r = filesystem_command(pending, op, prefix, cmdmap, ss);
7c673cae
FG
1405 if (r >= 0) {
1406 goto out;
1407 } else if (r == -EAGAIN) {
1408 // Do not reply, the message has been enqueued for retry
1409 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1410 return false;
1411 } else if (r != -ENOSYS) {
1412 goto out;
1413 }
1414
7c673cae
FG
1415 if (r == -ENOSYS && ss.str().empty()) {
1416 ss << "unrecognized command";
1417 }
1418
1419out:
1420 dout(4) << __func__ << " done, r=" << r << dendl;
1421 /* Compose response */
1422 string rs;
1423 getline(ss, rs);
1424
1425 if (r >= 0) {
1426 // success.. delay reply
1427 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1428 get_last_committed() + 1));
c07f9fc5
FG
1429 if (batched_propose) {
1430 force_immediate_propose();
1431 }
7c673cae
FG
1432 return true;
1433 } else {
1434 // reply immediately
f67539c2 1435 mon.reply_command(op, r, rs, rdata, get_last_committed());
7c673cae
FG
1436 return false;
1437 }
1438}
1439
7c673cae 1440int MDSMonitor::filesystem_command(
1adf2230 1441 FSMap &fsmap,
7c673cae
FG
1442 MonOpRequestRef op,
1443 std::string const &prefix,
11fdf7f2 1444 const cmdmap_t& cmdmap,
7c673cae
FG
1445 std::stringstream &ss)
1446{
1447 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1448 op->mark_mdsmon_event(__func__);
1449 int r = 0;
1450 string whostr;
9f95a23c 1451 cmd_getval(cmdmap, "role", whostr);
7c673cae 1452
11fdf7f2 1453 if (prefix == "mds set_state") {
7c673cae 1454 mds_gid_t gid;
9f95a23c 1455 if (!cmd_getval(cmdmap, "gid", gid)) {
7c673cae 1456 ss << "error parsing 'gid' value '"
11fdf7f2 1457 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
7c673cae
FG
1458 return -EINVAL;
1459 }
1460 MDSMap::DaemonState state;
9f95a23c 1461 if (!cmd_getval(cmdmap, "state", state)) {
7c673cae 1462 ss << "error parsing 'state' string value '"
11fdf7f2 1463 << cmd_vartype_stringify(cmdmap.at("state")) << "'";
7c673cae
FG
1464 return -EINVAL;
1465 }
f67539c2 1466 if (fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
11fdf7f2
TL
1467 fsmap.modify_daemon(gid, [state](auto& info) {
1468 info.state = state;
7c673cae
FG
1469 });
1470 ss << "set mds gid " << gid << " to state " << state << " "
1471 << ceph_mds_state_name(state);
1472 return 0;
1473 }
1474 } else if (prefix == "mds fail") {
1475 string who;
9f95a23c 1476 cmd_getval(cmdmap, "role_or_gid", who);
d2e6a577
FG
1477
1478 MDSMap::mds_info_t failed_info;
f67539c2
TL
1479 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1480 if (gid == MDS_GID_NONE) {
1481 ss << "MDS named '" << who << "' does not exist, is not up or you "
1482 << "lack the permission to see.";
1483 return 0;
1484 }
1485 if(!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
1486 ss << "MDS named '" << who << "' does not exist, is not up or you "
1487 << "lack the permission to see.";
1488 return -EINVAL;
1489 }
1490 string_view fs_name = fsmap.fs_name_from_gid(gid);
1491 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1492 ss << "Permission denied.";
1493 return -EPERM;
1494 }
1495
1adf2230 1496 r = fail_mds(fsmap, ss, who, &failed_info);
7c673cae 1497 if (r < 0 && r == -EAGAIN) {
f67539c2 1498 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
7c673cae 1499 return -EAGAIN; // don't propose yet; wait for message to be retried
d2e6a577
FG
1500 } else if (r == 0) {
1501 // Only log if we really did something (not when was already gone)
1502 if (failed_info.global_id != MDS_GID_NONE) {
f67539c2 1503 mon.clog->info() << failed_info.human_name() << " marked failed by "
d2e6a577
FG
1504 << op->get_session()->entity_name;
1505 }
7c673cae
FG
1506 }
1507 } else if (prefix == "mds rm") {
1508 mds_gid_t gid;
9f95a23c 1509 if (!cmd_getval(cmdmap, "gid", gid)) {
7c673cae 1510 ss << "error parsing 'gid' value '"
11fdf7f2 1511 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
7c673cae
FG
1512 return -EINVAL;
1513 }
f67539c2 1514 if (!fsmap.gid_exists(gid, op->get_session()->get_allowed_fs_names())) {
11fdf7f2 1515 ss << "mds gid " << gid << " does not exist";
f67539c2
TL
1516 return 0;
1517 }
1518 string_view fs_name = fsmap.fs_name_from_gid(gid);
1519 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1520 ss << "Permission denied.";
1521 return -EPERM;
1522 }
1523 const auto &info = fsmap.get_info_gid(gid);
1524 MDSMap::DaemonState state = info.state;
1525 if (state > 0) {
1526 ss << "cannot remove active mds." << info.name
1527 << " rank " << info.rank;
1528 return -EBUSY;
7c673cae 1529 } else {
f67539c2
TL
1530 fsmap.erase(gid, {});
1531 ss << "removed mds gid " << gid;
1532 return 0;
7c673cae
FG
1533 }
1534 } else if (prefix == "mds rmfailed") {
11fdf7f2 1535 bool confirm = false;
9f95a23c 1536 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
11fdf7f2 1537 if (!confirm) {
7c673cae
FG
1538 ss << "WARNING: this can make your filesystem inaccessible! "
1539 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1540 return -EPERM;
1541 }
1542
1543 std::string role_str;
9f95a23c 1544 cmd_getval(cmdmap, "role", role_str);
7c673cae 1545 mds_role_t role;
f67539c2
TL
1546 const auto fs_names = op->get_session()->get_allowed_fs_names();
1547 int r = fsmap.parse_role(role_str, &role, ss, fs_names);
7c673cae
FG
1548 if (r < 0) {
1549 ss << "invalid role '" << role_str << "'";
1550 return -EINVAL;
1551 }
f67539c2
TL
1552 string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
1553 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1554 ss << "Permission denied.";
1555 return -EPERM;
1556 }
7c673cae 1557
1adf2230 1558 fsmap.modify_filesystem(
7c673cae
FG
1559 role.fscid,
1560 [role](std::shared_ptr<Filesystem> fs)
1561 {
1562 fs->mds_map.failed.erase(role.rank);
1563 });
1564
1565 ss << "removed failed mds." << role;
1566 return 0;
522d829b 1567 /* TODO: convert to fs commands to update defaults */
7c673cae
FG
1568 } else if (prefix == "mds compat rm_compat") {
1569 int64_t f;
9f95a23c 1570 if (!cmd_getval(cmdmap, "feature", f)) {
7c673cae 1571 ss << "error parsing feature value '"
11fdf7f2 1572 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
7c673cae
FG
1573 return -EINVAL;
1574 }
522d829b 1575 if (fsmap.default_compat.compat.contains(f)) {
7c673cae 1576 ss << "removing compat feature " << f;
522d829b 1577 fsmap.default_compat.compat.remove(f);
7c673cae 1578 } else {
522d829b 1579 ss << "compat feature " << f << " not present in " << fsmap.default_compat;
7c673cae
FG
1580 }
1581 r = 0;
1582 } else if (prefix == "mds compat rm_incompat") {
1583 int64_t f;
9f95a23c 1584 if (!cmd_getval(cmdmap, "feature", f)) {
7c673cae 1585 ss << "error parsing feature value '"
11fdf7f2 1586 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
7c673cae
FG
1587 return -EINVAL;
1588 }
522d829b 1589 if (fsmap.default_compat.incompat.contains(f)) {
7c673cae 1590 ss << "removing incompat feature " << f;
522d829b 1591 fsmap.default_compat.incompat.remove(f);
7c673cae 1592 } else {
522d829b 1593 ss << "incompat feature " << f << " not present in " << fsmap.default_compat;
7c673cae
FG
1594 }
1595 r = 0;
1596 } else if (prefix == "mds repaired") {
1597 std::string role_str;
9f95a23c 1598 cmd_getval(cmdmap, "role", role_str);
7c673cae 1599 mds_role_t role;
f67539c2
TL
1600 const auto fs_names = op->get_session()->get_allowed_fs_names();
1601 r = fsmap.parse_role(role_str, &role, ss, fs_names);
7c673cae
FG
1602 if (r < 0) {
1603 return r;
1604 }
f67539c2
TL
1605 string_view fs_name = fsmap.get_filesystem(role.fscid)->mds_map.get_fs_name();
1606 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1607 ss << "Permission denied.";
1608 return -EPERM;
1609 }
7c673cae 1610
1adf2230 1611 bool modified = fsmap.undamaged(role.fscid, role.rank);
7c673cae 1612 if (modified) {
494da23a 1613 ss << "repaired: restoring rank " << role;
7c673cae 1614 } else {
494da23a 1615 ss << "nothing to do: rank is not damaged";
7c673cae
FG
1616 }
1617
1618 r = 0;
11fdf7f2
TL
1619 } else if (prefix == "mds freeze") {
1620 std::string who;
9f95a23c 1621 cmd_getval(cmdmap, "role_or_gid", who);
11fdf7f2
TL
1622 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1623 if (gid == MDS_GID_NONE) {
7c673cae
FG
1624 return -EINVAL;
1625 }
1626
f67539c2
TL
1627 string_view fs_name = fsmap.fs_name_from_gid(gid);
1628 if (!op->get_session()->fs_name_capable(fs_name, MON_CAP_W)) {
1629 ss << "Permission denied.";
1630 return -EPERM;
1631 }
1632
11fdf7f2 1633 bool freeze = false;
7c673cae 1634 {
11fdf7f2 1635 std::string str;
9f95a23c 1636 cmd_getval(cmdmap, "val", str);
11fdf7f2
TL
1637 if ((r = parse_bool(str, &freeze, ss)) != 0) {
1638 return r;
1639 }
1640 }
7c673cae 1641
11fdf7f2
TL
1642 auto f = [freeze,gid,&ss](auto& info) {
1643 if (freeze) {
1644 ss << "freezing mds." << gid;
1645 info.freeze();
1646 } else {
1647 ss << "unfreezing mds." << gid;
1648 info.unfreeze();
1649 }
1650 };
1651 fsmap.modify_daemon(gid, f);
7c673cae
FG
1652 r = 0;
1653 } else {
1654 return -ENOSYS;
1655 }
1656
1657 return r;
1658}
1659
7c673cae
FG
1660void MDSMonitor::check_subs()
1661{
7c673cae
FG
1662 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1663 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1664 // filesystems. Build a list of all the types we service
1665 // subscriptions for.
9f95a23c
TL
1666
1667 std::vector<std::string> types = {
1668 "fsmap",
1669 "fsmap.user",
1670 "mdsmap",
1671 };
1672
28e407b8
AA
1673 for (const auto &p : get_fsmap().filesystems) {
1674 const auto &fscid = p.first;
9f95a23c
TL
1675 CachedStackStringStream cos;
1676 *cos << "mdsmap." << fscid;
1677 types.push_back(std::string(cos->strv()));
7c673cae
FG
1678 }
1679
1680 for (const auto &type : types) {
f67539c2 1681 auto& subs = mon.session_map.subs;
9f95a23c
TL
1682 auto subs_it = subs.find(type);
1683 if (subs_it == subs.end())
7c673cae 1684 continue;
9f95a23c
TL
1685 auto sub_it = subs_it->second->begin();
1686 while (!sub_it.end()) {
1687 auto sub = *sub_it;
1688 ++sub_it; // N.B. check_sub may remove sub!
7c673cae
FG
1689 check_sub(sub);
1690 }
1691 }
1692}
1693
1694
1695void MDSMonitor::check_sub(Subscription *sub)
1696{
1697 dout(20) << __func__ << ": " << sub->type << dendl;
1698
f67539c2
TL
1699 // to use const qualifier filter fsmap beforehand
1700 FSMap _fsmap_copy = get_fsmap();
1701 _fsmap_copy.filter(sub->session->get_allowed_fs_names());
1702 const auto& fsmap = _fsmap_copy;
1703 if (sub->next > fsmap.get_epoch()) {
1704 return;
1705 }
28e407b8 1706
7c673cae 1707 if (sub->type == "fsmap") {
f67539c2
TL
1708 sub->session->con->send_message(new MFSMap(mon.monmap->fsid, fsmap));
1709 if (sub->onetime) {
1710 mon.session_map.remove_sub(sub);
1711 } else {
1712 sub->next = fsmap.get_epoch() + 1;
7c673cae
FG
1713 }
1714 } else if (sub->type == "fsmap.user") {
f67539c2
TL
1715 FSMapUser fsmap_u;
1716 fsmap_u.epoch = fsmap.get_epoch();
1717 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1718 for (const auto &p : fsmap.filesystems) {
1719 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
1720 fs_info.cid = p.second->fscid;
1721 fs_info.name = p.second->mds_map.fs_name;
1722 }
1723 sub->session->con->send_message(new MFSMapUser(mon.monmap->fsid, fsmap_u));
1724 if (sub->onetime) {
1725 mon.session_map.remove_sub(sub);
1726 } else {
1727 sub->next = fsmap.get_epoch() + 1;
7c673cae
FG
1728 }
1729 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
11fdf7f2 1730 const bool is_mds = sub->session->name.is_mds();
7c673cae
FG
1731 mds_gid_t mds_gid = MDS_GID_NONE;
1732 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1733 if (is_mds) {
1734 // What (if any) namespace are you assigned to?
1735 auto mds_info = fsmap.get_mds_info();
1adf2230 1736 for (const auto &p : mds_info) {
11fdf7f2 1737 if (p.second.addrs == sub->session->addrs) {
1adf2230 1738 mds_gid = p.first;
7c673cae
FG
1739 fscid = fsmap.mds_roles.at(mds_gid);
1740 }
1741 }
1742 } else {
1743 // You're a client. Did you request a particular
1744 // namespace?
11fdf7f2 1745 if (sub->type.compare(0, 7, "mdsmap.") == 0) {
7c673cae
FG
1746 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1747 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1748 std::string err;
1749 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1750 if (!err.empty()) {
1751 // Client asked for a non-existent namespace, send them nothing
1752 dout(1) << "Invalid client subscription '" << sub->type
1753 << "'" << dendl;
1754 return;
1755 }
7c673cae
FG
1756 } else {
1757 // Unqualified request for "mdsmap": give it the one marked
1758 // for use by legacy clients.
1759 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1760 fscid = fsmap.legacy_client_fscid;
1761 } else {
1762 dout(1) << "Client subscribed for legacy filesystem but "
1763 "none is configured" << dendl;
1764 return;
1765 }
1766 }
b3b6e05e
TL
1767 if (!fsmap.filesystem_exists(fscid)) {
1768 // Client asked for a non-existent namespace, send them nothing
1769 // TODO: something more graceful for when a client has a filesystem
1770 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1771 // flag to MMDSMap?
1772 dout(1) << "Client subscribed to non-existent namespace '" <<
1773 fscid << "'" << dendl;
1774 return;
1775 }
7c673cae
FG
1776 }
1777 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
1778
1779 // Work out the effective latest epoch
28e407b8 1780 const MDSMap *mds_map = nullptr;
522d829b 1781 MDSMap null_map = MDSMap::create_null_mdsmap();
7c673cae
FG
1782 if (fscid == FS_CLUSTER_ID_NONE) {
1783 // For a client, we should have already dropped out
11fdf7f2 1784 ceph_assert(is_mds);
7c673cae 1785
28e407b8
AA
1786 auto it = fsmap.standby_daemons.find(mds_gid);
1787 if (it != fsmap.standby_daemons.end()) {
7c673cae 1788 // For an MDS, we need to feed it an MDSMap with its own state in
28e407b8
AA
1789 null_map.mds_info[mds_gid] = it->second;
1790 null_map.epoch = fsmap.standby_epochs.at(mds_gid);
7c673cae
FG
1791 } else {
1792 null_map.epoch = fsmap.epoch;
1793 }
1794 mds_map = &null_map;
1795 } else {
1796 // Check the effective epoch
28e407b8 1797 mds_map = &fsmap.get_filesystem(fscid)->mds_map;
7c673cae
FG
1798 }
1799
11fdf7f2 1800 ceph_assert(mds_map != nullptr);
7c673cae
FG
1801 dout(10) << __func__ << " selected MDS map epoch " <<
1802 mds_map->epoch << " for namespace " << fscid << " for subscriber "
11fdf7f2 1803 << sub->session->name << " who wants epoch " << sub->next << dendl;
7c673cae
FG
1804
1805 if (sub->next > mds_map->epoch) {
1806 return;
1807 }
f67539c2
TL
1808 auto msg = make_message<MMDSMap>(mon.monmap->fsid, *mds_map,
1809 mds_map->fs_name);
7c673cae 1810
11fdf7f2 1811 sub->session->con->send_message(msg.detach());
7c673cae 1812 if (sub->onetime) {
f67539c2 1813 mon.session_map.remove_sub(sub);
7c673cae
FG
1814 } else {
1815 sub->next = mds_map->get_epoch() + 1;
1816 }
1817 }
1818}
1819
1820
1821void MDSMonitor::update_metadata(mds_gid_t gid,
1822 const map<string, string>& metadata)
1823{
1824 if (metadata.empty()) {
1825 return;
1826 }
1827 pending_metadata[gid] = metadata;
1828
f67539c2 1829 MonitorDBStore::TransactionRef t = paxos.get_pending_transaction();
7c673cae 1830 bufferlist bl;
11fdf7f2 1831 encode(pending_metadata, bl);
7c673cae 1832 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
f67539c2 1833 paxos.trigger_propose();
7c673cae
FG
1834}
1835
1adf2230 1836void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
7c673cae
FG
1837{
1838 bool update = false;
1adf2230
AA
1839 for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) {
1840 if (!fsmap.gid_exists(it->first)) {
1841 it = pending_metadata.erase(it);
7c673cae
FG
1842 update = true;
1843 } else {
1adf2230 1844 ++it;
7c673cae
FG
1845 }
1846 }
1847 if (!update)
1848 return;
1849 bufferlist bl;
11fdf7f2 1850 encode(pending_metadata, bl);
7c673cae
FG
1851 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1852}
1853
1854int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1855{
1856 bufferlist bl;
f67539c2 1857 int r = mon.store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
7c673cae 1858 if (r) {
11fdf7f2 1859 dout(5) << "Unable to load 'last_metadata'" << dendl;
7c673cae
FG
1860 return r;
1861 }
1862
11fdf7f2
TL
1863 auto it = bl.cbegin();
1864 ceph::decode(m, it);
7c673cae
FG
1865 return 0;
1866}
1867
1adf2230 1868void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out)
31f18b77 1869{
31f18b77
FG
1870 map<mds_gid_t,Metadata> meta;
1871 load_metadata(meta);
1872 for (auto& p : meta) {
1873 auto q = p.second.find(field);
1874 if (q == p.second.end()) {
c07f9fc5 1875 (*out)["unknown"]++;
31f18b77 1876 } else {
c07f9fc5 1877 (*out)[q->second]++;
31f18b77
FG
1878 }
1879 }
c07f9fc5
FG
1880}
1881
1adf2230 1882void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
c07f9fc5
FG
1883{
1884 map<string,int> by_val;
1885 count_metadata(field, &by_val);
31f18b77
FG
1886 f->open_object_section(field.c_str());
1887 for (auto& p : by_val) {
1888 f->dump_int(p.first.c_str(), p.second);
1889 }
1890 f->close_section();
1891}
1892
f67539c2
TL
1893void MDSMonitor::get_versions(std::map<string, list<string> > &versions)
1894{
1895 map<mds_gid_t,Metadata> meta;
1896 load_metadata(meta);
1897 const auto &fsmap = get_fsmap();
1898 std::map<mds_gid_t, mds_info_t> map = fsmap.get_mds_info();
1899 dout(10) << __func__ << " mds meta=" << meta << dendl;
1900 for (auto& p : meta) {
1901 auto q = p.second.find("ceph_version_short");
1902 if (q == p.second.end()) continue;
1903 versions[q->second].push_back(string("mds.") + map[p.first].name);
1904 }
1905}
1906
1adf2230
AA
1907int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
1908 Formatter *f, ostream& err)
7c673cae 1909{
11fdf7f2 1910 ceph_assert(f);
7c673cae 1911
1adf2230 1912 mds_gid_t gid = gid_from_arg(fsmap, who, err);
7c673cae
FG
1913 if (gid == MDS_GID_NONE) {
1914 return -EINVAL;
1915 }
1916
1917 map<mds_gid_t, Metadata> metadata;
1918 if (int r = load_metadata(metadata)) {
1919 err << "Unable to load 'last_metadata'";
1920 return r;
1921 }
1922
1923 if (!metadata.count(gid)) {
1924 return -ENOENT;
1925 }
1926 const Metadata& m = metadata[gid];
1927 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1928 f->dump_string(p->first.c_str(), p->second);
1929 }
1930 return 0;
1931}
1932
1933int MDSMonitor::print_nodes(Formatter *f)
1934{
11fdf7f2 1935 ceph_assert(f);
7c673cae 1936
1adf2230
AA
1937 const auto &fsmap = get_fsmap();
1938
7c673cae
FG
1939 map<mds_gid_t, Metadata> metadata;
1940 if (int r = load_metadata(metadata)) {
1941 return r;
1942 }
1943
11fdf7f2 1944 map<string, list<string> > mdses; // hostname => mds
1adf2230
AA
1945 for (const auto &p : metadata) {
1946 const mds_gid_t& gid = p.first;
1947 const Metadata& m = p.second;
7c673cae
FG
1948 Metadata::const_iterator hostname = m.find("hostname");
1949 if (hostname == m.end()) {
1950 // not likely though
1951 continue;
1952 }
1adf2230 1953 if (!fsmap.gid_exists(gid)) {
7c673cae
FG
1954 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1955 continue;
1956 }
1adf2230 1957 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
11fdf7f2 1958 mdses[hostname->second].push_back(mds_info.name);
7c673cae
FG
1959 }
1960
1961 dump_services(f, mdses, "mds");
1962 return 0;
1963}
1964
1965/**
1966 * If a cluster is undersized (with respect to max_mds), then
11fdf7f2
TL
1967 * attempt to find daemons to grow it. If the cluster is oversized
1968 * (with respect to max_mds) then shrink it by stopping its highest rank.
7c673cae 1969 */
11fdf7f2 1970bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
7c673cae 1971{
11fdf7f2
TL
1972 auto &current_mds_map = get_fsmap().get_filesystem(fscid)->mds_map;
1973 auto&& fs = fsmap.get_filesystem(fscid);
1adf2230 1974 auto &mds_map = fs->mds_map;
7c673cae 1975
1adf2230
AA
1976 int in = mds_map.get_num_in_mds();
1977 int max = mds_map.get_max_mds();
1978
1979 dout(20) << __func__ << " in " << in << " max " << max << dendl;
1980
11fdf7f2
TL
1981 /* Check that both the current epoch mds_map is resizeable as well as the
1982 * current batch of changes in pending. This is important if an MDS is
1983 * becoming active in the next epoch.
1984 */
1985 if (!current_mds_map.is_resizeable() ||
1986 !mds_map.is_resizeable()) {
1987 dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
1988 return false;
1989 }
1990
1991 if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
7c673cae 1992 mds_rank_t mds = mds_rank_t(0);
1adf2230 1993 while (mds_map.is_in(mds)) {
7c673cae
FG
1994 mds++;
1995 }
9f95a23c
TL
1996 auto info = fsmap.find_replacement_for({fscid, mds});
1997 if (!info) {
1adf2230 1998 return false;
7c673cae
FG
1999 }
2000
9f95a23c 2001 dout(1) << "assigned standby " << info->addrs
7c673cae 2002 << " as mds." << mds << dendl;
f67539c2 2003 mon.clog->info() << info->human_name() << " assigned to "
1adf2230
AA
2004 "filesystem " << mds_map.fs_name << " as rank "
2005 << mds << " (now has " << mds_map.get_num_in_mds() + 1
d2e6a577 2006 << " ranks)";
9f95a23c 2007 fsmap.promote(info->global_id, *fs, mds);
1adf2230 2008 return true;
11fdf7f2
TL
2009 } else if (in > max) {
2010 mds_rank_t target = in - 1;
2011 const auto &info = mds_map.get_info(target);
2012 if (mds_map.is_active(target)) {
2013 dout(1) << "stopping " << target << dendl;
f67539c2 2014 mon.clog->info() << "stopping " << info.human_name();
11fdf7f2
TL
2015 auto f = [](auto& info) {
2016 info.state = MDSMap::STATE_STOPPING;
2017 };
2018 fsmap.modify_daemon(info.global_id, f);
2019 return true;
2020 } else {
2021 dout(20) << "skipping stop of " << target << dendl;
2022 return false;
2023 }
7c673cae
FG
2024 }
2025
1adf2230 2026 return false;
7c673cae
FG
2027}
2028
2029
2030/**
9f95a23c 2031 * Fail a daemon and replace it with a suitable standby.
7c673cae 2032 */
9f95a23c 2033bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose)
7c673cae 2034{
11fdf7f2 2035 ceph_assert(osd_propose != nullptr);
7c673cae 2036
1adf2230 2037 const auto fscid = fsmap.mds_roles.at(gid);
9f95a23c
TL
2038 const auto& info = fsmap.get_info_gid(gid);
2039 const auto rank = info.rank;
2040 const auto state = info.state;
2041
2042 if (info.is_frozen()) {
2043 return false;
2044 } else if (state == MDSMap::STATE_STANDBY_REPLAY ||
2045 state == MDSMap::STATE_STANDBY) {
2046 dout(1) << " failing and removing standby " << gid << " " << info.addrs
2047 << " mds." << rank
2048 << "." << info.inc << " " << ceph_mds_state_name(state)
2049 << dendl;
2050 *osd_propose |= fail_mds_gid(fsmap, gid);
2051 return true;
2052 } else if (rank >= 0 && rep_info) {
2053 auto fs = fsmap.filesystems.at(fscid);
2054 if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2055 return false;
2056 }
2057 // are we in?
2058 // and is there a non-laggy standby that can take over for us?
2059 dout(1) << " replacing " << gid << " " << info.addrs
2060 << " mds." << rank << "." << info.inc
2061 << " " << ceph_mds_state_name(state)
2062 << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs
2063 << dendl;
2064
f67539c2 2065 mon.clog->warn() << "Replacing " << info.human_name()
9f95a23c
TL
2066 << " as rank " << rank
2067 << " with standby " << rep_info->human_name();
2068
2069 // Remove the old one
2070 *osd_propose |= fail_mds_gid(fsmap, gid);
2071
2072 // Promote the replacement
2073 fsmap.promote(rep_info->global_id, *fs, rank);
2074
2075 return true;
2076 }
2077 return false;
2078}
2079
2080bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap)
2081{
2082 bool do_propose = false;
2083 const auto now = mono_clock::now();
f67539c2 2084 const bool osdmap_writeable = mon.osdmon()->is_writeable();
9f95a23c
TL
2085 const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace");
2086 const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
2087
2088 if (mono_clock::is_zero(last_tick)) {
2089 last_tick = now;
2090 }
2091
2092 {
2093 auto since_last = std::chrono::duration<double>(now-last_tick);
2094
2095 if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) {
2096 // This case handles either local slowness (calls being delayed
2097 // for whatever reason) or cluster election slowness (a long gap
2098 // between calls while an election happened)
2099 dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
2100 "(slow election?) of " << since_last.count() << " seconds" << dendl;
2101 for (auto& p : last_beacon) {
2102 p.second.stamp = now;
2103 }
2104 }
2105 }
2106
2107 // make sure last_beacon is fully populated
2108 for (auto& p : fsmap.mds_roles) {
2109 auto& gid = p.first;
2110 last_beacon.emplace(std::piecewise_construct,
2111 std::forward_as_tuple(gid),
2112 std::forward_as_tuple(now, 0));
2113 }
7c673cae 2114
31f18b77 2115 // We will only take decisive action (replacing/removing a daemon)
9f95a23c 2116 // if we have some indication that some other daemon(s) are successfully
31f18b77 2117 // getting beacons through recently.
1adf2230 2118 mono_time latest_beacon = mono_clock::zero();
9f95a23c 2119 for (const auto& p : last_beacon) {
1adf2230 2120 latest_beacon = std::max(p.second.stamp, latest_beacon);
31f18b77 2121 }
f67539c2 2122 auto since = std::chrono::duration<double>(now-latest_beacon);
1adf2230 2123 const bool may_replace = since.count() <
11fdf7f2 2124 std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5);
31f18b77 2125
9f95a23c
TL
2126 // check beacon timestamps
2127 std::vector<mds_gid_t> to_remove;
2128 for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
2129 auto& [gid, beacon_info] = *it;
f67539c2 2130 auto since_last = std::chrono::duration<double>(now-beacon_info.stamp);
9f95a23c
TL
2131
2132 if (!fsmap.gid_exists(gid)) {
2133 // gid no longer exists, remove from tracked beacons
2134 it = last_beacon.erase(it);
2135 continue;
2136 }
7c673cae 2137
9f95a23c
TL
2138 if (since_last.count() >= g_conf()->mds_beacon_grace) {
2139 auto& info = fsmap.get_info_gid(gid);
2140 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2141 << " (gid: " << gid << " addr: " << info.addrs
2142 << " state: " << ceph_mds_state_name(info.state) << ")"
2143 << " since " << since_last.count() << dendl;
f67539c2 2144 // If the OSDMap is writeable, we can blocklist things, so we can
9f95a23c
TL
2145 // try failing any laggy MDS daemons. Consider each one for failure.
2146 if (!info.laggy()) {
2147 dout(1) << " marking " << gid << " " << info.addrs
2148 << " mds." << info.rank << "." << info.inc
2149 << " " << ceph_mds_state_name(info.state)
2150 << " laggy" << dendl;
2151 fsmap.modify_daemon(info.global_id, [](auto& info) {
2152 info.laggy_since = ceph_clock_now();
2153 });
2154 do_propose = true;
2155 }
2156 if (osdmap_writeable && may_replace) {
2157 to_remove.push_back(gid); // drop_mds may invalidate iterator
2158 }
2159 }
31f18b77 2160
9f95a23c
TL
2161 ++it;
2162 }
7c673cae 2163
9f95a23c 2164 for (const auto& gid : to_remove) {
f6b5b4d7 2165 auto info = fsmap.get_info_gid(gid);
9f95a23c
TL
2166 const mds_info_t* rep_info = nullptr;
2167 if (info.rank >= 0) {
f67539c2 2168 auto fscid = fsmap.fscid_from_gid(gid);
9f95a23c
TL
2169 rep_info = fsmap.find_replacement_for({fscid, info.rank});
2170 }
2171 bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap);
2172 if (dropped) {
f67539c2 2173 mon.clog->info() << "MDS " << info.human_name()
9f95a23c
TL
2174 << " is removed because it is dead or otherwise unavailable.";
2175 do_propose = true;
2176 }
2177 }
7c673cae 2178
9f95a23c
TL
2179 if (osdmap_writeable) {
2180 for (auto& [fscid, fs] : fsmap.filesystems) {
2181 if (!fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
2182 fs->mds_map.is_resizeable()) {
2183 // Check if a rank or standby-replay should be replaced with a stronger
2184 // affinity standby. This looks at ranks and standby-replay:
2185 for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
2186 const auto join_fscid = info.join_fscid;
2187 if (join_fscid == fscid)
2188 continue;
2189 const auto rank = info.rank;
2190 const auto state = info.state;
2191 const mds_info_t* rep_info = nullptr;
2192 if (state == MDSMap::STATE_STANDBY_REPLAY) {
522d829b 2193 rep_info = fsmap.get_available_standby(*fs);
9f95a23c
TL
2194 } else if (state == MDSMap::STATE_ACTIVE) {
2195 rep_info = fsmap.find_replacement_for({fscid, rank});
2196 } else {
2197 /* N.B. !is_degraded() */
2198 ceph_abort_msg("invalid state in MDSMap");
2199 }
2200 if (!rep_info) {
2201 break;
2202 }
2203 bool better_affinity = false;
2204 if (join_fscid == FS_CLUSTER_ID_NONE) {
2205 better_affinity = (rep_info->join_fscid == fscid);
2206 } else {
2207 better_affinity = (rep_info->join_fscid == fscid) ||
2208 (rep_info->join_fscid == FS_CLUSTER_ID_NONE);
2209 }
2210 if (better_affinity) {
2211 if (state == MDSMap::STATE_STANDBY_REPLAY) {
f67539c2 2212 mon.clog->info() << "Dropping low affinity standby-replay "
9f95a23c
TL
2213 << info.human_name()
2214 << " in favor of higher affinity standby.";
2215 *propose_osdmap |= fail_mds_gid(fsmap, gid);
2216 /* Now let maybe_promote_standby do the promotion. */
2217 } else {
f67539c2 2218 mon.clog->info() << "Dropping low affinity active "
9f95a23c
TL
2219 << info.human_name()
2220 << " in favor of higher affinity standby.";
2221 do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap);
2222 }
2223 break; /* don't replace more than one per tick per fs */
2224 }
2225 }
2226 }
2227 }
7c673cae 2228 }
9f95a23c 2229 return do_propose;
7c673cae
FG
2230}
2231
11fdf7f2 2232bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
7c673cae 2233{
11fdf7f2
TL
2234 if (fs.mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2235 return false;
2236 }
7c673cae
FG
2237
2238 bool do_propose = false;
2239
2240 // have a standby take over?
2241 set<mds_rank_t> failed;
11fdf7f2
TL
2242 fs.mds_map.get_failed_mds_set(failed);
2243 for (const auto& rank : failed) {
9f95a23c
TL
2244 auto info = fsmap.find_replacement_for({fs.fscid, rank});
2245 if (info) {
2246 dout(1) << " taking over failed mds." << rank << " with " << info->global_id
2247 << "/" << info->name << " " << info->addrs << dendl;
f67539c2 2248 mon.clog->info() << "Standby " << info->human_name()
11fdf7f2
TL
2249 << " assigned to filesystem " << fs.mds_map.fs_name
2250 << " as rank " << rank;
2251
9f95a23c 2252 fsmap.promote(info->global_id, fs, rank);
11fdf7f2 2253 do_propose = true;
7c673cae 2254 }
11fdf7f2
TL
2255 }
2256
f67539c2 2257 if (fs.mds_map.is_resizeable() && fs.mds_map.allows_standby_replay()) {
7c673cae 2258 // There were no failures to replace, so try using any available standbys
a8e16298
TL
2259 // as standby-replay daemons. Don't do this when the cluster is degraded
2260 // as a standby-replay daemon may try to read a journal being migrated.
11fdf7f2 2261 for (;;) {
522d829b 2262 auto info = fsmap.get_available_standby(fs);
9f95a23c
TL
2263 if (!info) break;
2264 dout(20) << "standby available mds." << info->global_id << dendl;
11fdf7f2
TL
2265 bool changed = false;
2266 for (const auto& rank : fs.mds_map.in) {
9f95a23c 2267 dout(20) << "examining " << rank << dendl;
11fdf7f2 2268 if (fs.mds_map.is_followable(rank)) {
9f95a23c 2269 dout(1) << " setting mds." << info->global_id
11fdf7f2 2270 << " to follow mds rank " << rank << dendl;
9f95a23c 2271 fsmap.assign_standby_replay(info->global_id, fs.fscid, rank);
11fdf7f2
TL
2272 do_propose = true;
2273 changed = true;
2274 break;
7c673cae 2275 }
7c673cae 2276 }
11fdf7f2 2277 if (!changed) break;
7c673cae
FG
2278 }
2279 }
2280
2281 return do_propose;
2282}
2283
2284void MDSMonitor::tick()
2285{
1adf2230 2286 if (!is_active() || !is_leader()) return;
28e407b8
AA
2287
2288 auto &pending = get_pending_fsmap_writeable();
7c673cae 2289
28e407b8 2290 bool do_propose = false;
9f95a23c 2291 bool propose_osdmap = false;
7c673cae 2292
522d829b
TL
2293 if (check_fsmap_struct_version) {
2294 /* Allow time for trimming otherwise PaxosService::is_writeable will always
2295 * be false.
2296 */
2297
2298 auto now = clock::now();
2299 auto elapsed = now - last_fsmap_struct_flush;
2300 if (elapsed > std::chrono::seconds(30)) {
2301 FSMap fsmap;
2302 bufferlist bl;
2303 auto v = get_first_committed();
2304 int err = get_version(v, bl);
2305 if (err) {
2306 derr << "could not get version " << v << dendl;
2307 ceph_abort();
2308 }
a4b75251
TL
2309 try {
2310 fsmap.decode(bl);
2311 } catch (const ceph::buffer::malformed_input& e) {
2312 dout(5) << "flushing old fsmap struct because unable to decode FSMap: " << e.what() << dendl;
2313 }
2314 /* N.B. FSMap::is_struct_old is also true for undecoded (failed to decode) FSMap */
522d829b
TL
2315 if (fsmap.is_struct_old()) {
2316 dout(5) << "fsmap struct is too old; proposing to flush out old versions" << dendl;
2317 do_propose = true;
2318 last_fsmap_struct_flush = now;
2319 } else {
2320 dout(20) << "struct is recent" << dendl;
2321 check_fsmap_struct_version = false;
2322 }
2323 }
2324 }
2325
28e407b8 2326 do_propose |= pending.check_health();
7c673cae 2327
9f95a23c
TL
2328 /* Check health and affinity of ranks */
2329 do_propose |= check_health(pending, &propose_osdmap);
7c673cae 2330
9f95a23c
TL
2331 /* Resize the cluster according to max_mds. */
2332 for (auto& p : pending.filesystems) {
2333 do_propose |= maybe_resize_cluster(pending, p.second->fscid);
7c673cae
FG
2334 }
2335
9f95a23c
TL
2336 /* Replace any failed ranks. */
2337 for (auto& p : pending.filesystems) {
2338 do_propose |= maybe_promote_standby(pending, *p.second);
7c673cae
FG
2339 }
2340
c07f9fc5 2341 if (propose_osdmap) {
f67539c2 2342 request_proposal(mon.osdmon());
c07f9fc5 2343 }
7c673cae 2344
7c673cae
FG
2345 if (do_propose) {
2346 propose_pending();
2347 }
9f95a23c
TL
2348
2349 last_tick = mono_clock::now();
7c673cae
FG
2350}
2351
f67539c2 2352MDSMonitor::MDSMonitor(Monitor &mn, Paxos &p, string service_name)
7c673cae
FG
2353 : PaxosService(mn, p, service_name)
2354{
f67539c2 2355 handlers = FileSystemCommandHandler::load(&p);
7c673cae
FG
2356}
2357
2358void MDSMonitor::on_restart()
2359{
2360 // Clear out the leader-specific state.
1adf2230 2361 last_tick = mono_clock::now();
7c673cae
FG
2362 last_beacon.clear();
2363}
2364