]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/MDSMonitor.cc
update download target update for octopus release
[ceph.git] / ceph / src / mon / MDSMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
11fdf7f2 15#include <regex>
7c673cae
FG
16#include <sstream>
17#include <boost/utility.hpp>
18
19#include "MDSMonitor.h"
20#include "FSCommands.h"
21#include "Monitor.h"
22#include "MonitorDBStore.h"
23#include "OSDMonitor.h"
7c673cae
FG
24
25#include "common/strtol.h"
26#include "common/perf_counters.h"
27#include "common/config.h"
28#include "common/cmdparse.h"
29#include "messages/MMDSMap.h"
30#include "messages/MFSMap.h"
31#include "messages/MFSMapUser.h"
32#include "messages/MMDSLoadTargets.h"
33#include "messages/MMonCommand.h"
34#include "messages/MGenericMessage.h"
35
11fdf7f2 36#include "include/ceph_assert.h"
7c673cae
FG
37#include "include/str_list.h"
38#include "include/stringify.h"
39#include "mds/mdstypes.h"
40#include "Session.h"
41
42#define dout_subsys ceph_subsys_mon
43#undef dout_prefix
28e407b8
AA
44#define dout_prefix _prefix(_dout, mon, get_fsmap())
45static ostream& _prefix(std::ostream *_dout, Monitor *mon, const FSMap& fsmap) {
7c673cae
FG
46 return *_dout << "mon." << mon->name << "@" << mon->rank
47 << "(" << mon->get_state_name()
48 << ").mds e" << fsmap.get_epoch() << " ";
49}
50
3efd9988
FG
51static const string MDS_METADATA_PREFIX("mds_metadata");
52static const string MDS_HEALTH_PREFIX("mds_health");
53
54
7c673cae
FG
55/*
56 * Specialized implementation of cmd_getval to allow us to parse
57 * out strongly-typedef'd types
58 */
59template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
31f18b77 60 const std::string& k, mds_gid_t &val)
7c673cae
FG
61{
62 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
63}
64
65template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
31f18b77 66 const std::string& k, mds_rank_t &val)
7c673cae
FG
67{
68 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
69}
70
71template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
31f18b77 72 const std::string& k, MDSMap::DaemonState &val)
7c673cae
FG
73{
74 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
75}
76
7c673cae
FG
77// my methods
78
11fdf7f2
TL
// Log the given FSMap in a single dout entry at debug level `dblV`
// (a compile-time template argument): the text "print_map" followed by
// whatever m.print() writes to the dout stream.
79template <int dblV>
80void MDSMonitor::print_map(const FSMap& m)
7c673cae 81{
11fdf7f2 82 dout(dblV) << "print_map\n";
7c673cae
FG
// m.print() writes into the same stream opened by dout above; the entry is
// only terminated by the dendl below.
83 m.print(*_dout);
84 *_dout << dendl;
85}
86
87// service methods
// Service hook for initial state creation. In this implementation it only
// logs that it was called (debug level 10); nothing else is done here.
88void MDSMonitor::create_initial()
89{
90 dout(10) << "create_initial" << dendl;
91}
92
11fdf7f2 93void MDSMonitor::get_store_prefixes(std::set<string>& s) const
3efd9988
FG
94{
95 s.insert(service_name);
96 s.insert(MDS_METADATA_PREFIX);
97 s.insert(MDS_HEALTH_PREFIX);
98}
7c673cae
FG
99
100void MDSMonitor::update_from_paxos(bool *need_bootstrap)
101{
102 version_t version = get_last_committed();
28e407b8 103 if (version == get_fsmap().epoch)
7c673cae
FG
104 return;
105
106 dout(10) << __func__ << " version " << version
28e407b8 107 << ", my e " << get_fsmap().epoch << dendl;
11fdf7f2 108 ceph_assert(version > get_fsmap().epoch);
7c673cae 109
224ce89b
WB
110 load_health();
111
7c673cae
FG
112 // read and decode
113 bufferlist fsmap_bl;
114 fsmap_bl.clear();
115 int err = get_version(version, fsmap_bl);
11fdf7f2 116 ceph_assert(err == 0);
7c673cae 117
11fdf7f2 118 ceph_assert(fsmap_bl.length() > 0);
7c673cae 119 dout(10) << __func__ << " got " << version << dendl;
28e407b8 120 PaxosFSMap::decode(fsmap_bl);
7c673cae
FG
121
122 // new map
91327a77 123 dout(0) << "new map" << dendl;
11fdf7f2
TL
124 print_map<0>(get_fsmap());
125 if (!g_conf()->mon_mds_skip_sanity) {
28e407b8 126 get_fsmap().sanity();
7c673cae
FG
127 }
128
129 check_subs();
7c673cae
FG
130}
131
132void MDSMonitor::init()
133{
134 (void)load_metadata(pending_metadata);
135}
136
137void MDSMonitor::create_pending()
138{
28e407b8 139 auto &fsmap = PaxosFSMap::create_pending();
7c673cae 140
3efd9988 141 if (mon->osdmon()->is_readable()) {
28e407b8
AA
142 const auto &osdmap = mon->osdmon()->osdmap;
143 fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
3efd9988
FG
144 }
145
28e407b8 146 dout(10) << "create_pending e" << fsmap.epoch << dendl;
7c673cae
FG
147}
148
// Serialize the pending FSMap and the per-daemon health state into
// transaction `t`.
//
// Steps, in order:
//  - log/print the pending map and run its sanity check (unless
//    mon_mds_skip_sanity is set);
//  - stamp `modified` on every filesystem MDSMap whose epoch equals the
//    pending epoch;
//  - encode the map and write it plus the last-committed marker;
//  - write each pending_daemon_health entry under MDS_HEALTH_PREFIX and erase
//    the keys listed in pending_daemon_health_rm;
//  - build the health check map from per-daemon metrics and the map's own
//    checks, expand the %...% placeholders in the summaries, and encode it.
149void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
150{
28e407b8
AA
151 auto &pending = get_pending_fsmap_writeable();
152 auto &epoch = pending.epoch;
7c673cae 153
28e407b8 154 dout(10) << "encode_pending e" << epoch << dendl;
7c673cae
FG
155
156 // print map iff 'debug mon = 30' or higher
11fdf7f2
TL
157 print_map<30>(pending);
158 if (!g_conf()->mon_mds_skip_sanity) {
28e407b8 159 pending.sanity();
7c673cae
FG
160 }
161
162 // Set 'modified' on maps modified this epoch
28e407b8
AA
163 for (auto &p : pending.filesystems) {
164 if (p.second->mds_map.epoch == epoch) {
165 p.second->mds_map.modified = ceph_clock_now();
7c673cae
FG
166 }
167 }
168
169 // apply to paxos
// The pending epoch must be exactly one past the last committed version.
11fdf7f2 170 ceph_assert(get_last_committed() + 1 == pending.epoch);
28e407b8
AA
171 bufferlist pending_bl;
172 pending.encode(pending_bl, mon->get_quorum_con_features());
7c673cae
FG
173
174 /* put everything in the transaction */
28e407b8
AA
175 put_version(t, pending.epoch, pending_bl);
176 put_last_committed(t, pending.epoch);
7c673cae
FG
177
178 // Encode MDSHealth data
179 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
180 i != pending_daemon_health.end(); ++i) {
181 bufferlist bl;
182 i->second.encode(bl);
183 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
184 }
185
// Erase stored health for daemons queued for removal, then reset the queue.
186 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
187 i != pending_daemon_health_rm.end(); ++i) {
188 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
189 }
190 pending_daemon_health_rm.clear();
1adf2230 191 remove_from_metadata(pending, t);
224ce89b
WB
192
193 // health
194 health_check_map_t new_checks;
28e407b8 195 const auto &info_map = pending.get_mds_info();
224ce89b
WB
196 for (const auto &i : info_map) {
197 const auto &gid = i.first;
198 const auto &info = i.second;
// NOTE(review): pending_daemon_health_rm was cleared a few lines above, so
// this count() can never be non-zero here -- confirm whether the clear() was
// meant to happen after this loop.
199 if (pending_daemon_health_rm.count(gid)) {
200 continue;
201 }
// Prefer the in-memory pending health; otherwise fall back to the copy in
// the store, skipping the daemon (with an error log) if none is found.
202 MDSHealth health;
203 auto p = pending_daemon_health.find(gid);
204 if (p != pending_daemon_health.end()) {
205 health = p->second;
206 } else {
207 bufferlist bl;
208 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
209 if (!bl.length()) {
210 derr << "Missing health data for MDS " << gid << dendl;
211 continue;
212 }
11fdf7f2 213 auto bl_i = bl.cbegin();
224ce89b
WB
214 health.decode(bl_i);
215 }
// One detail line per metric: "mds<name>(mds.<rank>): <message>" followed by
// the metric's metadata as " k: v, k: v".
216 for (const auto &metric : health.metrics) {
d2e6a577 217 const int rank = info.rank;
224ce89b
WB
218 health_check_t *check = &new_checks.get_or_add(
219 mds_metric_name(metric.type),
220 metric.sev,
221 mds_metric_summary(metric.type));
222 ostringstream ss;
223 ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
28e407b8
AA
224 bool first = true;
225 for (auto &p : metric.metadata) {
226 if (first) {
227 ss << " ";
228 } else {
224ce89b 229 ss << ", ";
28e407b8
AA
230 }
231 ss << p.first << ": " << p.second;
232 first = false;
224ce89b
WB
233 }
234 check->detail.push_back(ss.str());
235 }
236 }
28e407b8 237 pending.get_health_checks(&new_checks);
// Expand summary placeholders from the number of detail lines:
// %num% -> count, %plurals% -> "s"/"", %isorare% -> "are"/"is",
// %hasorhave% -> "have"/"has" (plural forms when count > 1).
224ce89b 238 for (auto& p : new_checks.checks) {
11fdf7f2 239 p.second.summary = std::regex_replace(
224ce89b 240 p.second.summary,
11fdf7f2 241 std::regex("%num%"),
224ce89b 242 stringify(p.second.detail.size()));
11fdf7f2 243 p.second.summary = std::regex_replace(
224ce89b 244 p.second.summary,
11fdf7f2 245 std::regex("%plurals%"),
224ce89b 246 p.second.detail.size() > 1 ? "s" : "");
11fdf7f2 247 p.second.summary = std::regex_replace(
224ce89b 248 p.second.summary,
11fdf7f2 249 std::regex("%isorare%"),
224ce89b 250 p.second.detail.size() > 1 ? "are" : "is");
11fdf7f2 251 p.second.summary = std::regex_replace(
181888fb 252 p.second.summary,
11fdf7f2 253 std::regex("%hasorhave%"),
181888fb 254 p.second.detail.size() > 1 ? "have" : "has");
224ce89b
WB
255 }
256 encode_health(new_checks, t);
7c673cae
FG
257}
258
11fdf7f2 259version_t MDSMonitor::get_trim_to() const
7c673cae
FG
260{
261 version_t floor = 0;
11fdf7f2
TL
262 if (g_conf()->mon_mds_force_trim_to > 0 &&
263 g_conf()->mon_mds_force_trim_to < (int)get_last_committed()) {
264 floor = g_conf()->mon_mds_force_trim_to;
7c673cae
FG
265 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
266 << floor << dendl;
267 }
268
11fdf7f2 269 unsigned max = g_conf()->mon_max_mdsmap_epochs;
7c673cae
FG
270 version_t last = get_last_committed();
271
272 if (last - get_first_committed() > max && floor < last - max)
273 return last - max;
274 return floor;
275}
276
7c673cae
FG
277bool MDSMonitor::preprocess_query(MonOpRequestRef op)
278{
279 op->mark_mdsmon_event(__func__);
280 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
11fdf7f2
TL
281 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source()
282 << " " << m->get_orig_source_addrs() << dendl;
7c673cae
FG
283
284 switch (m->get_type()) {
285
286 case MSG_MDS_BEACON:
287 return preprocess_beacon(op);
288
289 case MSG_MON_COMMAND:
f64942e4
AA
290 try {
291 return preprocess_command(op);
11fdf7f2 292 } catch (const bad_cmd_get& e) {
f64942e4
AA
293 bufferlist bl;
294 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
295 return true;
296 }
7c673cae
FG
297
298 case MSG_MDS_OFFLOAD_TARGETS:
299 return preprocess_offload_targets(op);
300
301 default:
302 ceph_abort();
303 return true;
304 }
305}
306
307void MDSMonitor::_note_beacon(MMDSBeacon *m)
308{
309 mds_gid_t gid = mds_gid_t(m->get_global_id());
310 version_t seq = m->get_seq();
311
91327a77 312 dout(5) << "_note_beacon " << *m << " noting time" << dendl;
1adf2230
AA
313 auto &beacon = last_beacon[gid];
314 beacon.stamp = mono_clock::now();
315 beacon.seq = seq;
7c673cae
FG
316}
317
// Preprocess stage for an MMDSBeacon, working against the committed FSMap.
//
// Returns true when the beacon has been fully dealt with here (either a reply
// was sent or the message was dropped via mon->no_reply()), and false when it
// has to continue to the update stage. false is returned when this monitor is
// not the leader, for a STATE_BOOT beacon from a gid not in the map, when a
// laggy daemon reports in, on an acceptable state change, or when the
// reported health differs from pending_daemon_health.
//
// Control flow uses two labels at the end of the function:
//   reply  - note the beacon time and send back an MMDSBeacon ack
//   ignore - drop the message without replying
318bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
319{
320 op->mark_mdsmon_event(__func__);
321 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
322 MDSMap::DaemonState state = m->get_state();
323 mds_gid_t gid = m->get_global_id();
324 version_t seq = m->get_seq();
325 MDSMap::mds_info_t info;
326 epoch_t effective_epoch = 0;
327
1adf2230 328 const auto &fsmap = get_fsmap();
28e407b8 329
7c673cae 330 // check privileges, ignore if fails
11fdf7f2
TL
331 MonSession *session = op->get_session();
332 if (!session)
333 goto ignore;
7c673cae
FG
334 if (!session->is_capable("mds", MON_CAP_X)) {
335 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
336 << session->caps << dendl;
337 goto ignore;
338 }
339
// Drop beacons whose fsid does not match this monitor's monmap fsid.
340 if (m->get_fsid() != mon->monmap->fsid) {
341 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
342 goto ignore;
343 }
344
91327a77 345 dout(5) << "preprocess_beacon " << *m
11fdf7f2
TL
346 << " from " << m->get_orig_source()
347 << " " << m->get_orig_source_addrs()
7c673cae
FG
348 << " " << m->get_compat()
349 << dendl;
350
351 // make sure the address has a port
352 if (m->get_orig_source_addr().get_port() == 0) {
353 dout(1) << " ignoring boot message without a port" << dendl;
354 goto ignore;
355 }
356
357 // check compat
358 if (!m->get_compat().writeable(fsmap.compat)) {
11fdf7f2
TL
359 dout(1) << " mds " << m->get_orig_source()
360 << " " << m->get_orig_source_addrs()
361 << " can't write to fsmap " << fsmap.compat << dendl;
7c673cae
FG
362 goto ignore;
363 }
364
365 // fw to leader?
28e407b8 366 if (!is_leader())
7c673cae
FG
367 return false;
368
369 // booted, but not in map?
28e407b8 370 if (!fsmap.gid_exists(gid)) {
7c673cae
FG
371 if (state != MDSMap::STATE_BOOT) {
372 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
373 << ceph_mds_state_name(state) << ")" << dendl;
374
1adf2230
AA
375 /* We can't send an MDSMap this MDS was a part of because we no longer
376 * know which FS it was part of. Nor does this matter. Sending an empty
377 * MDSMap is sufficient for getting the MDS to respawn.
378 */
7c673cae
FG
379 MDSMap null_map;
380 null_map.epoch = fsmap.epoch;
381 null_map.compat = fsmap.compat;
// Note: this local `m` (the MMDSMap reply) shadows the beacon pointer `m`
// for the remainder of this inner scope.
11fdf7f2
TL
382 auto m = MMDSMap::create(mon->monmap->fsid, null_map);
383 mon->send_reply(op, m.detach());
7c673cae
FG
384 return true;
385 } else {
386 return false; // not booted yet.
387 }
388 }
389 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
28e407b8 390 info = fsmap.get_info_gid(gid);
7c673cae
FG
391
392 // old seq?
393 if (info.state_seq > seq) {
394 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
395 goto ignore;
396 }
397
398 // Work out the latest epoch that this daemon should have seen
// Daemons with no filesystem (FS_CLUSTER_ID_NONE) use their standby epoch;
// others use the epoch of their filesystem's MDSMap.
399 {
28e407b8 400 fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
7c673cae 401 if (fscid == FS_CLUSTER_ID_NONE) {
28e407b8 402 effective_epoch = fsmap.standby_epochs.at(gid);
7c673cae 403 } else {
28e407b8 404 effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
7c673cae
FG
405 }
406 if (effective_epoch != m->get_last_epoch_seen()) {
407 dout(10) << "mds_beacon " << *m
408 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
409 goto reply;
410 }
411 }
412
413 if (info.laggy()) {
414 _note_beacon(m);
415 return false; // no longer laggy, need to update map.
416 }
417 if (state == MDSMap::STATE_BOOT) {
418 // ignore, already booted.
419 goto ignore;
420 }
421 // is there a state change here?
422 if (info.state != state) {
423 // legal state change?
424 if ((info.state == MDSMap::STATE_STANDBY ||
425 info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
426 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
427 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
428 goto reply;
429 }
430
431 if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
432 && info.rank != MDS_RANK_NONE)
433 {
434 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
435 "held rank " << info.rank << " while requesting state "
436 << ceph_mds_state_name(state) << dendl;
437 goto reply;
438 }
439
440 _note_beacon(m);
441 return false;
442 }
443
444 // Comparing known daemon health with m->get_health()
445 // and return false (i.e. require proposal) if they
446 // do not match, to update our stored
// NOTE(review): pending_daemon_health is iterated as a std::map elsewhere in
// this file; if so, operator[] here inserts a default entry for an unknown
// gid even though this is the read-only stage -- confirm this is intended.
447 if (!(pending_daemon_health[gid] == m->get_health())) {
91327a77 448 dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
7c673cae
FG
449 _note_beacon(m);
450 return false;
451 }
452
453 reply:
454 // note time and reply
11fdf7f2 455 ceph_assert(effective_epoch > 0);
7c673cae 456 _note_beacon(m);
11fdf7f2
TL
457 {
458 auto beacon = MMDSBeacon::create(mon->monmap->fsid,
459 m->get_global_id(), m->get_name(), effective_epoch,
460 state, seq, CEPH_FEATURES_SUPPORTED_DEFAULT);
461 mon->send_reply(op, beacon.detach());
462 }
7c673cae
FG
463 return true;
464
465 ignore:
466 // I won't reply this beacon, drop it.
467 mon->no_reply(op);
468 return true;
469}
470
471bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
472{
473 op->mark_mdsmon_event(__func__);
474 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
475 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
28e407b8 476
1adf2230 477 const auto &fsmap = get_fsmap();
7c673cae
FG
478
479 // check privileges, ignore message if fails
11fdf7f2 480 MonSession *session = op->get_session();
7c673cae 481 if (!session)
1adf2230 482 goto ignore;
7c673cae
FG
483 if (!session->is_capable("mds", MON_CAP_X)) {
484 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
485 << session->caps << dendl;
1adf2230 486 goto ignore;
7c673cae
FG
487 }
488
489 if (fsmap.gid_exists(m->global_id) &&
490 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
1adf2230 491 goto ignore;
7c673cae
FG
492
493 return false;
494
1adf2230
AA
495 ignore:
496 mon->no_reply(op);
7c673cae
FG
497 return true;
498}
499
500
501bool MDSMonitor::prepare_update(MonOpRequestRef op)
502{
503 op->mark_mdsmon_event(__func__);
504 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
505 dout(7) << "prepare_update " << *m << dendl;
506
507 switch (m->get_type()) {
508
509 case MSG_MDS_BEACON:
510 return prepare_beacon(op);
511
512 case MSG_MON_COMMAND:
f64942e4
AA
513 try {
514 return prepare_command(op);
11fdf7f2 515 } catch (const bad_cmd_get& e) {
f64942e4
AA
516 bufferlist bl;
517 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
518 return true;
519 }
7c673cae
FG
520
521 case MSG_MDS_OFFLOAD_TARGETS:
522 return prepare_offload_targets(op);
523
524 default:
525 ceph_abort();
526 }
527
528 return true;
529}
530
// Update stage for an MMDSBeacon: applies the beacon to the pending FSMap.
//
// Outline:
//  - logs health metric types that appeared or disappeared relative to
//    pending_daemon_health[gid], then stores the new health;
//  - STATE_BOOT: optionally fails existing daemons with the same name
//    (mds_enforce_unique_name), inserts the daemon as STATE_STANDBY if its gid
//    is not yet in mds_roles, starts its beacon timer, updates the map compat
//    set if needed, and records its metadata;
//  - any other state: handles a gid missing from pending (null map reply after
//    the proposal finishes), the STOPPING restriction, clearing laggy, and
//    the STOPPED / DAMAGED / DNE / ordinary transition cases;
//  - finally queues a completion that replies once the proposal finishes.
//
// Returns false only on the paths that wait for the OSD monitor to become
// writeable (a C_RetryMessage is queued in that case); true otherwise.
531bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
532{
533 op->mark_mdsmon_event(__func__);
534 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
535 // -- this is an update --
11fdf7f2
TL
536 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source()
537 << " " << m->get_orig_source_addrs() << dendl;
538 entity_addrvec_t addrs = m->get_orig_source_addrs();
7c673cae
FG
539 mds_gid_t gid = m->get_global_id();
540 MDSMap::DaemonState state = m->get_state();
541 version_t seq = m->get_seq();
542
28e407b8
AA
543 auto &pending = get_pending_fsmap_writeable();
544
91327a77 545 dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
7c673cae
FG
546
547 // Calculate deltas of health metrics created and removed
548 // Do this by type rather than MDSHealthMetric equality, because messages can
549 // change a lot when they include e.g. a number of items.
550 const auto &old_health = pending_daemon_health[gid].metrics;
551 const auto &new_health = m->get_health().metrics;
552
553 std::set<mds_metric_t> old_types;
554 for (const auto &i : old_health) {
555 old_types.insert(i.type);
556 }
557
558 std::set<mds_metric_t> new_types;
559 for (const auto &i : new_health) {
560 new_types.insert(i.type);
561 }
562
// Newly appeared metric types are logged at debug level 10 only.
563 for (const auto &new_metric: new_health) {
564 if (old_types.count(new_metric.type) == 0) {
11fdf7f2 565 dout(10) << "MDS health message (" << m->get_orig_source()
28e407b8 566 << "): " << new_metric.sev << " " << new_metric.message << dendl;
7c673cae
FG
567 }
568 }
569
570 // Log the disappearance of health messages at INFO
571 for (const auto &old_metric : old_health) {
572 if (new_types.count(old_metric.type) == 0) {
573 mon->clog->info() << "MDS health message cleared ("
11fdf7f2 574 << m->get_orig_source() << "): " << old_metric.message;
7c673cae
FG
575 }
576 }
577
578 // Store health
// (old_health above refers into this same entry, so it must not be used
// after this assignment.)
579 pending_daemon_health[gid] = m->get_health();
580
581 // boot?
582 if (state == MDSMap::STATE_BOOT) {
583 // zap previous instance of this name?
11fdf7f2 584 if (g_conf()->mds_enforce_unique_name) {
7c673cae 585 bool failed_mds = false;
// Repeats until no daemon with this name remains in the pending map.
28e407b8 586 while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
7c673cae
FG
587 if (!mon->osdmon()->is_writeable()) {
588 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
589 return false;
590 }
d2e6a577 591 const MDSMap::mds_info_t &existing_info =
28e407b8 592 pending.get_info_gid(existing);
d2e6a577 593 mon->clog->info() << existing_info.human_name() << " restarted";
1adf2230 594 fail_mds_gid(pending, existing);
7c673cae
FG
595 failed_mds = true;
596 }
597 if (failed_mds) {
11fdf7f2 598 ceph_assert(mon->osdmon()->is_writeable());
7c673cae
FG
599 request_proposal(mon->osdmon());
600 }
601 }
602
603 // Add this daemon to the map
28e407b8 604 if (pending.mds_roles.count(gid) == 0) {
7c673cae
FG
605 MDSMap::mds_info_t new_info;
606 new_info.global_id = gid;
607 new_info.name = m->get_name();
11fdf7f2 608 new_info.addrs = addrs;
7c673cae
FG
609 new_info.mds_features = m->get_mds_features();
610 new_info.state = MDSMap::STATE_STANDBY;
611 new_info.state_seq = seq;
28e407b8 612 pending.insert(new_info);
7c673cae
FG
613 }
614
7c673cae 615 // initialize the beacon timer
1adf2230
AA
616 auto &beacon = last_beacon[gid];
617 beacon.stamp = mono_clock::now();
618 beacon.seq = seq;
7c673cae
FG
619
620 // new incompat?
28e407b8
AA
621 if (!pending.compat.writeable(m->get_compat())) {
622 dout(10) << " fsmap " << pending.compat
7c673cae
FG
623 << " can't write to new mds' " << m->get_compat()
624 << ", updating fsmap and killing old mds's"
625 << dendl;
28e407b8 626 pending.update_compat(m->get_compat());
7c673cae
FG
627 }
628
629 update_metadata(m->get_global_id(), m->get_sys_info());
630 } else {
631 // state update
91327a77
AA
632
633 if (!pending.gid_exists(gid)) {
634 /* gid has been removed from pending, send null map */
635 dout(5) << "mds_beacon " << *m << " is not in fsmap (state "
636 << ceph_mds_state_name(state) << ")" << dendl;
637
638 /* We can't send an MDSMap this MDS was a part of because we no longer
639 * know which FS it was part of. Nor does this matter. Sending an empty
640 * MDSMap is sufficient for getting the MDS to respawn.
641 */
// The reply is built from the committed fsmap at the time the proposal
// finishes, not from the pending map seen here.
642 wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
643 if (r >= 0) {
644 const auto& fsmap = get_fsmap();
645 MDSMap null_map;
646 null_map.epoch = fsmap.epoch;
647 null_map.compat = fsmap.compat;
11fdf7f2
TL
648 auto m = MMDSMap::create(mon->monmap->fsid, null_map);
649 mon->send_reply(op, m.detach());
91327a77
AA
650 } else {
651 dispatch(op); // try again
652 }
653 }));
654 return true;
655 }
656
11fdf7f2 657 const auto& info = pending.get_info_gid(gid);
f64942e4
AA
658 if (info.state == MDSMap::STATE_STOPPING &&
659 state != MDSMap::STATE_STOPPING &&
660 state != MDSMap::STATE_STOPPED) {
7c673cae
FG
661 // we can't transition to any other states from STOPPING
662 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
663 << dendl;
664 _note_beacon(m);
665 return true;
666 }
667
668 if (info.laggy()) {
11fdf7f2
TL
669 dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl;
670 pending.modify_daemon(info.global_id, [](auto& info)
7c673cae 671 {
11fdf7f2 672 info.clear_laggy();
7c673cae
FG
673 }
674 );
675 }
676
91327a77 677 dout(5) << "prepare_beacon mds." << info.rank
7c673cae
FG
678 << " " << ceph_mds_state_name(info.state)
679 << " -> " << ceph_mds_state_name(state)
7c673cae
FG
680 << dendl;
681 if (state == MDSMap::STATE_STOPPED) {
28e407b8
AA
682 const auto fscid = pending.mds_roles.at(gid);
683 const auto &fs = pending.get_filesystem(fscid);
181888fb 684
d2e6a577 685 mon->clog->info() << info.human_name() << " finished "
11fdf7f2 686 << "stopping rank " << info.rank << " in filesystem "
d2e6a577 687 << fs->mds_map.fs_name << " (now has "
181888fb 688 << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
d2e6a577 689
// pending.stop() returns further gids that were erased; the stopping gid is
// appended so all of them lose their beacon and health bookkeeping below.
28e407b8 690 auto erased = pending.stop(gid);
7c673cae
FG
691 erased.push_back(gid);
692
693 for (const auto &erased_gid : erased) {
694 last_beacon.erase(erased_gid);
695 if (pending_daemon_health.count(erased_gid)) {
696 pending_daemon_health.erase(erased_gid);
697 pending_daemon_health_rm.insert(erased_gid);
698 }
699 }
d2e6a577
FG
700
701
7c673cae
FG
702 } else if (state == MDSMap::STATE_DAMAGED) {
703 if (!mon->osdmon()->is_writeable()) {
91327a77 704 dout(1) << __func__ << ": DAMAGED from rank " << info.rank
7c673cae
FG
705 << " waiting for osdmon writeable to blacklist it" << dendl;
706 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
707 return false;
708 }
709
710 // Record this MDS rank as damaged, so that other daemons
711 // won't try to run it.
91327a77 712 dout(0) << __func__ << ": marking rank "
7c673cae
FG
713 << info.rank << " damaged" << dendl;
714
// Blacklist the daemon's addresses until now + mon_mds_blacklist_interval.
715 utime_t until = ceph_clock_now();
11fdf7f2
TL
716 until += g_conf().get_val<double>("mon_mds_blacklist_interval");
717 const auto blacklist_epoch = mon->osdmon()->blacklist(info.addrs, until);
7c673cae 718 request_proposal(mon->osdmon());
28e407b8 719 pending.damaged(gid, blacklist_epoch);
7c673cae
FG
720 last_beacon.erase(gid);
721
722 // Respond to MDS, so that it knows it can continue to shut down
11fdf7f2 723 auto beacon = MMDSBeacon::create(
7c673cae 724 mon->monmap->fsid, m->get_global_id(),
28e407b8 725 m->get_name(), pending.get_epoch(), state, seq,
11fdf7f2
TL
726 CEPH_FEATURES_SUPPORTED_DEFAULT);
727 mon->send_reply(op, beacon.detach());
7c673cae
FG
728 } else if (state == MDSMap::STATE_DNE) {
729 if (!mon->osdmon()->is_writeable()) {
91327a77 730 dout(1) << __func__ << ": DNE from rank " << info.rank
7c673cae
FG
731 << " waiting for osdmon writeable to blacklist it" << dendl;
732 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
733 return false;
734 }
735
1adf2230 736 fail_mds_gid(pending, gid);
11fdf7f2 737 ceph_assert(mon->osdmon()->is_writeable());
7c673cae
FG
738 request_proposal(mon->osdmon());
739
740 // Respond to MDS, so that it knows it can continue to shut down
11fdf7f2
TL
741 auto beacon = MMDSBeacon::create(mon->monmap->fsid,
742 m->get_global_id(), m->get_name(), pending.get_epoch(), state, seq,
743 CEPH_FEATURES_SUPPORTED_DEFAULT);
744 mon->send_reply(op, beacon.detach());
7c673cae
FG
745 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
746 // Standby daemons should never modify their own
747 // state. Reject any attempts to do so.
748 derr << "standby " << gid << " attempted to change state to "
749 << ceph_mds_state_name(state) << ", rejecting" << dendl;
750 return true;
751 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
752 !MDSMap::state_transition_valid(info.state, state)) {
753 // Validate state transitions for daemons that hold a rank
754 derr << "daemon " << gid << " (rank " << info.rank << ") "
755 << "reported invalid state transition "
756 << ceph_mds_state_name(info.state) << " -> "
757 << ceph_mds_state_name(state) << dendl;
758 return true;
759 } else {
b32b8144 760 if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
28e407b8
AA
761 const auto &fscid = pending.mds_roles.at(gid);
762 const auto &fs = pending.get_filesystem(fscid);
d2e6a577
FG
763 mon->clog->info() << info.human_name() << " is now active in "
764 << "filesystem " << fs->mds_map.fs_name << " as rank "
765 << info.rank;
766 }
b32b8144
FG
767
768 // Made it through special cases and validations, record the
769 // daemon's reported state to the FSMap.
11fdf7f2
TL
770 pending.modify_daemon(gid, [state, seq](auto& info) {
771 info.state = state;
772 info.state_seq = seq;
b32b8144 773 });
7c673cae
FG
774 }
775 }
776
91327a77 777 dout(5) << "prepare_beacon pending map now:" << dendl;
// print_map is called without an explicit level here, unlike the
// print_map<0>/<30> calls elsewhere in this file.
28e407b8 778 print_map(pending);
7c673cae
FG
779
// Reply once the proposal completes: success -> _updated(), -ECANCELED ->
// drop without reply, any other error -> re-dispatch the op.
780 wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
781 if (r >= 0)
782 _updated(op); // success
783 else if (r == -ECANCELED) {
784 mon->no_reply(op);
785 } else {
786 dispatch(op); // try again
787 }
788 }));
789
790 return true;
791}
792
793bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
794{
28e407b8
AA
795 auto &pending = get_pending_fsmap_writeable();
796
7c673cae
FG
797 op->mark_mdsmon_event(__func__);
798 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
799 mds_gid_t gid = m->global_id;
28e407b8 800 if (pending.gid_has_rank(gid)) {
7c673cae 801 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
28e407b8 802 pending.update_export_targets(gid, m->targets);
7c673cae
FG
803 } else {
804 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
805 }
91327a77 806 mon->no_reply(op);
7c673cae
FG
807 return true;
808}
809
810bool MDSMonitor::should_propose(double& delay)
811{
812 // delegate to PaxosService to assess whether we should propose
813 return PaxosService::should_propose(delay);
814}
815
816void MDSMonitor::_updated(MonOpRequestRef op)
817{
28e407b8 818 const auto &fsmap = get_fsmap();
7c673cae
FG
819 op->mark_mdsmon_event(__func__);
820 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
821 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
11fdf7f2
TL
822 mon->clog->debug() << m->get_orig_source() << " "
823 << m->get_orig_source_addrs() << " "
824 << ceph_mds_state_name(m->get_state());
7c673cae
FG
825
826 if (m->get_state() == MDSMap::STATE_STOPPED) {
827 // send the map manually (they're out of the map, so they won't get it automatic)
828 MDSMap null_map;
829 null_map.epoch = fsmap.epoch;
830 null_map.compat = fsmap.compat;
11fdf7f2
TL
831 auto m = MMDSMap::create(mon->monmap->fsid, null_map);
832 mon->send_reply(op, m.detach());
7c673cae 833 } else {
11fdf7f2
TL
834 auto beacon = MMDSBeacon::create(mon->monmap->fsid,
835 m->get_global_id(), m->get_name(), fsmap.get_epoch(),
836 m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT);
837 mon->send_reply(op, beacon.detach());
7c673cae
FG
838 }
839}
840
841void MDSMonitor::on_active()
842{
843 tick();
7c673cae 844
28e407b8
AA
845 if (is_leader()) {
846 mon->clog->debug() << "fsmap " << get_fsmap();
224ce89b 847 }
7c673cae
FG
848}
849
7c673cae
FG
850void MDSMonitor::dump_info(Formatter *f)
851{
852 f->open_object_section("fsmap");
28e407b8 853 get_fsmap().dump(f);
7c673cae
FG
854 f->close_section();
855
856 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
857 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
858}
859
860bool MDSMonitor::preprocess_command(MonOpRequestRef op)
861{
862 op->mark_mdsmon_event(__func__);
863 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
864 int r = -1;
865 bufferlist rdata;
866 stringstream ss, ds;
867
1adf2230 868 const auto &fsmap = get_fsmap();
28e407b8 869
11fdf7f2 870 cmdmap_t cmdmap;
7c673cae
FG
871 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
872 // ss has reason for failure
873 string rs = ss.str();
874 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
875 return true;
876 }
877
878 string prefix;
879 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
880 string format;
881 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
1adf2230 882 std::unique_ptr<Formatter> f(Formatter::create(format));
7c673cae 883
11fdf7f2 884 MonSession *session = op->get_session();
7c673cae
FG
885 if (!session) {
886 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
887 return true;
888 }
889
890 if (prefix == "mds stat") {
891 if (f) {
892 f->open_object_section("mds_stat");
893 dump_info(f.get());
894 f->close_section();
895 f->flush(ds);
896 } else {
897 ds << fsmap;
898 }
899 r = 0;
11fdf7f2
TL
900 } else if (prefix == "mds ok-to-stop") {
901 vector<string> ids;
902 if (!cmd_getval(g_ceph_context, cmdmap, "ids", ids)) {
903 r = -EINVAL;
904 ss << "must specify mds id";
905 goto out;
906 }
907 if (fsmap.is_any_degraded()) {
908 ss << "one or more filesystems is currently degraded";
909 r = -EBUSY;
910 goto out;
911 }
912 set<mds_gid_t> stopping;
913 for (auto& id : ids) {
914 ostringstream ess;
915 mds_gid_t gid = gid_from_arg(fsmap, id, ess);
916 if (gid == MDS_GID_NONE) {
917 // the mds doesn't exist, but no file systems are unhappy, so losing it
918 // can't have any effect.
919 continue;
920 }
921 stopping.insert(gid);
922 }
923 set<mds_gid_t> active;
924 set<mds_gid_t> standby;
925 for (auto gid : stopping) {
926 if (fsmap.gid_has_rank(gid)) {
927 // ignore standby-replay daemons (at this level)
928 if (!fsmap.is_standby_replay(gid)) {
929 auto standby = fsmap.get_standby_replay(gid);
930 if (standby == MDS_GID_NONE ||
931 stopping.count(standby)) {
932 // no standby-replay, or we're also stopping the standby-replay
933 // for this mds
934 active.insert(gid);
935 }
936 }
7c673cae 937 } else {
11fdf7f2
TL
938 // net loss of a standby
939 standby.insert(gid);
7c673cae
FG
940 }
941 }
11fdf7f2
TL
942 if (fsmap.get_num_standby() - standby.size() < active.size()) {
943 r = -EBUSY;
944 ss << "insufficent standby MDS daemons to stop active gids "
945 << stringify(active)
946 << " and/or standby gids " << stringify(standby);;
947 goto out;
28e407b8 948 }
11fdf7f2
TL
949 r = 0;
950 ss << "should be safe to stop " << ids;
7c673cae
FG
951 } else if (prefix == "fs dump") {
952 int64_t epocharg;
953 epoch_t epoch;
954
1adf2230 955 const FSMap *fsmapp = &fsmap;
28e407b8 956 FSMap dummy;
7c673cae
FG
957 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
958 epoch = epocharg;
959 bufferlist b;
960 int err = get_version(epoch, b);
961 if (err == -ENOENT) {
7c673cae 962 r = -ENOENT;
28e407b8 963 goto out;
7c673cae 964 } else {
11fdf7f2
TL
965 ceph_assert(err == 0);
966 ceph_assert(b.length());
28e407b8
AA
967 dummy.decode(b);
968 fsmapp = &dummy;
7c673cae
FG
969 }
970 }
c07f9fc5 971
28e407b8
AA
972 stringstream ds;
973 if (f != NULL) {
974 f->open_object_section("fsmap");
975 fsmapp->dump(f.get());
976 f->close_section();
977 f->flush(ds);
978 r = 0;
979 } else {
980 fsmapp->print(ds);
981 r = 0;
7c673cae 982 }
28e407b8
AA
983
984 rdata.append(ds);
985 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
7c673cae
FG
986 } else if (prefix == "mds metadata") {
987 if (!f)
988 f.reset(Formatter::create("json-pretty"));
989
990 string who;
991 bool all = !cmd_getval(g_ceph_context, cmdmap, "who", who);
992 dout(1) << "all = " << all << dendl;
993 if (all) {
994 r = 0;
995 // Dump all MDSs' metadata
996 const auto all_info = fsmap.get_mds_info();
997
998 f->open_array_section("mds_metadata");
999 for(const auto &i : all_info) {
1000 const auto &info = i.second;
1001
1002 f->open_object_section("mds");
1003 f->dump_string("name", info.name);
1004 std::ostringstream get_err;
1adf2230 1005 r = dump_metadata(fsmap, info.name, f.get(), get_err);
7c673cae
FG
1006 if (r == -EINVAL || r == -ENOENT) {
1007 // Drop error, list what metadata we do have
1008 dout(1) << get_err.str() << dendl;
1009 r = 0;
1010 } else if (r != 0) {
1011 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1012 << dendl;
1013 ss << get_err.str();
c07f9fc5 1014 f->close_section();
7c673cae
FG
1015 break;
1016 }
1017 f->close_section();
1018 }
1019 f->close_section();
1020 } else {
1021 // Dump a single daemon's metadata
1022 f->open_object_section("mds_metadata");
1adf2230 1023 r = dump_metadata(fsmap, who, f.get(), ss);
7c673cae
FG
1024 f->close_section();
1025 }
1026 f->flush(ds);
31f18b77
FG
1027 } else if (prefix == "mds versions") {
1028 if (!f)
1029 f.reset(Formatter::create("json-pretty"));
1030 count_metadata("ceph_version", f.get());
1031 f->flush(ds);
1032 r = 0;
1033 } else if (prefix == "mds count-metadata") {
1034 if (!f)
1035 f.reset(Formatter::create("json-pretty"));
1036 string field;
1037 cmd_getval(g_ceph_context, cmdmap, "property", field);
1038 count_metadata(field, f.get());
1039 f->flush(ds);
1040 r = 0;
7c673cae
FG
1041 } else if (prefix == "mds compat show") {
1042 if (f) {
1043 f->open_object_section("mds_compat");
1044 fsmap.compat.dump(f.get());
1045 f->close_section();
1046 f->flush(ds);
1047 } else {
1048 ds << fsmap.compat;
1049 }
1050 r = 0;
1051 } else if (prefix == "fs get") {
1052 string fs_name;
1053 cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
28e407b8 1054 const auto &fs = fsmap.get_filesystem(fs_name);
7c673cae
FG
1055 if (fs == nullptr) {
1056 ss << "filesystem '" << fs_name << "' not found";
1057 r = -ENOENT;
1058 } else {
1059 if (f != nullptr) {
1060 f->open_object_section("filesystem");
1061 fs->dump(f.get());
1062 f->close_section();
1063 f->flush(ds);
1064 r = 0;
1065 } else {
1066 fs->print(ds);
1067 r = 0;
1068 }
1069 }
1070 } else if (prefix == "fs ls") {
1071 if (f) {
1072 f->open_array_section("filesystems");
1adf2230
AA
1073 for (const auto &p : fsmap.filesystems) {
1074 const auto &fs = p.second;
1075 f->open_object_section("filesystem");
1076 {
1077 const MDSMap &mds_map = fs->mds_map;
1078 f->dump_string("name", mds_map.fs_name);
1079 /* Output both the names and IDs of pools, for use by
1080 * humans and machines respectively */
1081 f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
1082 mds_map.metadata_pool));
1083 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1084 f->open_array_section("data_pool_ids");
1085 for (const auto &id : mds_map.data_pools) {
1086 f->dump_int("data_pool_id", id);
1087 }
1088 f->close_section();
7c673cae 1089
1adf2230
AA
1090 f->open_array_section("data_pools");
1091 for (const auto &id : mds_map.data_pools) {
1092 const auto &name = mon->osdmon()->osdmap.get_pool_name(id);
1093 f->dump_string("data_pool", name);
7c673cae
FG
1094 }
1095 f->close_section();
1096 }
1adf2230 1097 f->close_section();
7c673cae
FG
1098 }
1099 f->close_section();
1100 f->flush(ds);
1101 } else {
28e407b8
AA
1102 for (const auto &p : fsmap.filesystems) {
1103 const auto &fs = p.second;
7c673cae
FG
1104 const MDSMap &mds_map = fs->mds_map;
1105 const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
1106 mds_map.metadata_pool);
1107
1108 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1109 << md_pool_name << ", data pools: [";
1adf2230
AA
1110 for (const auto &id : mds_map.data_pools) {
1111 const string &pool_name = mon->osdmon()->osdmap.get_pool_name(id);
7c673cae
FG
1112 ds << pool_name << " ";
1113 }
1114 ds << "]" << std::endl;
1115 }
1116
1117 if (fsmap.filesystems.empty()) {
1118 ds << "No filesystems enabled" << std::endl;
1119 }
1120 }
1121 r = 0;
1122 }
1123
28e407b8 1124out:
7c673cae
FG
1125 if (r != -1) {
1126 rdata.append(ds);
1127 string rs;
1128 getline(ss, rs);
1129 mon->reply_command(op, r, rs, rdata, get_last_committed());
1130 return true;
1131 } else
1132 return false;
1133}
1134
1adf2230 1135bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
7c673cae 1136{
1adf2230 1137 const MDSMap::mds_info_t &info = fsmap.get_info_gid(gid);
91327a77 1138 dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
7c673cae 1139
a8e16298
TL
1140 ceph_assert(mon->osdmon()->is_writeable());
1141
7c673cae
FG
1142 epoch_t blacklist_epoch = 0;
1143 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1144 utime_t until = ceph_clock_now();
11fdf7f2
TL
1145 until += g_conf().get_val<double>("mon_mds_blacklist_interval");
1146 blacklist_epoch = mon->osdmon()->blacklist(info.addrs, until);
7c673cae
FG
1147 }
1148
1adf2230 1149 fsmap.erase(gid, blacklist_epoch);
7c673cae
FG
1150 last_beacon.erase(gid);
1151 if (pending_daemon_health.count(gid)) {
1152 pending_daemon_health.erase(gid);
1153 pending_daemon_health_rm.insert(gid);
1154 }
1155
1156 return blacklist_epoch != 0;
1157}
1158
1adf2230 1159mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
7c673cae
FG
1160{
1161 // Try parsing as a role
1162 mds_role_t role;
1163 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1adf2230 1164 int r = fsmap.parse_role(arg, &role, ignore_err);
7c673cae
FG
1165 if (r == 0) {
1166 // See if a GID is assigned to this role
28e407b8 1167 const auto &fs = fsmap.get_filesystem(role.fscid);
11fdf7f2 1168 ceph_assert(fs != nullptr); // parse_role ensures it exists
7c673cae
FG
1169 if (fs->mds_map.is_up(role.rank)) {
1170 dout(10) << __func__ << ": validated rank/GID " << role
1171 << " as a rank" << dendl;
1172 return fs->mds_map.get_mds_info(role.rank).global_id;
1173 }
1174 }
1175
1176 // Try parsing as a gid
1177 std::string err;
1178 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1179 if (!err.empty()) {
1180 // Not a role or a GID, try as a daemon name
28e407b8 1181 const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
7c673cae
FG
1182 if (!mds_info) {
1183 ss << "MDS named '" << arg
1184 << "' does not exist, or is not up";
1185 return MDS_GID_NONE;
1186 }
1187 dout(10) << __func__ << ": resolved MDS name '" << arg
1188 << "' to GID " << mds_info->global_id << dendl;
1189 return mds_info->global_id;
1190 } else {
1191 // Not a role, but parses as a an integer, might be a GID
1192 dout(10) << __func__ << ": treating MDS reference '" << arg
1193 << "' as an integer " << maybe_gid << dendl;
31f18b77 1194
28e407b8 1195 if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
31f18b77 1196 return mds_gid_t(maybe_gid);
7c673cae
FG
1197 }
1198 }
1199
1200 dout(1) << __func__ << ": rank/GID " << arg
1201 << " not a existent rank or GID" << dendl;
1202 return MDS_GID_NONE;
1203}
1204
1adf2230
AA
1205int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
1206 const std::string &arg, MDSMap::mds_info_t *failed_info)
7c673cae 1207{
11fdf7f2 1208 ceph_assert(failed_info != nullptr);
d2e6a577 1209
1adf2230 1210 mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
7c673cae
FG
1211 if (gid == MDS_GID_NONE) {
1212 return 0;
1213 }
1214 if (!mon->osdmon()->is_writeable()) {
1215 return -EAGAIN;
1216 }
d2e6a577
FG
1217
1218 // Take a copy of the info before removing the MDS from the map,
1219 // so that the caller knows which mds (if any) they ended up removing.
1adf2230 1220 *failed_info = fsmap.get_info_gid(gid);
d2e6a577 1221
1adf2230 1222 fail_mds_gid(fsmap, gid);
7c673cae 1223 ss << "failed mds gid " << gid;
11fdf7f2 1224 ceph_assert(mon->osdmon()->is_writeable());
7c673cae
FG
1225 request_proposal(mon->osdmon());
1226 return 0;
1227}
1228
1229bool MDSMonitor::prepare_command(MonOpRequestRef op)
1230{
1231 op->mark_mdsmon_event(__func__);
1232 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
1233 int r = -EINVAL;
1234 stringstream ss;
1235 bufferlist rdata;
1236
11fdf7f2 1237 cmdmap_t cmdmap;
7c673cae
FG
1238 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1239 string rs = ss.str();
1240 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1241 return true;
1242 }
1243
1244 string prefix;
1245 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
1246
1247 /* Refuse access if message not associated with a valid session */
11fdf7f2 1248 MonSession *session = op->get_session();
7c673cae
FG
1249 if (!session) {
1250 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1251 return true;
1252 }
1253
28e407b8
AA
1254 auto &pending = get_pending_fsmap_writeable();
1255
c07f9fc5 1256 bool batched_propose = false;
28e407b8 1257 for (const auto &h : handlers) {
7c673cae 1258 if (h->can_handle(prefix)) {
c07f9fc5
FG
1259 batched_propose = h->batched_propose();
1260 if (batched_propose) {
1261 paxos->plug();
1262 }
28e407b8 1263 r = h->handle(mon, pending, op, cmdmap, ss);
c07f9fc5
FG
1264 if (batched_propose) {
1265 paxos->unplug();
1266 }
1267
7c673cae
FG
1268 if (r == -EAGAIN) {
1269 // message has been enqueued for retry; return.
1270 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1271 return false;
1272 } else {
1273 if (r == 0) {
1274 // On successful updates, print the updated map
28e407b8 1275 print_map(pending);
7c673cae
FG
1276 }
1277 // Successful or not, we're done: respond.
1278 goto out;
1279 }
1280 }
1281 }
1282
1adf2230 1283 r = filesystem_command(pending, op, prefix, cmdmap, ss);
7c673cae
FG
1284 if (r >= 0) {
1285 goto out;
1286 } else if (r == -EAGAIN) {
1287 // Do not reply, the message has been enqueued for retry
1288 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1289 return false;
1290 } else if (r != -ENOSYS) {
1291 goto out;
1292 }
1293
7c673cae
FG
1294 if (r == -ENOSYS && ss.str().empty()) {
1295 ss << "unrecognized command";
1296 }
1297
1298out:
1299 dout(4) << __func__ << " done, r=" << r << dendl;
1300 /* Compose response */
1301 string rs;
1302 getline(ss, rs);
1303
1304 if (r >= 0) {
1305 // success.. delay reply
1306 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1307 get_last_committed() + 1));
c07f9fc5
FG
1308 if (batched_propose) {
1309 force_immediate_propose();
1310 }
7c673cae
FG
1311 return true;
1312 } else {
1313 // reply immediately
1314 mon->reply_command(op, r, rs, rdata, get_last_committed());
1315 return false;
1316 }
1317}
1318
7c673cae 1319int MDSMonitor::filesystem_command(
1adf2230 1320 FSMap &fsmap,
7c673cae
FG
1321 MonOpRequestRef op,
1322 std::string const &prefix,
11fdf7f2 1323 const cmdmap_t& cmdmap,
7c673cae
FG
1324 std::stringstream &ss)
1325{
1326 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1327 op->mark_mdsmon_event(__func__);
1328 int r = 0;
1329 string whostr;
11fdf7f2 1330 cmd_getval(g_ceph_context, cmdmap, "role", whostr);
7c673cae 1331
11fdf7f2 1332 if (prefix == "mds set_state") {
7c673cae
FG
1333 mds_gid_t gid;
1334 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1335 ss << "error parsing 'gid' value '"
11fdf7f2 1336 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
7c673cae
FG
1337 return -EINVAL;
1338 }
1339 MDSMap::DaemonState state;
1340 if (!cmd_getval(g_ceph_context, cmdmap, "state", state)) {
1341 ss << "error parsing 'state' string value '"
11fdf7f2 1342 << cmd_vartype_stringify(cmdmap.at("state")) << "'";
7c673cae
FG
1343 return -EINVAL;
1344 }
1adf2230 1345 if (fsmap.gid_exists(gid)) {
11fdf7f2
TL
1346 fsmap.modify_daemon(gid, [state](auto& info) {
1347 info.state = state;
7c673cae
FG
1348 });
1349 ss << "set mds gid " << gid << " to state " << state << " "
1350 << ceph_mds_state_name(state);
1351 return 0;
1352 }
1353 } else if (prefix == "mds fail") {
1354 string who;
11fdf7f2 1355 cmd_getval(g_ceph_context, cmdmap, "role_or_gid", who);
d2e6a577
FG
1356
1357 MDSMap::mds_info_t failed_info;
1adf2230 1358 r = fail_mds(fsmap, ss, who, &failed_info);
7c673cae
FG
1359 if (r < 0 && r == -EAGAIN) {
1360 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1361 return -EAGAIN; // don't propose yet; wait for message to be retried
d2e6a577
FG
1362 } else if (r == 0) {
1363 // Only log if we really did something (not when was already gone)
1364 if (failed_info.global_id != MDS_GID_NONE) {
1365 mon->clog->info() << failed_info.human_name() << " marked failed by "
1366 << op->get_session()->entity_name;
1367 }
7c673cae
FG
1368 }
1369 } else if (prefix == "mds rm") {
1370 mds_gid_t gid;
1371 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1372 ss << "error parsing 'gid' value '"
11fdf7f2 1373 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
7c673cae
FG
1374 return -EINVAL;
1375 }
1adf2230 1376 if (!fsmap.gid_exists(gid)) {
11fdf7f2 1377 ss << "mds gid " << gid << " does not exist";
7c673cae
FG
1378 r = 0;
1379 } else {
1adf2230 1380 const auto &info = fsmap.get_info_gid(gid);
28e407b8 1381 MDSMap::DaemonState state = info.state;
7c673cae 1382 if (state > 0) {
28e407b8
AA
1383 ss << "cannot remove active mds." << info.name
1384 << " rank " << info.rank;
7c673cae
FG
1385 return -EBUSY;
1386 } else {
1adf2230 1387 fsmap.erase(gid, {});
7c673cae
FG
1388 ss << "removed mds gid " << gid;
1389 return 0;
1390 }
1391 }
1392 } else if (prefix == "mds rmfailed") {
11fdf7f2
TL
1393 bool confirm = false;
1394 cmd_getval(g_ceph_context, cmdmap, "yes_i_really_mean_it", confirm);
1395 if (!confirm) {
7c673cae
FG
1396 ss << "WARNING: this can make your filesystem inaccessible! "
1397 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1398 return -EPERM;
1399 }
1400
1401 std::string role_str;
11fdf7f2 1402 cmd_getval(g_ceph_context, cmdmap, "role", role_str);
7c673cae 1403 mds_role_t role;
1adf2230 1404 int r = fsmap.parse_role(role_str, &role, ss);
7c673cae
FG
1405 if (r < 0) {
1406 ss << "invalid role '" << role_str << "'";
1407 return -EINVAL;
1408 }
1409
1adf2230 1410 fsmap.modify_filesystem(
7c673cae
FG
1411 role.fscid,
1412 [role](std::shared_ptr<Filesystem> fs)
1413 {
1414 fs->mds_map.failed.erase(role.rank);
1415 });
1416
1417 ss << "removed failed mds." << role;
1418 return 0;
1419 } else if (prefix == "mds compat rm_compat") {
1420 int64_t f;
1421 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1422 ss << "error parsing feature value '"
11fdf7f2 1423 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
7c673cae
FG
1424 return -EINVAL;
1425 }
1adf2230 1426 if (fsmap.compat.compat.contains(f)) {
7c673cae 1427 ss << "removing compat feature " << f;
1adf2230 1428 CompatSet modified = fsmap.compat;
7c673cae 1429 modified.compat.remove(f);
1adf2230 1430 fsmap.update_compat(modified);
7c673cae 1431 } else {
1adf2230 1432 ss << "compat feature " << f << " not present in " << fsmap.compat;
7c673cae
FG
1433 }
1434 r = 0;
1435 } else if (prefix == "mds compat rm_incompat") {
1436 int64_t f;
1437 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1438 ss << "error parsing feature value '"
11fdf7f2 1439 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
7c673cae
FG
1440 return -EINVAL;
1441 }
1adf2230 1442 if (fsmap.compat.incompat.contains(f)) {
7c673cae 1443 ss << "removing incompat feature " << f;
1adf2230 1444 CompatSet modified = fsmap.compat;
7c673cae 1445 modified.incompat.remove(f);
1adf2230 1446 fsmap.update_compat(modified);
7c673cae 1447 } else {
1adf2230 1448 ss << "incompat feature " << f << " not present in " << fsmap.compat;
7c673cae
FG
1449 }
1450 r = 0;
1451 } else if (prefix == "mds repaired") {
1452 std::string role_str;
11fdf7f2 1453 cmd_getval(g_ceph_context, cmdmap, "role", role_str);
7c673cae 1454 mds_role_t role;
1adf2230 1455 r = fsmap.parse_role(role_str, &role, ss);
7c673cae
FG
1456 if (r < 0) {
1457 return r;
1458 }
1459
1adf2230 1460 bool modified = fsmap.undamaged(role.fscid, role.rank);
7c673cae 1461 if (modified) {
494da23a 1462 ss << "repaired: restoring rank " << role;
7c673cae 1463 } else {
494da23a 1464 ss << "nothing to do: rank is not damaged";
7c673cae
FG
1465 }
1466
1467 r = 0;
11fdf7f2
TL
1468 } else if (prefix == "mds freeze") {
1469 std::string who;
1470 cmd_getval(g_ceph_context, cmdmap, "role_or_gid", who);
1471 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1472 if (gid == MDS_GID_NONE) {
7c673cae
FG
1473 return -EINVAL;
1474 }
1475
11fdf7f2 1476 bool freeze = false;
7c673cae 1477 {
11fdf7f2
TL
1478 std::string str;
1479 cmd_getval(g_ceph_context, cmdmap, "val", str);
1480 if ((r = parse_bool(str, &freeze, ss)) != 0) {
1481 return r;
1482 }
1483 }
7c673cae 1484
11fdf7f2
TL
1485 auto f = [freeze,gid,&ss](auto& info) {
1486 if (freeze) {
1487 ss << "freezing mds." << gid;
1488 info.freeze();
1489 } else {
1490 ss << "unfreezing mds." << gid;
1491 info.unfreeze();
1492 }
1493 };
1494 fsmap.modify_daemon(gid, f);
7c673cae
FG
1495 r = 0;
1496 } else {
1497 return -ENOSYS;
1498 }
1499
1500 return r;
1501}
1502
7c673cae
FG
1503void MDSMonitor::check_subs()
1504{
1505 std::list<std::string> types;
1506
1507 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1508 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1509 // filesystems. Build a list of all the types we service
1510 // subscriptions for.
1511 types.push_back("fsmap");
1512 types.push_back("fsmap.user");
1513 types.push_back("mdsmap");
28e407b8
AA
1514 for (const auto &p : get_fsmap().filesystems) {
1515 const auto &fscid = p.first;
7c673cae
FG
1516 std::ostringstream oss;
1517 oss << "mdsmap." << fscid;
1518 types.push_back(oss.str());
1519 }
1520
1521 for (const auto &type : types) {
1522 if (mon->session_map.subs.count(type) == 0)
1523 continue;
1524 xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
1525 while (!p.end()) {
1526 Subscription *sub = *p;
1527 ++p;
1528 check_sub(sub);
1529 }
1530 }
1531}
1532
1533
1534void MDSMonitor::check_sub(Subscription *sub)
1535{
1536 dout(20) << __func__ << ": " << sub->type << dendl;
1537
28e407b8
AA
1538 const auto &fsmap = get_fsmap();
1539
7c673cae
FG
1540 if (sub->type == "fsmap") {
1541 if (sub->next <= fsmap.get_epoch()) {
1542 sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
1543 if (sub->onetime) {
1544 mon->session_map.remove_sub(sub);
1545 } else {
1546 sub->next = fsmap.get_epoch() + 1;
1547 }
1548 }
1549 } else if (sub->type == "fsmap.user") {
1550 if (sub->next <= fsmap.get_epoch()) {
1551 FSMapUser fsmap_u;
1552 fsmap_u.epoch = fsmap.get_epoch();
1553 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
28e407b8
AA
1554 for (const auto &p : fsmap.filesystems) {
1555 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
1556 fs_info.cid = p.second->fscid;
1557 fs_info.name = p.second->mds_map.fs_name;
7c673cae
FG
1558 }
1559 sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
1560 if (sub->onetime) {
1561 mon->session_map.remove_sub(sub);
1562 } else {
1563 sub->next = fsmap.get_epoch() + 1;
1564 }
1565 }
1566 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
1567 if (sub->next > fsmap.get_epoch()) {
1568 return;
1569 }
1570
11fdf7f2 1571 const bool is_mds = sub->session->name.is_mds();
7c673cae
FG
1572 mds_gid_t mds_gid = MDS_GID_NONE;
1573 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1574 if (is_mds) {
1575 // What (if any) namespace are you assigned to?
1576 auto mds_info = fsmap.get_mds_info();
1adf2230 1577 for (const auto &p : mds_info) {
11fdf7f2 1578 if (p.second.addrs == sub->session->addrs) {
1adf2230 1579 mds_gid = p.first;
7c673cae
FG
1580 fscid = fsmap.mds_roles.at(mds_gid);
1581 }
1582 }
1583 } else {
1584 // You're a client. Did you request a particular
1585 // namespace?
11fdf7f2 1586 if (sub->type.compare(0, 7, "mdsmap.") == 0) {
7c673cae
FG
1587 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1588 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1589 std::string err;
1590 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1591 if (!err.empty()) {
1592 // Client asked for a non-existent namespace, send them nothing
1593 dout(1) << "Invalid client subscription '" << sub->type
1594 << "'" << dendl;
1595 return;
1596 }
1597 if (fsmap.filesystems.count(fscid) == 0) {
1598 // Client asked for a non-existent namespace, send them nothing
1599 // TODO: something more graceful for when a client has a filesystem
1600 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1601 // flag to MMDSMap?
1602 dout(1) << "Client subscribed to non-existent namespace '" <<
1603 fscid << "'" << dendl;
1604 return;
1605 }
1606 } else {
1607 // Unqualified request for "mdsmap": give it the one marked
1608 // for use by legacy clients.
1609 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1610 fscid = fsmap.legacy_client_fscid;
1611 } else {
1612 dout(1) << "Client subscribed for legacy filesystem but "
1613 "none is configured" << dendl;
1614 return;
1615 }
1616 }
1617 }
1618 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
1619
1620 // Work out the effective latest epoch
28e407b8 1621 const MDSMap *mds_map = nullptr;
7c673cae
FG
1622 MDSMap null_map;
1623 null_map.compat = fsmap.compat;
1624 if (fscid == FS_CLUSTER_ID_NONE) {
1625 // For a client, we should have already dropped out
11fdf7f2 1626 ceph_assert(is_mds);
7c673cae 1627
28e407b8
AA
1628 auto it = fsmap.standby_daemons.find(mds_gid);
1629 if (it != fsmap.standby_daemons.end()) {
7c673cae 1630 // For an MDS, we need to feed it an MDSMap with its own state in
28e407b8
AA
1631 null_map.mds_info[mds_gid] = it->second;
1632 null_map.epoch = fsmap.standby_epochs.at(mds_gid);
7c673cae
FG
1633 } else {
1634 null_map.epoch = fsmap.epoch;
1635 }
1636 mds_map = &null_map;
1637 } else {
1638 // Check the effective epoch
28e407b8 1639 mds_map = &fsmap.get_filesystem(fscid)->mds_map;
7c673cae
FG
1640 }
1641
11fdf7f2 1642 ceph_assert(mds_map != nullptr);
7c673cae
FG
1643 dout(10) << __func__ << " selected MDS map epoch " <<
1644 mds_map->epoch << " for namespace " << fscid << " for subscriber "
11fdf7f2 1645 << sub->session->name << " who wants epoch " << sub->next << dendl;
7c673cae
FG
1646
1647 if (sub->next > mds_map->epoch) {
1648 return;
1649 }
11fdf7f2 1650 auto msg = MMDSMap::create(mon->monmap->fsid, *mds_map);
7c673cae 1651
11fdf7f2 1652 sub->session->con->send_message(msg.detach());
7c673cae
FG
1653 if (sub->onetime) {
1654 mon->session_map.remove_sub(sub);
1655 } else {
1656 sub->next = mds_map->get_epoch() + 1;
1657 }
1658 }
1659}
1660
1661
1662void MDSMonitor::update_metadata(mds_gid_t gid,
1663 const map<string, string>& metadata)
1664{
1665 if (metadata.empty()) {
1666 return;
1667 }
1668 pending_metadata[gid] = metadata;
1669
1670 MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
1671 bufferlist bl;
11fdf7f2 1672 encode(pending_metadata, bl);
7c673cae
FG
1673 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1674 paxos->trigger_propose();
1675}
1676
1adf2230 1677void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
7c673cae
FG
1678{
1679 bool update = false;
1adf2230
AA
1680 for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) {
1681 if (!fsmap.gid_exists(it->first)) {
1682 it = pending_metadata.erase(it);
7c673cae
FG
1683 update = true;
1684 } else {
1adf2230 1685 ++it;
7c673cae
FG
1686 }
1687 }
1688 if (!update)
1689 return;
1690 bufferlist bl;
11fdf7f2 1691 encode(pending_metadata, bl);
7c673cae
FG
1692 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1693}
1694
1695int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1696{
1697 bufferlist bl;
1698 int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1699 if (r) {
11fdf7f2 1700 dout(5) << "Unable to load 'last_metadata'" << dendl;
7c673cae
FG
1701 return r;
1702 }
1703
11fdf7f2
TL
1704 auto it = bl.cbegin();
1705 ceph::decode(m, it);
7c673cae
FG
1706 return 0;
1707}
1708
1adf2230 1709void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out)
31f18b77 1710{
31f18b77
FG
1711 map<mds_gid_t,Metadata> meta;
1712 load_metadata(meta);
1713 for (auto& p : meta) {
1714 auto q = p.second.find(field);
1715 if (q == p.second.end()) {
c07f9fc5 1716 (*out)["unknown"]++;
31f18b77 1717 } else {
c07f9fc5 1718 (*out)[q->second]++;
31f18b77
FG
1719 }
1720 }
c07f9fc5
FG
1721}
1722
1adf2230 1723void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
c07f9fc5
FG
1724{
1725 map<string,int> by_val;
1726 count_metadata(field, &by_val);
31f18b77
FG
1727 f->open_object_section(field.c_str());
1728 for (auto& p : by_val) {
1729 f->dump_int(p.first.c_str(), p.second);
1730 }
1731 f->close_section();
1732}
1733
1adf2230
AA
1734int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
1735 Formatter *f, ostream& err)
7c673cae 1736{
11fdf7f2 1737 ceph_assert(f);
7c673cae 1738
1adf2230 1739 mds_gid_t gid = gid_from_arg(fsmap, who, err);
7c673cae
FG
1740 if (gid == MDS_GID_NONE) {
1741 return -EINVAL;
1742 }
1743
1744 map<mds_gid_t, Metadata> metadata;
1745 if (int r = load_metadata(metadata)) {
1746 err << "Unable to load 'last_metadata'";
1747 return r;
1748 }
1749
1750 if (!metadata.count(gid)) {
1751 return -ENOENT;
1752 }
1753 const Metadata& m = metadata[gid];
1754 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1755 f->dump_string(p->first.c_str(), p->second);
1756 }
1757 return 0;
1758}
1759
1760int MDSMonitor::print_nodes(Formatter *f)
1761{
11fdf7f2 1762 ceph_assert(f);
7c673cae 1763
1adf2230
AA
1764 const auto &fsmap = get_fsmap();
1765
7c673cae
FG
1766 map<mds_gid_t, Metadata> metadata;
1767 if (int r = load_metadata(metadata)) {
1768 return r;
1769 }
1770
11fdf7f2 1771 map<string, list<string> > mdses; // hostname => mds
1adf2230
AA
1772 for (const auto &p : metadata) {
1773 const mds_gid_t& gid = p.first;
1774 const Metadata& m = p.second;
7c673cae
FG
1775 Metadata::const_iterator hostname = m.find("hostname");
1776 if (hostname == m.end()) {
1777 // not likely though
1778 continue;
1779 }
1adf2230 1780 if (!fsmap.gid_exists(gid)) {
7c673cae
FG
1781 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1782 continue;
1783 }
1adf2230 1784 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
11fdf7f2 1785 mdses[hostname->second].push_back(mds_info.name);
7c673cae
FG
1786 }
1787
1788 dump_services(f, mdses, "mds");
1789 return 0;
1790}
1791
1792/**
1793 * If a cluster is undersized (with respect to max_mds), then
11fdf7f2
TL
1794 * attempt to find daemons to grow it. If the cluster is oversized
1795 * (with respect to max_mds) then shrink it by stopping its highest rank.
7c673cae 1796 */
11fdf7f2 1797bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
7c673cae 1798{
11fdf7f2
TL
1799 auto &current_mds_map = get_fsmap().get_filesystem(fscid)->mds_map;
1800 auto&& fs = fsmap.get_filesystem(fscid);
1adf2230 1801 auto &mds_map = fs->mds_map;
7c673cae 1802
1adf2230
AA
1803 int in = mds_map.get_num_in_mds();
1804 int max = mds_map.get_max_mds();
1805
1806 dout(20) << __func__ << " in " << in << " max " << max << dendl;
1807
11fdf7f2
TL
1808 /* Check that both the current epoch mds_map is resizeable as well as the
1809 * current batch of changes in pending. This is important if an MDS is
1810 * becoming active in the next epoch.
1811 */
1812 if (!current_mds_map.is_resizeable() ||
1813 !mds_map.is_resizeable()) {
1814 dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
1815 return false;
1816 }
1817
1818 if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
7c673cae
FG
1819 mds_rank_t mds = mds_rank_t(0);
1820 string name;
1adf2230 1821 while (mds_map.is_in(mds)) {
7c673cae
FG
1822 mds++;
1823 }
11fdf7f2 1824 auto&& newgid = fsmap.find_replacement_for({fscid, mds}, name);
7c673cae 1825 if (newgid == MDS_GID_NONE) {
1adf2230 1826 return false;
7c673cae
FG
1827 }
1828
1adf2230 1829 const auto &new_info = fsmap.get_info_gid(newgid);
11fdf7f2 1830 dout(1) << "assigned standby " << new_info.addrs
7c673cae 1831 << " as mds." << mds << dendl;
d2e6a577
FG
1832
1833 mon->clog->info() << new_info.human_name() << " assigned to "
1adf2230
AA
1834 "filesystem " << mds_map.fs_name << " as rank "
1835 << mds << " (now has " << mds_map.get_num_in_mds() + 1
d2e6a577 1836 << " ranks)";
11fdf7f2 1837 fsmap.promote(newgid, *fs, mds);
1adf2230 1838 return true;
11fdf7f2
TL
1839 } else if (in > max) {
1840 mds_rank_t target = in - 1;
1841 const auto &info = mds_map.get_info(target);
1842 if (mds_map.is_active(target)) {
1843 dout(1) << "stopping " << target << dendl;
1844 mon->clog->info() << "stopping " << info.human_name();
1845 auto f = [](auto& info) {
1846 info.state = MDSMap::STATE_STOPPING;
1847 };
1848 fsmap.modify_daemon(info.global_id, f);
1849 return true;
1850 } else {
1851 dout(20) << "skipping stop of " << target << dendl;
1852 return false;
1853 }
7c673cae
FG
1854 }
1855
1adf2230 1856 return false;
7c673cae
FG
1857}
1858
1859
1860/**
1861 * If a daemon is laggy, and a suitable replacement
1862 * is available, fail this daemon (remove from map) and pass its
1863 * role to another daemon.
1864 */
1adf2230
AA
1865void MDSMonitor::maybe_replace_gid(FSMap &fsmap, mds_gid_t gid,
1866 const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose)
7c673cae 1867{
11fdf7f2
TL
1868 ceph_assert(mds_propose != nullptr);
1869 ceph_assert(osd_propose != nullptr);
7c673cae 1870
1adf2230 1871 const auto fscid = fsmap.mds_roles.at(gid);
7c673cae 1872
31f18b77
FG
1873 // We will only take decisive action (replacing/removing a daemon)
1874 // if we have some indicating that some other daemon(s) are successfully
1875 // getting beacons through recently.
1adf2230
AA
1876 mono_time latest_beacon = mono_clock::zero();
1877 for (const auto &p : last_beacon) {
1878 latest_beacon = std::max(p.second.stamp, latest_beacon);
31f18b77 1879 }
1adf2230
AA
1880 mono_time now = mono_clock::now();
1881 chrono::duration<double> since = now-latest_beacon;
11fdf7f2 1882 const bool frozen = info.is_frozen();
1adf2230 1883 const bool may_replace = since.count() <
11fdf7f2 1884 std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5);
31f18b77 1885
7c673cae
FG
1886 // are we in?
1887 // and is there a non-laggy standby that can take over for us?
1888 mds_gid_t sgid;
1889 if (info.rank >= 0 &&
1890 info.state != MDSMap::STATE_STANDBY &&
1891 info.state != MDSMap::STATE_STANDBY_REPLAY &&
31f18b77 1892 may_replace &&
11fdf7f2
TL
1893 !frozen &&
1894 !fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
1895 (sgid = fsmap.find_replacement_for({fscid, info.rank}, info.name)) != MDS_GID_NONE)
7c673cae
FG
1896 {
1897
1adf2230 1898 MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
11fdf7f2
TL
1899 dout(1) << " replacing " << gid << " " << info.addrs
1900 << " mds." << info.rank << "." << info.inc
1901 << " " << ceph_mds_state_name(info.state)
1902 << " with " << sgid << "/" << si.name << " " << si.addrs
1903 << dendl;
7c673cae 1904
d2e6a577 1905 mon->clog->warn() << info.human_name()
31f18b77
FG
1906 << " is not responding, replacing it "
1907 << "as rank " << info.rank
d2e6a577 1908 << " with standby " << si.human_name();
31f18b77 1909
7c673cae 1910 // Remember what NS the old one was in
1adf2230 1911 const fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
7c673cae
FG
1912
1913 // Remove the old one
1adf2230 1914 *osd_propose |= fail_mds_gid(fsmap, gid);
7c673cae
FG
1915
1916 // Promote the replacement
11fdf7f2
TL
1917 auto&& fs = fsmap.filesystems.at(fscid);
1918 fsmap.promote(sgid, *fs, info.rank);
7c673cae
FG
1919
1920 *mds_propose = true;
31f18b77 1921 } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
11fdf7f2
TL
1922 info.state == MDSMap::STATE_STANDBY) && may_replace && !frozen) {
1923 dout(1) << " failing and removing " << gid << " " << info.addrs
1924 << " mds." << info.rank
1925 << "." << info.inc << " " << ceph_mds_state_name(info.state)
1926 << dendl;
d2e6a577
FG
1927 mon->clog->info() << "Standby " << info.human_name() << " is not "
1928 "responding, dropping it";
1adf2230 1929 fail_mds_gid(fsmap, gid);
7c673cae
FG
1930 *mds_propose = true;
1931 } else if (!info.laggy()) {
11fdf7f2
TL
1932 dout(1) << " marking " << gid << " " << info.addrs
1933 << " mds." << info.rank << "." << info.inc
1934 << " " << ceph_mds_state_name(info.state)
1935 << " laggy" << dendl;
1936 fsmap.modify_daemon(info.global_id, [](auto& info) {
1937 info.laggy_since = ceph_clock_now();
7c673cae
FG
1938 });
1939 *mds_propose = true;
1940 }
1941}
1942
11fdf7f2 1943bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
7c673cae 1944{
11fdf7f2
TL
1945 if (fs.mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
1946 return false;
1947 }
7c673cae
FG
1948
1949 bool do_propose = false;
1950
1951 // have a standby take over?
1952 set<mds_rank_t> failed;
11fdf7f2
TL
1953 fs.mds_map.get_failed_mds_set(failed);
1954 for (const auto& rank : failed) {
1955 auto&& sgid = fsmap.find_replacement_for({fs.fscid, rank}, {});
1956 if (sgid) {
1957 auto&& info = fsmap.get_info_gid(sgid);
1958 dout(1) << " taking over failed mds." << rank << " with " << sgid
1959 << "/" << info.name << " " << info.addrs << dendl;
1960 mon->clog->info() << "Standby " << info.human_name()
1961 << " assigned to filesystem " << fs.mds_map.fs_name
1962 << " as rank " << rank;
1963
1964 fsmap.promote(sgid, fs, rank);
1965 do_propose = true;
7c673cae 1966 }
11fdf7f2
TL
1967 }
1968
1969 if (fs.mds_map.allows_standby_replay() && !fs.mds_map.is_degraded()) {
7c673cae 1970 // There were no failures to replace, so try using any available standbys
a8e16298
TL
1971 // as standby-replay daemons. Don't do this when the cluster is degraded
1972 // as a standby-replay daemon may try to read a journal being migrated.
11fdf7f2
TL
1973 for (;;) {
1974 auto standby_gid = fsmap.get_available_standby();
1975 if (standby_gid == MDS_GID_NONE) break;
1976 dout(20) << "standby available mds." << standby_gid << dendl;
1977 bool changed = false;
1978 for (const auto& rank : fs.mds_map.in) {
1979 dout(20) << "exmaining " << rank << dendl;
1980 if (fs.mds_map.is_followable(rank)) {
1981 dout(1) << " setting mds." << standby_gid
1982 << " to follow mds rank " << rank << dendl;
1983 fsmap.assign_standby_replay(standby_gid, fs.fscid, rank);
1984 do_propose = true;
1985 changed = true;
1986 break;
7c673cae 1987 }
7c673cae 1988 }
11fdf7f2 1989 if (!changed) break;
7c673cae
FG
1990 }
1991 }
1992
1993 return do_propose;
1994}
1995
1996void MDSMonitor::tick()
1997{
1998 // make sure mds's are still alive
1999 // ...if i am an active leader
28e407b8 2000
1adf2230 2001 if (!is_active() || !is_leader()) return;
28e407b8
AA
2002
2003 auto &pending = get_pending_fsmap_writeable();
7c673cae 2004
28e407b8 2005 bool do_propose = false;
7c673cae 2006
28e407b8 2007 do_propose |= pending.check_health();
7c673cae 2008
11fdf7f2 2009 // resize mds cluster (adjust @in)?
28e407b8 2010 for (auto &p : pending.filesystems) {
11fdf7f2 2011 do_propose |= maybe_resize_cluster(pending, p.second->fscid);
7c673cae
FG
2012 }
2013
1adf2230 2014 mono_time now = mono_clock::now();
11fdf7f2 2015 if (mono_clock::is_zero(last_tick)) {
7c673cae
FG
2016 last_tick = now;
2017 }
1adf2230 2018 chrono::duration<double> since_last = now-last_tick;
7c673cae 2019
1adf2230 2020 if (since_last.count() >
11fdf7f2 2021 (g_conf()->mds_beacon_grace - g_conf()->mds_beacon_interval)) {
7c673cae
FG
2022 // This case handles either local slowness (calls being delayed
2023 // for whatever reason) or cluster election slowness (a long gap
2024 // between calls while an election happened)
91327a77 2025 dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
7c673cae 2026 "(slow election?) of " << now - last_tick << " seconds" << dendl;
1adf2230
AA
2027 for (auto &p : last_beacon) {
2028 p.second.stamp = now;
7c673cae
FG
2029 }
2030 }
2031
2032 last_tick = now;
2033
7c673cae 2034 // make sure last_beacon is fully populated
28e407b8 2035 for (auto &p : pending.mds_roles) {
7c673cae 2036 auto &gid = p.first;
1adf2230
AA
2037 last_beacon.emplace(std::piecewise_construct,
2038 std::forward_as_tuple(gid),
2039 std::forward_as_tuple(mono_clock::now(), 0));
7c673cae
FG
2040 }
2041
1adf2230
AA
2042
2043 // check beacon timestamps
c07f9fc5
FG
2044 bool propose_osdmap = false;
2045 bool osdmap_writeable = mon->osdmon()->is_writeable();
1adf2230
AA
2046 for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
2047 mds_gid_t gid = it->first;
2048 auto beacon_info = it->second;
2049 chrono::duration<double> since_last = now-beacon_info.stamp;
7c673cae 2050
28e407b8 2051 if (!pending.gid_exists(gid)) {
c07f9fc5 2052 // clean it out
1adf2230 2053 it = last_beacon.erase(it);
c07f9fc5 2054 continue;
7c673cae
FG
2055 }
2056
1adf2230 2057
11fdf7f2 2058 if (since_last.count() >= g_conf()->mds_beacon_grace) {
28e407b8 2059 auto &info = pending.get_info_gid(gid);
c07f9fc5 2060 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
11fdf7f2 2061 << " (gid: " << gid << " addr: " << info.addrs
c07f9fc5 2062 << " state: " << ceph_mds_state_name(info.state) << ")"
1adf2230 2063 << " since " << since_last.count() << "s" << dendl;
c07f9fc5
FG
2064 // If the OSDMap is writeable, we can blacklist things, so we can
2065 // try failing any laggy MDS daemons. Consider each one for failure.
2066 if (osdmap_writeable) {
1adf2230 2067 maybe_replace_gid(pending, gid, info, &do_propose, &propose_osdmap);
c07f9fc5 2068 }
7c673cae 2069 }
1adf2230
AA
2070
2071 ++it;
7c673cae 2072 }
c07f9fc5
FG
2073 if (propose_osdmap) {
2074 request_proposal(mon->osdmon());
2075 }
7c673cae 2076
28e407b8 2077 for (auto &p : pending.filesystems) {
11fdf7f2 2078 do_propose |= maybe_promote_standby(pending, *p.second);
7c673cae
FG
2079 }
2080
2081 if (do_propose) {
2082 propose_pending();
2083 }
2084}
2085
7c673cae
FG
2086MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
2087 : PaxosService(mn, p, service_name)
2088{
c07f9fc5 2089 handlers = FileSystemCommandHandler::load(p);
7c673cae
FG
2090}
2091
2092void MDSMonitor::on_restart()
2093{
2094 // Clear out the leader-specific state.
1adf2230 2095 last_tick = mono_clock::now();
7c673cae
FG
2096 last_beacon.clear();
2097}
2098