]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/MDSMonitor.cc
bump version to 15.2.4-pve1
[ceph.git] / ceph / src / mon / MDSMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
11fdf7f2 15#include <regex>
7c673cae
FG
16#include <sstream>
17#include <boost/utility.hpp>
18
19#include "MDSMonitor.h"
20#include "FSCommands.h"
21#include "Monitor.h"
22#include "MonitorDBStore.h"
23#include "OSDMonitor.h"
7c673cae
FG
24
25#include "common/strtol.h"
26#include "common/perf_counters.h"
27#include "common/config.h"
28#include "common/cmdparse.h"
29#include "messages/MMDSMap.h"
30#include "messages/MFSMap.h"
31#include "messages/MFSMapUser.h"
32#include "messages/MMDSLoadTargets.h"
33#include "messages/MMonCommand.h"
34#include "messages/MGenericMessage.h"
35
11fdf7f2 36#include "include/ceph_assert.h"
7c673cae
FG
37#include "include/str_list.h"
38#include "include/stringify.h"
39#include "mds/mdstypes.h"
40#include "Session.h"
41
42#define dout_subsys ceph_subsys_mon
43#undef dout_prefix
28e407b8 44#define dout_prefix _prefix(_dout, mon, get_fsmap())
9f95a23c
TL
45using namespace TOPNSPC::common;
46
28e407b8 47static ostream& _prefix(std::ostream *_dout, Monitor *mon, const FSMap& fsmap) {
7c673cae
FG
48 return *_dout << "mon." << mon->name << "@" << mon->rank
49 << "(" << mon->get_state_name()
50 << ").mds e" << fsmap.get_epoch() << " ";
51}
52
3efd9988
FG
53static const string MDS_METADATA_PREFIX("mds_metadata");
54static const string MDS_HEALTH_PREFIX("mds_health");
55
56
7c673cae
FG
57/*
58 * Specialized implementation of cmd_getval to allow us to parse
59 * out strongly-typedef'd types
60 */
9f95a23c
TL
61namespace TOPNSPC::common {
62template<> bool cmd_getval(const cmdmap_t& cmdmap,
31f18b77 63 const std::string& k, mds_gid_t &val)
7c673cae 64{
9f95a23c 65 return cmd_getval(cmdmap, k, (int64_t&)val);
7c673cae
FG
66}
67
9f95a23c 68template<> bool cmd_getval(const cmdmap_t& cmdmap,
31f18b77 69 const std::string& k, mds_rank_t &val)
7c673cae 70{
9f95a23c 71 return cmd_getval(cmdmap, k, (int64_t&)val);
7c673cae
FG
72}
73
9f95a23c 74template<> bool cmd_getval(const cmdmap_t& cmdmap,
31f18b77 75 const std::string& k, MDSMap::DaemonState &val)
7c673cae 76{
9f95a23c
TL
77 return cmd_getval(cmdmap, k, (int64_t&)val);
78}
7c673cae 79}
7c673cae
FG
80// my methods
81
11fdf7f2
TL
82template <int dblV>
83void MDSMonitor::print_map(const FSMap& m)
7c673cae 84{
11fdf7f2 85 dout(dblV) << "print_map\n";
7c673cae
FG
86 m.print(*_dout);
87 *_dout << dendl;
88}
89
90// service methods
91void MDSMonitor::create_initial()
92{
93 dout(10) << "create_initial" << dendl;
94}
95
11fdf7f2 96void MDSMonitor::get_store_prefixes(std::set<string>& s) const
3efd9988
FG
97{
98 s.insert(service_name);
99 s.insert(MDS_METADATA_PREFIX);
100 s.insert(MDS_HEALTH_PREFIX);
101}
7c673cae
FG
102
103void MDSMonitor::update_from_paxos(bool *need_bootstrap)
104{
105 version_t version = get_last_committed();
28e407b8 106 if (version == get_fsmap().epoch)
7c673cae
FG
107 return;
108
109 dout(10) << __func__ << " version " << version
28e407b8 110 << ", my e " << get_fsmap().epoch << dendl;
11fdf7f2 111 ceph_assert(version > get_fsmap().epoch);
7c673cae 112
224ce89b
WB
113 load_health();
114
7c673cae
FG
115 // read and decode
116 bufferlist fsmap_bl;
117 fsmap_bl.clear();
118 int err = get_version(version, fsmap_bl);
11fdf7f2 119 ceph_assert(err == 0);
7c673cae 120
11fdf7f2 121 ceph_assert(fsmap_bl.length() > 0);
7c673cae 122 dout(10) << __func__ << " got " << version << dendl;
28e407b8 123 PaxosFSMap::decode(fsmap_bl);
7c673cae
FG
124
125 // new map
91327a77 126 dout(0) << "new map" << dendl;
11fdf7f2
TL
127 print_map<0>(get_fsmap());
128 if (!g_conf()->mon_mds_skip_sanity) {
28e407b8 129 get_fsmap().sanity();
7c673cae
FG
130 }
131
132 check_subs();
7c673cae
FG
133}
134
135void MDSMonitor::init()
136{
137 (void)load_metadata(pending_metadata);
138}
139
140void MDSMonitor::create_pending()
141{
28e407b8 142 auto &fsmap = PaxosFSMap::create_pending();
7c673cae 143
3efd9988 144 if (mon->osdmon()->is_readable()) {
28e407b8
AA
145 const auto &osdmap = mon->osdmon()->osdmap;
146 fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
3efd9988
FG
147 }
148
28e407b8 149 dout(10) << "create_pending e" << fsmap.epoch << dendl;
7c673cae
FG
150}
151
152void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
153{
28e407b8
AA
154 auto &pending = get_pending_fsmap_writeable();
155 auto &epoch = pending.epoch;
7c673cae 156
28e407b8 157 dout(10) << "encode_pending e" << epoch << dendl;
7c673cae
FG
158
159 // print map iff 'debug mon = 30' or higher
11fdf7f2
TL
160 print_map<30>(pending);
161 if (!g_conf()->mon_mds_skip_sanity) {
28e407b8 162 pending.sanity();
7c673cae
FG
163 }
164
165 // Set 'modified' on maps modified this epoch
28e407b8
AA
166 for (auto &p : pending.filesystems) {
167 if (p.second->mds_map.epoch == epoch) {
168 p.second->mds_map.modified = ceph_clock_now();
7c673cae
FG
169 }
170 }
171
172 // apply to paxos
11fdf7f2 173 ceph_assert(get_last_committed() + 1 == pending.epoch);
28e407b8
AA
174 bufferlist pending_bl;
175 pending.encode(pending_bl, mon->get_quorum_con_features());
7c673cae
FG
176
177 /* put everything in the transaction */
28e407b8
AA
178 put_version(t, pending.epoch, pending_bl);
179 put_last_committed(t, pending.epoch);
7c673cae
FG
180
181 // Encode MDSHealth data
182 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
183 i != pending_daemon_health.end(); ++i) {
184 bufferlist bl;
185 i->second.encode(bl);
186 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
187 }
188
189 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
190 i != pending_daemon_health_rm.end(); ++i) {
191 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
192 }
193 pending_daemon_health_rm.clear();
1adf2230 194 remove_from_metadata(pending, t);
224ce89b
WB
195
196 // health
197 health_check_map_t new_checks;
28e407b8 198 const auto &info_map = pending.get_mds_info();
224ce89b
WB
199 for (const auto &i : info_map) {
200 const auto &gid = i.first;
201 const auto &info = i.second;
202 if (pending_daemon_health_rm.count(gid)) {
203 continue;
204 }
205 MDSHealth health;
206 auto p = pending_daemon_health.find(gid);
207 if (p != pending_daemon_health.end()) {
208 health = p->second;
209 } else {
210 bufferlist bl;
211 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
212 if (!bl.length()) {
213 derr << "Missing health data for MDS " << gid << dendl;
214 continue;
215 }
11fdf7f2 216 auto bl_i = bl.cbegin();
224ce89b
WB
217 health.decode(bl_i);
218 }
219 for (const auto &metric : health.metrics) {
9f95a23c 220 const auto rank = info.rank;
224ce89b
WB
221 health_check_t *check = &new_checks.get_or_add(
222 mds_metric_name(metric.type),
223 metric.sev,
9f95a23c
TL
224 mds_metric_summary(metric.type),
225 1);
224ce89b
WB
226 ostringstream ss;
227 ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
28e407b8
AA
228 bool first = true;
229 for (auto &p : metric.metadata) {
230 if (first) {
231 ss << " ";
232 } else {
224ce89b 233 ss << ", ";
28e407b8
AA
234 }
235 ss << p.first << ": " << p.second;
236 first = false;
224ce89b
WB
237 }
238 check->detail.push_back(ss.str());
239 }
240 }
28e407b8 241 pending.get_health_checks(&new_checks);
224ce89b 242 for (auto& p : new_checks.checks) {
11fdf7f2 243 p.second.summary = std::regex_replace(
224ce89b 244 p.second.summary,
11fdf7f2 245 std::regex("%num%"),
224ce89b 246 stringify(p.second.detail.size()));
11fdf7f2 247 p.second.summary = std::regex_replace(
224ce89b 248 p.second.summary,
11fdf7f2 249 std::regex("%plurals%"),
224ce89b 250 p.second.detail.size() > 1 ? "s" : "");
11fdf7f2 251 p.second.summary = std::regex_replace(
224ce89b 252 p.second.summary,
11fdf7f2 253 std::regex("%isorare%"),
224ce89b 254 p.second.detail.size() > 1 ? "are" : "is");
11fdf7f2 255 p.second.summary = std::regex_replace(
181888fb 256 p.second.summary,
11fdf7f2 257 std::regex("%hasorhave%"),
181888fb 258 p.second.detail.size() > 1 ? "have" : "has");
224ce89b
WB
259 }
260 encode_health(new_checks, t);
7c673cae
FG
261}
262
11fdf7f2 263version_t MDSMonitor::get_trim_to() const
7c673cae
FG
264{
265 version_t floor = 0;
11fdf7f2
TL
266 if (g_conf()->mon_mds_force_trim_to > 0 &&
267 g_conf()->mon_mds_force_trim_to < (int)get_last_committed()) {
268 floor = g_conf()->mon_mds_force_trim_to;
7c673cae
FG
269 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
270 << floor << dendl;
271 }
272
11fdf7f2 273 unsigned max = g_conf()->mon_max_mdsmap_epochs;
7c673cae
FG
274 version_t last = get_last_committed();
275
276 if (last - get_first_committed() > max && floor < last - max)
277 return last - max;
278 return floor;
279}
280
7c673cae
FG
281bool MDSMonitor::preprocess_query(MonOpRequestRef op)
282{
283 op->mark_mdsmon_event(__func__);
9f95a23c 284 auto m = op->get_req<PaxosServiceMessage>();
11fdf7f2
TL
285 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source()
286 << " " << m->get_orig_source_addrs() << dendl;
7c673cae
FG
287
288 switch (m->get_type()) {
289
290 case MSG_MDS_BEACON:
291 return preprocess_beacon(op);
292
293 case MSG_MON_COMMAND:
f64942e4
AA
294 try {
295 return preprocess_command(op);
11fdf7f2 296 } catch (const bad_cmd_get& e) {
f64942e4
AA
297 bufferlist bl;
298 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
299 return true;
300 }
7c673cae
FG
301
302 case MSG_MDS_OFFLOAD_TARGETS:
303 return preprocess_offload_targets(op);
304
305 default:
306 ceph_abort();
307 return true;
308 }
309}
310
311void MDSMonitor::_note_beacon(MMDSBeacon *m)
312{
313 mds_gid_t gid = mds_gid_t(m->get_global_id());
314 version_t seq = m->get_seq();
315
91327a77 316 dout(5) << "_note_beacon " << *m << " noting time" << dendl;
1adf2230
AA
317 auto &beacon = last_beacon[gid];
318 beacon.stamp = mono_clock::now();
319 beacon.seq = seq;
7c673cae
FG
320}
321
322bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
323{
324 op->mark_mdsmon_event(__func__);
9f95a23c 325 auto m = op->get_req<MMDSBeacon>();
7c673cae
FG
326 MDSMap::DaemonState state = m->get_state();
327 mds_gid_t gid = m->get_global_id();
328 version_t seq = m->get_seq();
329 MDSMap::mds_info_t info;
330 epoch_t effective_epoch = 0;
331
1adf2230 332 const auto &fsmap = get_fsmap();
28e407b8 333
7c673cae 334 // check privileges, ignore if fails
11fdf7f2
TL
335 MonSession *session = op->get_session();
336 if (!session)
337 goto ignore;
7c673cae
FG
338 if (!session->is_capable("mds", MON_CAP_X)) {
339 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
340 << session->caps << dendl;
341 goto ignore;
342 }
343
344 if (m->get_fsid() != mon->monmap->fsid) {
345 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
346 goto ignore;
347 }
348
91327a77 349 dout(5) << "preprocess_beacon " << *m
11fdf7f2
TL
350 << " from " << m->get_orig_source()
351 << " " << m->get_orig_source_addrs()
7c673cae
FG
352 << " " << m->get_compat()
353 << dendl;
354
355 // make sure the address has a port
356 if (m->get_orig_source_addr().get_port() == 0) {
357 dout(1) << " ignoring boot message without a port" << dendl;
358 goto ignore;
359 }
360
361 // check compat
362 if (!m->get_compat().writeable(fsmap.compat)) {
11fdf7f2
TL
363 dout(1) << " mds " << m->get_orig_source()
364 << " " << m->get_orig_source_addrs()
365 << " can't write to fsmap " << fsmap.compat << dendl;
7c673cae
FG
366 goto ignore;
367 }
368
369 // fw to leader?
28e407b8 370 if (!is_leader())
7c673cae
FG
371 return false;
372
373 // booted, but not in map?
28e407b8 374 if (!fsmap.gid_exists(gid)) {
7c673cae
FG
375 if (state != MDSMap::STATE_BOOT) {
376 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
377 << ceph_mds_state_name(state) << ")" << dendl;
378
1adf2230
AA
379 /* We can't send an MDSMap this MDS was a part of because we no longer
380 * know which FS it was part of. Nor does this matter. Sending an empty
381 * MDSMap is sufficient for getting the MDS to respawn.
382 */
7c673cae
FG
383 MDSMap null_map;
384 null_map.epoch = fsmap.epoch;
385 null_map.compat = fsmap.compat;
9f95a23c 386 auto m = make_message<MMDSMap>(mon->monmap->fsid, null_map);
11fdf7f2 387 mon->send_reply(op, m.detach());
7c673cae
FG
388 return true;
389 } else {
390 return false; // not booted yet.
391 }
392 }
393 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
28e407b8 394 info = fsmap.get_info_gid(gid);
7c673cae
FG
395
396 // old seq?
397 if (info.state_seq > seq) {
398 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
399 goto ignore;
400 }
401
402 // Work out the latest epoch that this daemon should have seen
403 {
28e407b8 404 fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
7c673cae 405 if (fscid == FS_CLUSTER_ID_NONE) {
28e407b8 406 effective_epoch = fsmap.standby_epochs.at(gid);
7c673cae 407 } else {
28e407b8 408 effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
7c673cae
FG
409 }
410 if (effective_epoch != m->get_last_epoch_seen()) {
411 dout(10) << "mds_beacon " << *m
412 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
413 goto reply;
414 }
415 }
416
417 if (info.laggy()) {
418 _note_beacon(m);
419 return false; // no longer laggy, need to update map.
420 }
421 if (state == MDSMap::STATE_BOOT) {
422 // ignore, already booted.
423 goto ignore;
424 }
9f95a23c
TL
425
426 // did the join_fscid change
427 if (m->get_fs().size()) {
428 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
429 auto f = fsmap.get_filesystem(m->get_fs());
430 if (f) {
431 fscid = f->fscid;
432 }
433 if (info.join_fscid != fscid) {
434 dout(10) << __func__ << " standby mds_join_fs changed to " << fscid
435 << " (" << m->get_fs() << ")" << dendl;
436 _note_beacon(m);
437 return false;
438 }
439 } else {
440 if (info.join_fscid != FS_CLUSTER_ID_NONE) {
441 dout(10) << __func__ << " standby mds_join_fs was cleared" << dendl;
442 _note_beacon(m);
443 return false;
444 }
445 }
446
7c673cae
FG
447 // is there a state change here?
448 if (info.state != state) {
449 // legal state change?
450 if ((info.state == MDSMap::STATE_STANDBY ||
451 info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
452 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
453 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
454 goto reply;
455 }
456
457 if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
458 && info.rank != MDS_RANK_NONE)
459 {
460 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
461 "held rank " << info.rank << " while requesting state "
462 << ceph_mds_state_name(state) << dendl;
463 goto reply;
464 }
465
466 _note_beacon(m);
467 return false;
468 }
469
470 // Comparing known daemon health with m->get_health()
471 // and return false (i.e. require proposal) if they
472 // do not match, to update our stored
473 if (!(pending_daemon_health[gid] == m->get_health())) {
91327a77 474 dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
7c673cae
FG
475 _note_beacon(m);
476 return false;
477 }
478
479 reply:
480 // note time and reply
11fdf7f2 481 ceph_assert(effective_epoch > 0);
7c673cae 482 _note_beacon(m);
11fdf7f2 483 {
9f95a23c 484 auto beacon = make_message<MMDSBeacon>(mon->monmap->fsid,
11fdf7f2
TL
485 m->get_global_id(), m->get_name(), effective_epoch,
486 state, seq, CEPH_FEATURES_SUPPORTED_DEFAULT);
487 mon->send_reply(op, beacon.detach());
488 }
7c673cae
FG
489 return true;
490
491 ignore:
492 // I won't reply this beacon, drop it.
493 mon->no_reply(op);
494 return true;
495}
496
497bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
498{
499 op->mark_mdsmon_event(__func__);
9f95a23c 500 auto m = op->get_req<MMDSLoadTargets>();
7c673cae 501 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
28e407b8 502
1adf2230 503 const auto &fsmap = get_fsmap();
7c673cae
FG
504
505 // check privileges, ignore message if fails
11fdf7f2 506 MonSession *session = op->get_session();
7c673cae 507 if (!session)
1adf2230 508 goto ignore;
7c673cae
FG
509 if (!session->is_capable("mds", MON_CAP_X)) {
510 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
511 << session->caps << dendl;
1adf2230 512 goto ignore;
7c673cae
FG
513 }
514
515 if (fsmap.gid_exists(m->global_id) &&
516 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
1adf2230 517 goto ignore;
7c673cae
FG
518
519 return false;
520
1adf2230
AA
521 ignore:
522 mon->no_reply(op);
7c673cae
FG
523 return true;
524}
525
526
527bool MDSMonitor::prepare_update(MonOpRequestRef op)
528{
529 op->mark_mdsmon_event(__func__);
9f95a23c 530 auto m = op->get_req<PaxosServiceMessage>();
7c673cae
FG
531 dout(7) << "prepare_update " << *m << dendl;
532
533 switch (m->get_type()) {
534
535 case MSG_MDS_BEACON:
536 return prepare_beacon(op);
537
538 case MSG_MON_COMMAND:
f64942e4
AA
539 try {
540 return prepare_command(op);
11fdf7f2 541 } catch (const bad_cmd_get& e) {
f64942e4
AA
542 bufferlist bl;
543 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
544 return true;
545 }
7c673cae
FG
546
547 case MSG_MDS_OFFLOAD_TARGETS:
548 return prepare_offload_targets(op);
549
550 default:
551 ceph_abort();
552 }
553
554 return true;
555}
556
557bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
558{
559 op->mark_mdsmon_event(__func__);
9f95a23c 560 auto m = op->get_req<MMDSBeacon>();
7c673cae 561 // -- this is an update --
11fdf7f2
TL
562 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source()
563 << " " << m->get_orig_source_addrs() << dendl;
564 entity_addrvec_t addrs = m->get_orig_source_addrs();
7c673cae
FG
565 mds_gid_t gid = m->get_global_id();
566 MDSMap::DaemonState state = m->get_state();
567 version_t seq = m->get_seq();
568
28e407b8
AA
569 auto &pending = get_pending_fsmap_writeable();
570
91327a77 571 dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
7c673cae
FG
572
573 // Calculate deltas of health metrics created and removed
574 // Do this by type rather than MDSHealthMetric equality, because messages can
575 // change a lot when they include e.g. a number of items.
576 const auto &old_health = pending_daemon_health[gid].metrics;
577 const auto &new_health = m->get_health().metrics;
578
579 std::set<mds_metric_t> old_types;
580 for (const auto &i : old_health) {
581 old_types.insert(i.type);
582 }
583
584 std::set<mds_metric_t> new_types;
585 for (const auto &i : new_health) {
586 new_types.insert(i.type);
587 }
588
589 for (const auto &new_metric: new_health) {
590 if (old_types.count(new_metric.type) == 0) {
11fdf7f2 591 dout(10) << "MDS health message (" << m->get_orig_source()
28e407b8 592 << "): " << new_metric.sev << " " << new_metric.message << dendl;
7c673cae
FG
593 }
594 }
595
596 // Log the disappearance of health messages at INFO
597 for (const auto &old_metric : old_health) {
598 if (new_types.count(old_metric.type) == 0) {
599 mon->clog->info() << "MDS health message cleared ("
11fdf7f2 600 << m->get_orig_source() << "): " << old_metric.message;
7c673cae
FG
601 }
602 }
603
604 // Store health
605 pending_daemon_health[gid] = m->get_health();
606
607 // boot?
608 if (state == MDSMap::STATE_BOOT) {
609 // zap previous instance of this name?
11fdf7f2 610 if (g_conf()->mds_enforce_unique_name) {
7c673cae 611 bool failed_mds = false;
28e407b8 612 while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
7c673cae
FG
613 if (!mon->osdmon()->is_writeable()) {
614 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
615 return false;
616 }
d2e6a577 617 const MDSMap::mds_info_t &existing_info =
28e407b8 618 pending.get_info_gid(existing);
d2e6a577 619 mon->clog->info() << existing_info.human_name() << " restarted";
1adf2230 620 fail_mds_gid(pending, existing);
7c673cae
FG
621 failed_mds = true;
622 }
623 if (failed_mds) {
11fdf7f2 624 ceph_assert(mon->osdmon()->is_writeable());
7c673cae
FG
625 request_proposal(mon->osdmon());
626 }
627 }
628
629 // Add this daemon to the map
28e407b8 630 if (pending.mds_roles.count(gid) == 0) {
7c673cae
FG
631 MDSMap::mds_info_t new_info;
632 new_info.global_id = gid;
633 new_info.name = m->get_name();
11fdf7f2 634 new_info.addrs = addrs;
7c673cae
FG
635 new_info.mds_features = m->get_mds_features();
636 new_info.state = MDSMap::STATE_STANDBY;
637 new_info.state_seq = seq;
28e407b8 638 pending.insert(new_info);
9f95a23c
TL
639 if (m->get_fs().size()) {
640 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
641 auto f = pending.get_filesystem(m->get_fs());
642 if (f) {
643 fscid = f->fscid;
644 }
645 new_info.join_fscid = fscid;
646 }
7c673cae
FG
647 }
648
7c673cae 649 // initialize the beacon timer
1adf2230
AA
650 auto &beacon = last_beacon[gid];
651 beacon.stamp = mono_clock::now();
652 beacon.seq = seq;
7c673cae
FG
653
654 // new incompat?
28e407b8
AA
655 if (!pending.compat.writeable(m->get_compat())) {
656 dout(10) << " fsmap " << pending.compat
7c673cae
FG
657 << " can't write to new mds' " << m->get_compat()
658 << ", updating fsmap and killing old mds's"
659 << dendl;
28e407b8 660 pending.update_compat(m->get_compat());
7c673cae
FG
661 }
662
663 update_metadata(m->get_global_id(), m->get_sys_info());
664 } else {
665 // state update
91327a77
AA
666
667 if (!pending.gid_exists(gid)) {
668 /* gid has been removed from pending, send null map */
669 dout(5) << "mds_beacon " << *m << " is not in fsmap (state "
670 << ceph_mds_state_name(state) << ")" << dendl;
671
672 /* We can't send an MDSMap this MDS was a part of because we no longer
673 * know which FS it was part of. Nor does this matter. Sending an empty
674 * MDSMap is sufficient for getting the MDS to respawn.
675 */
9f95a23c 676 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
91327a77
AA
677 if (r >= 0) {
678 const auto& fsmap = get_fsmap();
679 MDSMap null_map;
680 null_map.epoch = fsmap.epoch;
681 null_map.compat = fsmap.compat;
9f95a23c 682 auto m = make_message<MMDSMap>(mon->monmap->fsid, null_map);
11fdf7f2 683 mon->send_reply(op, m.detach());
91327a77
AA
684 } else {
685 dispatch(op); // try again
686 }
687 }));
688 return true;
689 }
690
11fdf7f2 691 const auto& info = pending.get_info_gid(gid);
f64942e4
AA
692 if (info.state == MDSMap::STATE_STOPPING &&
693 state != MDSMap::STATE_STOPPING &&
694 state != MDSMap::STATE_STOPPED) {
7c673cae
FG
695 // we can't transition to any other states from STOPPING
696 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
697 << dendl;
698 _note_beacon(m);
699 return true;
700 }
701
702 if (info.laggy()) {
11fdf7f2
TL
703 dout(1) << "prepare_beacon clearing laggy flag on " << addrs << dendl;
704 pending.modify_daemon(info.global_id, [](auto& info)
7c673cae 705 {
11fdf7f2 706 info.clear_laggy();
7c673cae
FG
707 }
708 );
709 }
9f95a23c 710
91327a77 711 dout(5) << "prepare_beacon mds." << info.rank
7c673cae
FG
712 << " " << ceph_mds_state_name(info.state)
713 << " -> " << ceph_mds_state_name(state)
7c673cae 714 << dendl;
9f95a23c
TL
715
716 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
717 if (m->get_fs().size()) {
718 auto f = pending.get_filesystem(m->get_fs());
719 if (f) {
720 fscid = f->fscid;
721 }
722 }
723 pending.modify_daemon(gid, [fscid](auto& info) {
724 info.join_fscid = fscid;
725 });
726
7c673cae 727 if (state == MDSMap::STATE_STOPPED) {
28e407b8
AA
728 const auto fscid = pending.mds_roles.at(gid);
729 const auto &fs = pending.get_filesystem(fscid);
181888fb 730
d2e6a577 731 mon->clog->info() << info.human_name() << " finished "
11fdf7f2 732 << "stopping rank " << info.rank << " in filesystem "
d2e6a577 733 << fs->mds_map.fs_name << " (now has "
181888fb 734 << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
d2e6a577 735
28e407b8 736 auto erased = pending.stop(gid);
7c673cae
FG
737 erased.push_back(gid);
738
9f95a23c 739 for (const auto& erased_gid : erased) {
7c673cae
FG
740 last_beacon.erase(erased_gid);
741 if (pending_daemon_health.count(erased_gid)) {
742 pending_daemon_health.erase(erased_gid);
743 pending_daemon_health_rm.insert(erased_gid);
744 }
745 }
d2e6a577
FG
746
747
7c673cae
FG
748 } else if (state == MDSMap::STATE_DAMAGED) {
749 if (!mon->osdmon()->is_writeable()) {
91327a77 750 dout(1) << __func__ << ": DAMAGED from rank " << info.rank
7c673cae
FG
751 << " waiting for osdmon writeable to blacklist it" << dendl;
752 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
753 return false;
754 }
755
756 // Record this MDS rank as damaged, so that other daemons
757 // won't try to run it.
91327a77 758 dout(0) << __func__ << ": marking rank "
7c673cae
FG
759 << info.rank << " damaged" << dendl;
760
761 utime_t until = ceph_clock_now();
11fdf7f2
TL
762 until += g_conf().get_val<double>("mon_mds_blacklist_interval");
763 const auto blacklist_epoch = mon->osdmon()->blacklist(info.addrs, until);
7c673cae 764 request_proposal(mon->osdmon());
28e407b8 765 pending.damaged(gid, blacklist_epoch);
7c673cae
FG
766 last_beacon.erase(gid);
767
768 // Respond to MDS, so that it knows it can continue to shut down
9f95a23c 769 auto beacon = make_message<MMDSBeacon>(
7c673cae 770 mon->monmap->fsid, m->get_global_id(),
28e407b8 771 m->get_name(), pending.get_epoch(), state, seq,
11fdf7f2
TL
772 CEPH_FEATURES_SUPPORTED_DEFAULT);
773 mon->send_reply(op, beacon.detach());
7c673cae
FG
774 } else if (state == MDSMap::STATE_DNE) {
775 if (!mon->osdmon()->is_writeable()) {
91327a77 776 dout(1) << __func__ << ": DNE from rank " << info.rank
7c673cae
FG
777 << " waiting for osdmon writeable to blacklist it" << dendl;
778 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
779 return false;
780 }
781
1adf2230 782 fail_mds_gid(pending, gid);
11fdf7f2 783 ceph_assert(mon->osdmon()->is_writeable());
7c673cae
FG
784 request_proposal(mon->osdmon());
785
786 // Respond to MDS, so that it knows it can continue to shut down
9f95a23c 787 auto beacon = make_message<MMDSBeacon>(mon->monmap->fsid,
11fdf7f2
TL
788 m->get_global_id(), m->get_name(), pending.get_epoch(), state, seq,
789 CEPH_FEATURES_SUPPORTED_DEFAULT);
790 mon->send_reply(op, beacon.detach());
7c673cae
FG
791 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
792 // Standby daemons should never modify their own
793 // state. Reject any attempts to do so.
794 derr << "standby " << gid << " attempted to change state to "
795 << ceph_mds_state_name(state) << ", rejecting" << dendl;
796 return true;
797 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
798 !MDSMap::state_transition_valid(info.state, state)) {
799 // Validate state transitions for daemons that hold a rank
800 derr << "daemon " << gid << " (rank " << info.rank << ") "
801 << "reported invalid state transition "
802 << ceph_mds_state_name(info.state) << " -> "
803 << ceph_mds_state_name(state) << dendl;
804 return true;
805 } else {
b32b8144 806 if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
28e407b8
AA
807 const auto &fscid = pending.mds_roles.at(gid);
808 const auto &fs = pending.get_filesystem(fscid);
d2e6a577
FG
809 mon->clog->info() << info.human_name() << " is now active in "
810 << "filesystem " << fs->mds_map.fs_name << " as rank "
811 << info.rank;
812 }
b32b8144
FG
813
814 // Made it through special cases and validations, record the
815 // daemon's reported state to the FSMap.
11fdf7f2
TL
816 pending.modify_daemon(gid, [state, seq](auto& info) {
817 info.state = state;
818 info.state_seq = seq;
b32b8144 819 });
7c673cae
FG
820 }
821 }
822
91327a77 823 dout(5) << "prepare_beacon pending map now:" << dendl;
28e407b8 824 print_map(pending);
7c673cae 825
9f95a23c 826 wait_for_finished_proposal(op, new LambdaContext([op, this](int r){
7c673cae
FG
827 if (r >= 0)
828 _updated(op); // success
829 else if (r == -ECANCELED) {
830 mon->no_reply(op);
831 } else {
832 dispatch(op); // try again
833 }
834 }));
835
836 return true;
837}
838
839bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
840{
28e407b8
AA
841 auto &pending = get_pending_fsmap_writeable();
842
7c673cae 843 op->mark_mdsmon_event(__func__);
9f95a23c 844 auto m = op->get_req<MMDSLoadTargets>();
7c673cae 845 mds_gid_t gid = m->global_id;
28e407b8 846 if (pending.gid_has_rank(gid)) {
7c673cae 847 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
28e407b8 848 pending.update_export_targets(gid, m->targets);
7c673cae
FG
849 } else {
850 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
851 }
91327a77 852 mon->no_reply(op);
7c673cae
FG
853 return true;
854}
855
856bool MDSMonitor::should_propose(double& delay)
857{
858 // delegate to PaxosService to assess whether we should propose
859 return PaxosService::should_propose(delay);
860}
861
862void MDSMonitor::_updated(MonOpRequestRef op)
863{
28e407b8 864 const auto &fsmap = get_fsmap();
7c673cae 865 op->mark_mdsmon_event(__func__);
9f95a23c 866 auto m = op->get_req<MMDSBeacon>();
7c673cae 867 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
11fdf7f2
TL
868 mon->clog->debug() << m->get_orig_source() << " "
869 << m->get_orig_source_addrs() << " "
870 << ceph_mds_state_name(m->get_state());
7c673cae
FG
871
872 if (m->get_state() == MDSMap::STATE_STOPPED) {
873 // send the map manually (they're out of the map, so they won't get it automatic)
874 MDSMap null_map;
875 null_map.epoch = fsmap.epoch;
876 null_map.compat = fsmap.compat;
9f95a23c 877 auto m = make_message<MMDSMap>(mon->monmap->fsid, null_map);
11fdf7f2 878 mon->send_reply(op, m.detach());
7c673cae 879 } else {
9f95a23c 880 auto beacon = make_message<MMDSBeacon>(mon->monmap->fsid,
11fdf7f2
TL
881 m->get_global_id(), m->get_name(), fsmap.get_epoch(),
882 m->get_state(), m->get_seq(), CEPH_FEATURES_SUPPORTED_DEFAULT);
883 mon->send_reply(op, beacon.detach());
7c673cae
FG
884 }
885}
886
887void MDSMonitor::on_active()
888{
889 tick();
7c673cae 890
28e407b8
AA
891 if (is_leader()) {
892 mon->clog->debug() << "fsmap " << get_fsmap();
224ce89b 893 }
7c673cae
FG
894}
895
7c673cae
FG
896void MDSMonitor::dump_info(Formatter *f)
897{
898 f->open_object_section("fsmap");
28e407b8 899 get_fsmap().dump(f);
7c673cae
FG
900 f->close_section();
901
902 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
903 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
904}
905
906bool MDSMonitor::preprocess_command(MonOpRequestRef op)
907{
908 op->mark_mdsmon_event(__func__);
9f95a23c 909 auto m = op->get_req<MMonCommand>();
7c673cae
FG
910 int r = -1;
911 bufferlist rdata;
912 stringstream ss, ds;
913
1adf2230 914 const auto &fsmap = get_fsmap();
28e407b8 915
11fdf7f2 916 cmdmap_t cmdmap;
7c673cae
FG
917 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
918 // ss has reason for failure
919 string rs = ss.str();
920 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
921 return true;
922 }
923
924 string prefix;
9f95a23c 925 cmd_getval(cmdmap, "prefix", prefix);
7c673cae 926 string format;
9f95a23c 927 cmd_getval(cmdmap, "format", format, string("plain"));
1adf2230 928 std::unique_ptr<Formatter> f(Formatter::create(format));
7c673cae 929
11fdf7f2 930 MonSession *session = op->get_session();
7c673cae
FG
931 if (!session) {
932 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
933 return true;
934 }
935
936 if (prefix == "mds stat") {
937 if (f) {
938 f->open_object_section("mds_stat");
939 dump_info(f.get());
940 f->close_section();
941 f->flush(ds);
942 } else {
943 ds << fsmap;
944 }
945 r = 0;
11fdf7f2
TL
946 } else if (prefix == "mds ok-to-stop") {
947 vector<string> ids;
9f95a23c 948 if (!cmd_getval(cmdmap, "ids", ids)) {
11fdf7f2
TL
949 r = -EINVAL;
950 ss << "must specify mds id";
951 goto out;
952 }
953 if (fsmap.is_any_degraded()) {
954 ss << "one or more filesystems is currently degraded";
955 r = -EBUSY;
956 goto out;
957 }
958 set<mds_gid_t> stopping;
959 for (auto& id : ids) {
960 ostringstream ess;
961 mds_gid_t gid = gid_from_arg(fsmap, id, ess);
962 if (gid == MDS_GID_NONE) {
963 // the mds doesn't exist, but no file systems are unhappy, so losing it
964 // can't have any effect.
965 continue;
966 }
967 stopping.insert(gid);
968 }
969 set<mds_gid_t> active;
970 set<mds_gid_t> standby;
971 for (auto gid : stopping) {
972 if (fsmap.gid_has_rank(gid)) {
973 // ignore standby-replay daemons (at this level)
974 if (!fsmap.is_standby_replay(gid)) {
975 auto standby = fsmap.get_standby_replay(gid);
976 if (standby == MDS_GID_NONE ||
977 stopping.count(standby)) {
978 // no standby-replay, or we're also stopping the standby-replay
979 // for this mds
980 active.insert(gid);
981 }
982 }
7c673cae 983 } else {
11fdf7f2
TL
984 // net loss of a standby
985 standby.insert(gid);
7c673cae
FG
986 }
987 }
11fdf7f2
TL
988 if (fsmap.get_num_standby() - standby.size() < active.size()) {
989 r = -EBUSY;
990 ss << "insufficent standby MDS daemons to stop active gids "
991 << stringify(active)
992 << " and/or standby gids " << stringify(standby);;
993 goto out;
28e407b8 994 }
11fdf7f2
TL
995 r = 0;
996 ss << "should be safe to stop " << ids;
7c673cae
FG
997 } else if (prefix == "fs dump") {
998 int64_t epocharg;
999 epoch_t epoch;
1000
1adf2230 1001 const FSMap *fsmapp = &fsmap;
28e407b8 1002 FSMap dummy;
9f95a23c 1003 if (cmd_getval(cmdmap, "epoch", epocharg)) {
7c673cae
FG
1004 epoch = epocharg;
1005 bufferlist b;
1006 int err = get_version(epoch, b);
1007 if (err == -ENOENT) {
7c673cae 1008 r = -ENOENT;
28e407b8 1009 goto out;
7c673cae 1010 } else {
11fdf7f2
TL
1011 ceph_assert(err == 0);
1012 ceph_assert(b.length());
28e407b8
AA
1013 dummy.decode(b);
1014 fsmapp = &dummy;
7c673cae
FG
1015 }
1016 }
c07f9fc5 1017
28e407b8
AA
1018 stringstream ds;
1019 if (f != NULL) {
1020 f->open_object_section("fsmap");
1021 fsmapp->dump(f.get());
1022 f->close_section();
1023 f->flush(ds);
1024 r = 0;
1025 } else {
1026 fsmapp->print(ds);
1027 r = 0;
7c673cae 1028 }
28e407b8
AA
1029
1030 rdata.append(ds);
1031 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
7c673cae
FG
1032 } else if (prefix == "mds metadata") {
1033 if (!f)
1034 f.reset(Formatter::create("json-pretty"));
1035
1036 string who;
9f95a23c 1037 bool all = !cmd_getval(cmdmap, "who", who);
7c673cae
FG
1038 dout(1) << "all = " << all << dendl;
1039 if (all) {
1040 r = 0;
1041 // Dump all MDSs' metadata
1042 const auto all_info = fsmap.get_mds_info();
1043
1044 f->open_array_section("mds_metadata");
1045 for(const auto &i : all_info) {
1046 const auto &info = i.second;
1047
1048 f->open_object_section("mds");
1049 f->dump_string("name", info.name);
1050 std::ostringstream get_err;
1adf2230 1051 r = dump_metadata(fsmap, info.name, f.get(), get_err);
7c673cae
FG
1052 if (r == -EINVAL || r == -ENOENT) {
1053 // Drop error, list what metadata we do have
1054 dout(1) << get_err.str() << dendl;
1055 r = 0;
1056 } else if (r != 0) {
1057 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1058 << dendl;
1059 ss << get_err.str();
c07f9fc5 1060 f->close_section();
7c673cae
FG
1061 break;
1062 }
1063 f->close_section();
1064 }
1065 f->close_section();
1066 } else {
1067 // Dump a single daemon's metadata
1068 f->open_object_section("mds_metadata");
1adf2230 1069 r = dump_metadata(fsmap, who, f.get(), ss);
7c673cae
FG
1070 f->close_section();
1071 }
1072 f->flush(ds);
31f18b77
FG
1073 } else if (prefix == "mds versions") {
1074 if (!f)
1075 f.reset(Formatter::create("json-pretty"));
1076 count_metadata("ceph_version", f.get());
1077 f->flush(ds);
1078 r = 0;
1079 } else if (prefix == "mds count-metadata") {
1080 if (!f)
1081 f.reset(Formatter::create("json-pretty"));
1082 string field;
9f95a23c 1083 cmd_getval(cmdmap, "property", field);
31f18b77
FG
1084 count_metadata(field, f.get());
1085 f->flush(ds);
1086 r = 0;
7c673cae
FG
1087 } else if (prefix == "mds compat show") {
1088 if (f) {
1089 f->open_object_section("mds_compat");
1090 fsmap.compat.dump(f.get());
1091 f->close_section();
1092 f->flush(ds);
1093 } else {
1094 ds << fsmap.compat;
1095 }
1096 r = 0;
1097 } else if (prefix == "fs get") {
1098 string fs_name;
9f95a23c 1099 cmd_getval(cmdmap, "fs_name", fs_name);
28e407b8 1100 const auto &fs = fsmap.get_filesystem(fs_name);
7c673cae
FG
1101 if (fs == nullptr) {
1102 ss << "filesystem '" << fs_name << "' not found";
1103 r = -ENOENT;
1104 } else {
1105 if (f != nullptr) {
1106 f->open_object_section("filesystem");
1107 fs->dump(f.get());
1108 f->close_section();
1109 f->flush(ds);
1110 r = 0;
1111 } else {
1112 fs->print(ds);
1113 r = 0;
1114 }
1115 }
1116 } else if (prefix == "fs ls") {
1117 if (f) {
1118 f->open_array_section("filesystems");
1adf2230
AA
1119 for (const auto &p : fsmap.filesystems) {
1120 const auto &fs = p.second;
1121 f->open_object_section("filesystem");
1122 {
1123 const MDSMap &mds_map = fs->mds_map;
1124 f->dump_string("name", mds_map.fs_name);
1125 /* Output both the names and IDs of pools, for use by
1126 * humans and machines respectively */
1127 f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
1128 mds_map.metadata_pool));
1129 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1130 f->open_array_section("data_pool_ids");
1131 for (const auto &id : mds_map.data_pools) {
1132 f->dump_int("data_pool_id", id);
1133 }
1134 f->close_section();
7c673cae 1135
1adf2230
AA
1136 f->open_array_section("data_pools");
1137 for (const auto &id : mds_map.data_pools) {
1138 const auto &name = mon->osdmon()->osdmap.get_pool_name(id);
1139 f->dump_string("data_pool", name);
7c673cae
FG
1140 }
1141 f->close_section();
1142 }
1adf2230 1143 f->close_section();
7c673cae
FG
1144 }
1145 f->close_section();
1146 f->flush(ds);
1147 } else {
28e407b8
AA
1148 for (const auto &p : fsmap.filesystems) {
1149 const auto &fs = p.second;
7c673cae
FG
1150 const MDSMap &mds_map = fs->mds_map;
1151 const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
1152 mds_map.metadata_pool);
1153
1154 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1155 << md_pool_name << ", data pools: [";
1adf2230
AA
1156 for (const auto &id : mds_map.data_pools) {
1157 const string &pool_name = mon->osdmon()->osdmap.get_pool_name(id);
7c673cae
FG
1158 ds << pool_name << " ";
1159 }
1160 ds << "]" << std::endl;
1161 }
1162
1163 if (fsmap.filesystems.empty()) {
1164 ds << "No filesystems enabled" << std::endl;
1165 }
1166 }
1167 r = 0;
1168 }
1169
28e407b8 1170out:
7c673cae
FG
1171 if (r != -1) {
1172 rdata.append(ds);
1173 string rs;
1174 getline(ss, rs);
1175 mon->reply_command(op, r, rs, rdata, get_last_committed());
1176 return true;
1177 } else
1178 return false;
1179}
1180
1adf2230 1181bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
7c673cae 1182{
9f95a23c 1183 const auto& info = fsmap.get_info_gid(gid);
91327a77 1184 dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
7c673cae 1185
a8e16298
TL
1186 ceph_assert(mon->osdmon()->is_writeable());
1187
7c673cae
FG
1188 epoch_t blacklist_epoch = 0;
1189 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1190 utime_t until = ceph_clock_now();
11fdf7f2
TL
1191 until += g_conf().get_val<double>("mon_mds_blacklist_interval");
1192 blacklist_epoch = mon->osdmon()->blacklist(info.addrs, until);
7c673cae
FG
1193 }
1194
1adf2230 1195 fsmap.erase(gid, blacklist_epoch);
7c673cae
FG
1196 last_beacon.erase(gid);
1197 if (pending_daemon_health.count(gid)) {
1198 pending_daemon_health.erase(gid);
1199 pending_daemon_health_rm.insert(gid);
1200 }
1201
1202 return blacklist_epoch != 0;
1203}
1204
1adf2230 1205mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
7c673cae
FG
1206{
1207 // Try parsing as a role
1208 mds_role_t role;
1209 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1adf2230 1210 int r = fsmap.parse_role(arg, &role, ignore_err);
7c673cae
FG
1211 if (r == 0) {
1212 // See if a GID is assigned to this role
28e407b8 1213 const auto &fs = fsmap.get_filesystem(role.fscid);
11fdf7f2 1214 ceph_assert(fs != nullptr); // parse_role ensures it exists
7c673cae
FG
1215 if (fs->mds_map.is_up(role.rank)) {
1216 dout(10) << __func__ << ": validated rank/GID " << role
1217 << " as a rank" << dendl;
1218 return fs->mds_map.get_mds_info(role.rank).global_id;
1219 }
1220 }
1221
1222 // Try parsing as a gid
1223 std::string err;
1224 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1225 if (!err.empty()) {
1226 // Not a role or a GID, try as a daemon name
28e407b8 1227 const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
7c673cae
FG
1228 if (!mds_info) {
1229 ss << "MDS named '" << arg
1230 << "' does not exist, or is not up";
1231 return MDS_GID_NONE;
1232 }
1233 dout(10) << __func__ << ": resolved MDS name '" << arg
1234 << "' to GID " << mds_info->global_id << dendl;
1235 return mds_info->global_id;
1236 } else {
1237 // Not a role, but parses as a an integer, might be a GID
1238 dout(10) << __func__ << ": treating MDS reference '" << arg
1239 << "' as an integer " << maybe_gid << dendl;
31f18b77 1240
28e407b8 1241 if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
31f18b77 1242 return mds_gid_t(maybe_gid);
7c673cae
FG
1243 }
1244 }
1245
1246 dout(1) << __func__ << ": rank/GID " << arg
1247 << " not a existent rank or GID" << dendl;
1248 return MDS_GID_NONE;
1249}
1250
1adf2230
AA
1251int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
1252 const std::string &arg, MDSMap::mds_info_t *failed_info)
7c673cae 1253{
11fdf7f2 1254 ceph_assert(failed_info != nullptr);
d2e6a577 1255
1adf2230 1256 mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
7c673cae
FG
1257 if (gid == MDS_GID_NONE) {
1258 return 0;
1259 }
1260 if (!mon->osdmon()->is_writeable()) {
1261 return -EAGAIN;
1262 }
d2e6a577
FG
1263
1264 // Take a copy of the info before removing the MDS from the map,
1265 // so that the caller knows which mds (if any) they ended up removing.
1adf2230 1266 *failed_info = fsmap.get_info_gid(gid);
d2e6a577 1267
1adf2230 1268 fail_mds_gid(fsmap, gid);
7c673cae 1269 ss << "failed mds gid " << gid;
11fdf7f2 1270 ceph_assert(mon->osdmon()->is_writeable());
7c673cae
FG
1271 request_proposal(mon->osdmon());
1272 return 0;
1273}
1274
1275bool MDSMonitor::prepare_command(MonOpRequestRef op)
1276{
1277 op->mark_mdsmon_event(__func__);
9f95a23c 1278 auto m = op->get_req<MMonCommand>();
7c673cae
FG
1279 int r = -EINVAL;
1280 stringstream ss;
1281 bufferlist rdata;
1282
11fdf7f2 1283 cmdmap_t cmdmap;
7c673cae
FG
1284 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1285 string rs = ss.str();
1286 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1287 return true;
1288 }
1289
1290 string prefix;
9f95a23c 1291 cmd_getval(cmdmap, "prefix", prefix);
7c673cae
FG
1292
1293 /* Refuse access if message not associated with a valid session */
11fdf7f2 1294 MonSession *session = op->get_session();
7c673cae
FG
1295 if (!session) {
1296 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1297 return true;
1298 }
1299
28e407b8
AA
1300 auto &pending = get_pending_fsmap_writeable();
1301
c07f9fc5 1302 bool batched_propose = false;
28e407b8 1303 for (const auto &h : handlers) {
7c673cae 1304 if (h->can_handle(prefix)) {
c07f9fc5
FG
1305 batched_propose = h->batched_propose();
1306 if (batched_propose) {
1307 paxos->plug();
1308 }
28e407b8 1309 r = h->handle(mon, pending, op, cmdmap, ss);
c07f9fc5
FG
1310 if (batched_propose) {
1311 paxos->unplug();
1312 }
1313
7c673cae
FG
1314 if (r == -EAGAIN) {
1315 // message has been enqueued for retry; return.
1316 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1317 return false;
1318 } else {
1319 if (r == 0) {
1320 // On successful updates, print the updated map
28e407b8 1321 print_map(pending);
7c673cae
FG
1322 }
1323 // Successful or not, we're done: respond.
1324 goto out;
1325 }
1326 }
1327 }
1328
1adf2230 1329 r = filesystem_command(pending, op, prefix, cmdmap, ss);
7c673cae
FG
1330 if (r >= 0) {
1331 goto out;
1332 } else if (r == -EAGAIN) {
1333 // Do not reply, the message has been enqueued for retry
1334 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1335 return false;
1336 } else if (r != -ENOSYS) {
1337 goto out;
1338 }
1339
7c673cae
FG
1340 if (r == -ENOSYS && ss.str().empty()) {
1341 ss << "unrecognized command";
1342 }
1343
1344out:
1345 dout(4) << __func__ << " done, r=" << r << dendl;
1346 /* Compose response */
1347 string rs;
1348 getline(ss, rs);
1349
1350 if (r >= 0) {
1351 // success.. delay reply
1352 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1353 get_last_committed() + 1));
c07f9fc5
FG
1354 if (batched_propose) {
1355 force_immediate_propose();
1356 }
7c673cae
FG
1357 return true;
1358 } else {
1359 // reply immediately
1360 mon->reply_command(op, r, rs, rdata, get_last_committed());
1361 return false;
1362 }
1363}
1364
7c673cae 1365int MDSMonitor::filesystem_command(
1adf2230 1366 FSMap &fsmap,
7c673cae
FG
1367 MonOpRequestRef op,
1368 std::string const &prefix,
11fdf7f2 1369 const cmdmap_t& cmdmap,
7c673cae
FG
1370 std::stringstream &ss)
1371{
1372 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1373 op->mark_mdsmon_event(__func__);
1374 int r = 0;
1375 string whostr;
9f95a23c 1376 cmd_getval(cmdmap, "role", whostr);
7c673cae 1377
11fdf7f2 1378 if (prefix == "mds set_state") {
7c673cae 1379 mds_gid_t gid;
9f95a23c 1380 if (!cmd_getval(cmdmap, "gid", gid)) {
7c673cae 1381 ss << "error parsing 'gid' value '"
11fdf7f2 1382 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
7c673cae
FG
1383 return -EINVAL;
1384 }
1385 MDSMap::DaemonState state;
9f95a23c 1386 if (!cmd_getval(cmdmap, "state", state)) {
7c673cae 1387 ss << "error parsing 'state' string value '"
11fdf7f2 1388 << cmd_vartype_stringify(cmdmap.at("state")) << "'";
7c673cae
FG
1389 return -EINVAL;
1390 }
1adf2230 1391 if (fsmap.gid_exists(gid)) {
11fdf7f2
TL
1392 fsmap.modify_daemon(gid, [state](auto& info) {
1393 info.state = state;
7c673cae
FG
1394 });
1395 ss << "set mds gid " << gid << " to state " << state << " "
1396 << ceph_mds_state_name(state);
1397 return 0;
1398 }
1399 } else if (prefix == "mds fail") {
1400 string who;
9f95a23c 1401 cmd_getval(cmdmap, "role_or_gid", who);
d2e6a577
FG
1402
1403 MDSMap::mds_info_t failed_info;
1adf2230 1404 r = fail_mds(fsmap, ss, who, &failed_info);
7c673cae
FG
1405 if (r < 0 && r == -EAGAIN) {
1406 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1407 return -EAGAIN; // don't propose yet; wait for message to be retried
d2e6a577
FG
1408 } else if (r == 0) {
1409 // Only log if we really did something (not when was already gone)
1410 if (failed_info.global_id != MDS_GID_NONE) {
1411 mon->clog->info() << failed_info.human_name() << " marked failed by "
1412 << op->get_session()->entity_name;
1413 }
7c673cae
FG
1414 }
1415 } else if (prefix == "mds rm") {
1416 mds_gid_t gid;
9f95a23c 1417 if (!cmd_getval(cmdmap, "gid", gid)) {
7c673cae 1418 ss << "error parsing 'gid' value '"
11fdf7f2 1419 << cmd_vartype_stringify(cmdmap.at("gid")) << "'";
7c673cae
FG
1420 return -EINVAL;
1421 }
1adf2230 1422 if (!fsmap.gid_exists(gid)) {
11fdf7f2 1423 ss << "mds gid " << gid << " does not exist";
7c673cae
FG
1424 r = 0;
1425 } else {
1adf2230 1426 const auto &info = fsmap.get_info_gid(gid);
28e407b8 1427 MDSMap::DaemonState state = info.state;
7c673cae 1428 if (state > 0) {
28e407b8
AA
1429 ss << "cannot remove active mds." << info.name
1430 << " rank " << info.rank;
7c673cae
FG
1431 return -EBUSY;
1432 } else {
1adf2230 1433 fsmap.erase(gid, {});
7c673cae
FG
1434 ss << "removed mds gid " << gid;
1435 return 0;
1436 }
1437 }
1438 } else if (prefix == "mds rmfailed") {
11fdf7f2 1439 bool confirm = false;
9f95a23c 1440 cmd_getval(cmdmap, "yes_i_really_mean_it", confirm);
11fdf7f2 1441 if (!confirm) {
7c673cae
FG
1442 ss << "WARNING: this can make your filesystem inaccessible! "
1443 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1444 return -EPERM;
1445 }
1446
1447 std::string role_str;
9f95a23c 1448 cmd_getval(cmdmap, "role", role_str);
7c673cae 1449 mds_role_t role;
1adf2230 1450 int r = fsmap.parse_role(role_str, &role, ss);
7c673cae
FG
1451 if (r < 0) {
1452 ss << "invalid role '" << role_str << "'";
1453 return -EINVAL;
1454 }
1455
1adf2230 1456 fsmap.modify_filesystem(
7c673cae
FG
1457 role.fscid,
1458 [role](std::shared_ptr<Filesystem> fs)
1459 {
1460 fs->mds_map.failed.erase(role.rank);
1461 });
1462
1463 ss << "removed failed mds." << role;
1464 return 0;
1465 } else if (prefix == "mds compat rm_compat") {
1466 int64_t f;
9f95a23c 1467 if (!cmd_getval(cmdmap, "feature", f)) {
7c673cae 1468 ss << "error parsing feature value '"
11fdf7f2 1469 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
7c673cae
FG
1470 return -EINVAL;
1471 }
1adf2230 1472 if (fsmap.compat.compat.contains(f)) {
7c673cae 1473 ss << "removing compat feature " << f;
1adf2230 1474 CompatSet modified = fsmap.compat;
7c673cae 1475 modified.compat.remove(f);
1adf2230 1476 fsmap.update_compat(modified);
7c673cae 1477 } else {
1adf2230 1478 ss << "compat feature " << f << " not present in " << fsmap.compat;
7c673cae
FG
1479 }
1480 r = 0;
1481 } else if (prefix == "mds compat rm_incompat") {
1482 int64_t f;
9f95a23c 1483 if (!cmd_getval(cmdmap, "feature", f)) {
7c673cae 1484 ss << "error parsing feature value '"
11fdf7f2 1485 << cmd_vartype_stringify(cmdmap.at("feature")) << "'";
7c673cae
FG
1486 return -EINVAL;
1487 }
1adf2230 1488 if (fsmap.compat.incompat.contains(f)) {
7c673cae 1489 ss << "removing incompat feature " << f;
1adf2230 1490 CompatSet modified = fsmap.compat;
7c673cae 1491 modified.incompat.remove(f);
1adf2230 1492 fsmap.update_compat(modified);
7c673cae 1493 } else {
1adf2230 1494 ss << "incompat feature " << f << " not present in " << fsmap.compat;
7c673cae
FG
1495 }
1496 r = 0;
1497 } else if (prefix == "mds repaired") {
1498 std::string role_str;
9f95a23c 1499 cmd_getval(cmdmap, "role", role_str);
7c673cae 1500 mds_role_t role;
1adf2230 1501 r = fsmap.parse_role(role_str, &role, ss);
7c673cae
FG
1502 if (r < 0) {
1503 return r;
1504 }
1505
1adf2230 1506 bool modified = fsmap.undamaged(role.fscid, role.rank);
7c673cae 1507 if (modified) {
494da23a 1508 ss << "repaired: restoring rank " << role;
7c673cae 1509 } else {
494da23a 1510 ss << "nothing to do: rank is not damaged";
7c673cae
FG
1511 }
1512
1513 r = 0;
11fdf7f2
TL
1514 } else if (prefix == "mds freeze") {
1515 std::string who;
9f95a23c 1516 cmd_getval(cmdmap, "role_or_gid", who);
11fdf7f2
TL
1517 mds_gid_t gid = gid_from_arg(fsmap, who, ss);
1518 if (gid == MDS_GID_NONE) {
7c673cae
FG
1519 return -EINVAL;
1520 }
1521
11fdf7f2 1522 bool freeze = false;
7c673cae 1523 {
11fdf7f2 1524 std::string str;
9f95a23c 1525 cmd_getval(cmdmap, "val", str);
11fdf7f2
TL
1526 if ((r = parse_bool(str, &freeze, ss)) != 0) {
1527 return r;
1528 }
1529 }
7c673cae 1530
11fdf7f2
TL
1531 auto f = [freeze,gid,&ss](auto& info) {
1532 if (freeze) {
1533 ss << "freezing mds." << gid;
1534 info.freeze();
1535 } else {
1536 ss << "unfreezing mds." << gid;
1537 info.unfreeze();
1538 }
1539 };
1540 fsmap.modify_daemon(gid, f);
7c673cae
FG
1541 r = 0;
1542 } else {
1543 return -ENOSYS;
1544 }
1545
1546 return r;
1547}
1548
7c673cae
FG
1549void MDSMonitor::check_subs()
1550{
7c673cae
FG
1551 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1552 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1553 // filesystems. Build a list of all the types we service
1554 // subscriptions for.
9f95a23c
TL
1555
1556 std::vector<std::string> types = {
1557 "fsmap",
1558 "fsmap.user",
1559 "mdsmap",
1560 };
1561
28e407b8
AA
1562 for (const auto &p : get_fsmap().filesystems) {
1563 const auto &fscid = p.first;
9f95a23c
TL
1564 CachedStackStringStream cos;
1565 *cos << "mdsmap." << fscid;
1566 types.push_back(std::string(cos->strv()));
7c673cae
FG
1567 }
1568
1569 for (const auto &type : types) {
9f95a23c
TL
1570 auto& subs = mon->session_map.subs;
1571 auto subs_it = subs.find(type);
1572 if (subs_it == subs.end())
7c673cae 1573 continue;
9f95a23c
TL
1574 auto sub_it = subs_it->second->begin();
1575 while (!sub_it.end()) {
1576 auto sub = *sub_it;
1577 ++sub_it; // N.B. check_sub may remove sub!
7c673cae
FG
1578 check_sub(sub);
1579 }
1580 }
1581}
1582
1583
1584void MDSMonitor::check_sub(Subscription *sub)
1585{
1586 dout(20) << __func__ << ": " << sub->type << dendl;
1587
28e407b8
AA
1588 const auto &fsmap = get_fsmap();
1589
7c673cae
FG
1590 if (sub->type == "fsmap") {
1591 if (sub->next <= fsmap.get_epoch()) {
1592 sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
1593 if (sub->onetime) {
1594 mon->session_map.remove_sub(sub);
1595 } else {
1596 sub->next = fsmap.get_epoch() + 1;
1597 }
1598 }
1599 } else if (sub->type == "fsmap.user") {
1600 if (sub->next <= fsmap.get_epoch()) {
1601 FSMapUser fsmap_u;
1602 fsmap_u.epoch = fsmap.get_epoch();
1603 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
28e407b8
AA
1604 for (const auto &p : fsmap.filesystems) {
1605 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
1606 fs_info.cid = p.second->fscid;
1607 fs_info.name = p.second->mds_map.fs_name;
7c673cae
FG
1608 }
1609 sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
1610 if (sub->onetime) {
1611 mon->session_map.remove_sub(sub);
1612 } else {
1613 sub->next = fsmap.get_epoch() + 1;
1614 }
1615 }
1616 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
1617 if (sub->next > fsmap.get_epoch()) {
1618 return;
1619 }
1620
11fdf7f2 1621 const bool is_mds = sub->session->name.is_mds();
7c673cae
FG
1622 mds_gid_t mds_gid = MDS_GID_NONE;
1623 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1624 if (is_mds) {
1625 // What (if any) namespace are you assigned to?
1626 auto mds_info = fsmap.get_mds_info();
1adf2230 1627 for (const auto &p : mds_info) {
11fdf7f2 1628 if (p.second.addrs == sub->session->addrs) {
1adf2230 1629 mds_gid = p.first;
7c673cae
FG
1630 fscid = fsmap.mds_roles.at(mds_gid);
1631 }
1632 }
1633 } else {
1634 // You're a client. Did you request a particular
1635 // namespace?
11fdf7f2 1636 if (sub->type.compare(0, 7, "mdsmap.") == 0) {
7c673cae
FG
1637 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1638 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1639 std::string err;
1640 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1641 if (!err.empty()) {
1642 // Client asked for a non-existent namespace, send them nothing
1643 dout(1) << "Invalid client subscription '" << sub->type
1644 << "'" << dendl;
1645 return;
1646 }
1647 if (fsmap.filesystems.count(fscid) == 0) {
1648 // Client asked for a non-existent namespace, send them nothing
1649 // TODO: something more graceful for when a client has a filesystem
1650 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1651 // flag to MMDSMap?
1652 dout(1) << "Client subscribed to non-existent namespace '" <<
1653 fscid << "'" << dendl;
1654 return;
1655 }
1656 } else {
1657 // Unqualified request for "mdsmap": give it the one marked
1658 // for use by legacy clients.
1659 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1660 fscid = fsmap.legacy_client_fscid;
1661 } else {
1662 dout(1) << "Client subscribed for legacy filesystem but "
1663 "none is configured" << dendl;
1664 return;
1665 }
1666 }
1667 }
1668 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
1669
1670 // Work out the effective latest epoch
28e407b8 1671 const MDSMap *mds_map = nullptr;
7c673cae
FG
1672 MDSMap null_map;
1673 null_map.compat = fsmap.compat;
1674 if (fscid == FS_CLUSTER_ID_NONE) {
1675 // For a client, we should have already dropped out
11fdf7f2 1676 ceph_assert(is_mds);
7c673cae 1677
28e407b8
AA
1678 auto it = fsmap.standby_daemons.find(mds_gid);
1679 if (it != fsmap.standby_daemons.end()) {
7c673cae 1680 // For an MDS, we need to feed it an MDSMap with its own state in
28e407b8
AA
1681 null_map.mds_info[mds_gid] = it->second;
1682 null_map.epoch = fsmap.standby_epochs.at(mds_gid);
7c673cae
FG
1683 } else {
1684 null_map.epoch = fsmap.epoch;
1685 }
1686 mds_map = &null_map;
1687 } else {
1688 // Check the effective epoch
28e407b8 1689 mds_map = &fsmap.get_filesystem(fscid)->mds_map;
7c673cae
FG
1690 }
1691
11fdf7f2 1692 ceph_assert(mds_map != nullptr);
7c673cae
FG
1693 dout(10) << __func__ << " selected MDS map epoch " <<
1694 mds_map->epoch << " for namespace " << fscid << " for subscriber "
11fdf7f2 1695 << sub->session->name << " who wants epoch " << sub->next << dendl;
7c673cae
FG
1696
1697 if (sub->next > mds_map->epoch) {
1698 return;
1699 }
9f95a23c 1700 auto msg = make_message<MMDSMap>(mon->monmap->fsid, *mds_map);
7c673cae 1701
11fdf7f2 1702 sub->session->con->send_message(msg.detach());
7c673cae
FG
1703 if (sub->onetime) {
1704 mon->session_map.remove_sub(sub);
1705 } else {
1706 sub->next = mds_map->get_epoch() + 1;
1707 }
1708 }
1709}
1710
1711
1712void MDSMonitor::update_metadata(mds_gid_t gid,
1713 const map<string, string>& metadata)
1714{
1715 if (metadata.empty()) {
1716 return;
1717 }
1718 pending_metadata[gid] = metadata;
1719
1720 MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
1721 bufferlist bl;
11fdf7f2 1722 encode(pending_metadata, bl);
7c673cae
FG
1723 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1724 paxos->trigger_propose();
1725}
1726
1adf2230 1727void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
7c673cae
FG
1728{
1729 bool update = false;
1adf2230
AA
1730 for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) {
1731 if (!fsmap.gid_exists(it->first)) {
1732 it = pending_metadata.erase(it);
7c673cae
FG
1733 update = true;
1734 } else {
1adf2230 1735 ++it;
7c673cae
FG
1736 }
1737 }
1738 if (!update)
1739 return;
1740 bufferlist bl;
11fdf7f2 1741 encode(pending_metadata, bl);
7c673cae
FG
1742 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1743}
1744
1745int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1746{
1747 bufferlist bl;
1748 int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1749 if (r) {
11fdf7f2 1750 dout(5) << "Unable to load 'last_metadata'" << dendl;
7c673cae
FG
1751 return r;
1752 }
1753
11fdf7f2
TL
1754 auto it = bl.cbegin();
1755 ceph::decode(m, it);
7c673cae
FG
1756 return 0;
1757}
1758
1adf2230 1759void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out)
31f18b77 1760{
31f18b77
FG
1761 map<mds_gid_t,Metadata> meta;
1762 load_metadata(meta);
1763 for (auto& p : meta) {
1764 auto q = p.second.find(field);
1765 if (q == p.second.end()) {
c07f9fc5 1766 (*out)["unknown"]++;
31f18b77 1767 } else {
c07f9fc5 1768 (*out)[q->second]++;
31f18b77
FG
1769 }
1770 }
c07f9fc5
FG
1771}
1772
1adf2230 1773void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
c07f9fc5
FG
1774{
1775 map<string,int> by_val;
1776 count_metadata(field, &by_val);
31f18b77
FG
1777 f->open_object_section(field.c_str());
1778 for (auto& p : by_val) {
1779 f->dump_int(p.first.c_str(), p.second);
1780 }
1781 f->close_section();
1782}
1783
1adf2230
AA
1784int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
1785 Formatter *f, ostream& err)
7c673cae 1786{
11fdf7f2 1787 ceph_assert(f);
7c673cae 1788
1adf2230 1789 mds_gid_t gid = gid_from_arg(fsmap, who, err);
7c673cae
FG
1790 if (gid == MDS_GID_NONE) {
1791 return -EINVAL;
1792 }
1793
1794 map<mds_gid_t, Metadata> metadata;
1795 if (int r = load_metadata(metadata)) {
1796 err << "Unable to load 'last_metadata'";
1797 return r;
1798 }
1799
1800 if (!metadata.count(gid)) {
1801 return -ENOENT;
1802 }
1803 const Metadata& m = metadata[gid];
1804 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1805 f->dump_string(p->first.c_str(), p->second);
1806 }
1807 return 0;
1808}
1809
1810int MDSMonitor::print_nodes(Formatter *f)
1811{
11fdf7f2 1812 ceph_assert(f);
7c673cae 1813
1adf2230
AA
1814 const auto &fsmap = get_fsmap();
1815
7c673cae
FG
1816 map<mds_gid_t, Metadata> metadata;
1817 if (int r = load_metadata(metadata)) {
1818 return r;
1819 }
1820
11fdf7f2 1821 map<string, list<string> > mdses; // hostname => mds
1adf2230
AA
1822 for (const auto &p : metadata) {
1823 const mds_gid_t& gid = p.first;
1824 const Metadata& m = p.second;
7c673cae
FG
1825 Metadata::const_iterator hostname = m.find("hostname");
1826 if (hostname == m.end()) {
1827 // not likely though
1828 continue;
1829 }
1adf2230 1830 if (!fsmap.gid_exists(gid)) {
7c673cae
FG
1831 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1832 continue;
1833 }
1adf2230 1834 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
11fdf7f2 1835 mdses[hostname->second].push_back(mds_info.name);
7c673cae
FG
1836 }
1837
1838 dump_services(f, mdses, "mds");
1839 return 0;
1840}
1841
1842/**
1843 * If a cluster is undersized (with respect to max_mds), then
11fdf7f2
TL
1844 * attempt to find daemons to grow it. If the cluster is oversized
1845 * (with respect to max_mds) then shrink it by stopping its highest rank.
7c673cae 1846 */
11fdf7f2 1847bool MDSMonitor::maybe_resize_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
7c673cae 1848{
11fdf7f2
TL
1849 auto &current_mds_map = get_fsmap().get_filesystem(fscid)->mds_map;
1850 auto&& fs = fsmap.get_filesystem(fscid);
1adf2230 1851 auto &mds_map = fs->mds_map;
7c673cae 1852
1adf2230
AA
1853 int in = mds_map.get_num_in_mds();
1854 int max = mds_map.get_max_mds();
1855
1856 dout(20) << __func__ << " in " << in << " max " << max << dendl;
1857
11fdf7f2
TL
1858 /* Check that both the current epoch mds_map is resizeable as well as the
1859 * current batch of changes in pending. This is important if an MDS is
1860 * becoming active in the next epoch.
1861 */
1862 if (!current_mds_map.is_resizeable() ||
1863 !mds_map.is_resizeable()) {
1864 dout(5) << __func__ << " mds_map is not currently resizeable" << dendl;
1865 return false;
1866 }
1867
1868 if (in < max && !mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
7c673cae 1869 mds_rank_t mds = mds_rank_t(0);
1adf2230 1870 while (mds_map.is_in(mds)) {
7c673cae
FG
1871 mds++;
1872 }
9f95a23c
TL
1873 auto info = fsmap.find_replacement_for({fscid, mds});
1874 if (!info) {
1adf2230 1875 return false;
7c673cae
FG
1876 }
1877
9f95a23c 1878 dout(1) << "assigned standby " << info->addrs
7c673cae 1879 << " as mds." << mds << dendl;
9f95a23c 1880 mon->clog->info() << info->human_name() << " assigned to "
1adf2230
AA
1881 "filesystem " << mds_map.fs_name << " as rank "
1882 << mds << " (now has " << mds_map.get_num_in_mds() + 1
d2e6a577 1883 << " ranks)";
9f95a23c 1884 fsmap.promote(info->global_id, *fs, mds);
1adf2230 1885 return true;
11fdf7f2
TL
1886 } else if (in > max) {
1887 mds_rank_t target = in - 1;
1888 const auto &info = mds_map.get_info(target);
1889 if (mds_map.is_active(target)) {
1890 dout(1) << "stopping " << target << dendl;
1891 mon->clog->info() << "stopping " << info.human_name();
1892 auto f = [](auto& info) {
1893 info.state = MDSMap::STATE_STOPPING;
1894 };
1895 fsmap.modify_daemon(info.global_id, f);
1896 return true;
1897 } else {
1898 dout(20) << "skipping stop of " << target << dendl;
1899 return false;
1900 }
7c673cae
FG
1901 }
1902
1adf2230 1903 return false;
7c673cae
FG
1904}
1905
1906
1907/**
9f95a23c 1908 * Fail a daemon and replace it with a suitable standby.
7c673cae 1909 */
9f95a23c 1910bool MDSMonitor::drop_mds(FSMap &fsmap, mds_gid_t gid, const mds_info_t* rep_info, bool *osd_propose)
7c673cae 1911{
11fdf7f2 1912 ceph_assert(osd_propose != nullptr);
7c673cae 1913
1adf2230 1914 const auto fscid = fsmap.mds_roles.at(gid);
9f95a23c
TL
1915 const auto& info = fsmap.get_info_gid(gid);
1916 const auto rank = info.rank;
1917 const auto state = info.state;
1918
1919 if (info.is_frozen()) {
1920 return false;
1921 } else if (state == MDSMap::STATE_STANDBY_REPLAY ||
1922 state == MDSMap::STATE_STANDBY) {
1923 dout(1) << " failing and removing standby " << gid << " " << info.addrs
1924 << " mds." << rank
1925 << "." << info.inc << " " << ceph_mds_state_name(state)
1926 << dendl;
1927 *osd_propose |= fail_mds_gid(fsmap, gid);
1928 return true;
1929 } else if (rank >= 0 && rep_info) {
1930 auto fs = fsmap.filesystems.at(fscid);
1931 if (fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
1932 return false;
1933 }
1934 // are we in?
1935 // and is there a non-laggy standby that can take over for us?
1936 dout(1) << " replacing " << gid << " " << info.addrs
1937 << " mds." << rank << "." << info.inc
1938 << " " << ceph_mds_state_name(state)
1939 << " with " << rep_info->global_id << "/" << rep_info->name << " " << rep_info->addrs
1940 << dendl;
1941
1942 mon->clog->warn() << "Replacing " << info.human_name()
1943 << " as rank " << rank
1944 << " with standby " << rep_info->human_name();
1945
1946 // Remove the old one
1947 *osd_propose |= fail_mds_gid(fsmap, gid);
1948
1949 // Promote the replacement
1950 fsmap.promote(rep_info->global_id, *fs, rank);
1951
1952 return true;
1953 }
1954 return false;
1955}
1956
1957bool MDSMonitor::check_health(FSMap& fsmap, bool* propose_osdmap)
1958{
1959 bool do_propose = false;
1960 const auto now = mono_clock::now();
1961 const bool osdmap_writeable = mon->osdmon()->is_writeable();
1962 const auto mds_beacon_grace = g_conf().get_val<double>("mds_beacon_grace");
1963 const auto mds_beacon_interval = g_conf().get_val<double>("mds_beacon_interval");
1964
1965 if (mono_clock::is_zero(last_tick)) {
1966 last_tick = now;
1967 }
1968
1969 {
1970 auto since_last = std::chrono::duration<double>(now-last_tick);
1971
1972 if (since_last.count() > (mds_beacon_grace-mds_beacon_interval)) {
1973 // This case handles either local slowness (calls being delayed
1974 // for whatever reason) or cluster election slowness (a long gap
1975 // between calls while an election happened)
1976 dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
1977 "(slow election?) of " << since_last.count() << " seconds" << dendl;
1978 for (auto& p : last_beacon) {
1979 p.second.stamp = now;
1980 }
1981 }
1982 }
1983
1984 // make sure last_beacon is fully populated
1985 for (auto& p : fsmap.mds_roles) {
1986 auto& gid = p.first;
1987 last_beacon.emplace(std::piecewise_construct,
1988 std::forward_as_tuple(gid),
1989 std::forward_as_tuple(now, 0));
1990 }
7c673cae 1991
31f18b77 1992 // We will only take decisive action (replacing/removing a daemon)
9f95a23c 1993 // if we have some indication that some other daemon(s) are successfully
31f18b77 1994 // getting beacons through recently.
1adf2230 1995 mono_time latest_beacon = mono_clock::zero();
9f95a23c 1996 for (const auto& p : last_beacon) {
1adf2230 1997 latest_beacon = std::max(p.second.stamp, latest_beacon);
31f18b77 1998 }
9f95a23c 1999 auto since = chrono::duration<double>(now-latest_beacon);
1adf2230 2000 const bool may_replace = since.count() <
11fdf7f2 2001 std::max(g_conf()->mds_beacon_interval, g_conf()->mds_beacon_grace * 0.5);
31f18b77 2002
9f95a23c
TL
2003 // check beacon timestamps
2004 std::vector<mds_gid_t> to_remove;
2005 for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
2006 auto& [gid, beacon_info] = *it;
2007 auto since_last = chrono::duration<double>(now-beacon_info.stamp);
2008
2009 if (!fsmap.gid_exists(gid)) {
2010 // gid no longer exists, remove from tracked beacons
2011 it = last_beacon.erase(it);
2012 continue;
2013 }
7c673cae 2014
9f95a23c
TL
2015 if (since_last.count() >= g_conf()->mds_beacon_grace) {
2016 auto& info = fsmap.get_info_gid(gid);
2017 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2018 << " (gid: " << gid << " addr: " << info.addrs
2019 << " state: " << ceph_mds_state_name(info.state) << ")"
2020 << " since " << since_last.count() << dendl;
2021 // If the OSDMap is writeable, we can blacklist things, so we can
2022 // try failing any laggy MDS daemons. Consider each one for failure.
2023 if (!info.laggy()) {
2024 dout(1) << " marking " << gid << " " << info.addrs
2025 << " mds." << info.rank << "." << info.inc
2026 << " " << ceph_mds_state_name(info.state)
2027 << " laggy" << dendl;
2028 fsmap.modify_daemon(info.global_id, [](auto& info) {
2029 info.laggy_since = ceph_clock_now();
2030 });
2031 do_propose = true;
2032 }
2033 if (osdmap_writeable && may_replace) {
2034 to_remove.push_back(gid); // drop_mds may invalidate iterator
2035 }
2036 }
31f18b77 2037
9f95a23c
TL
2038 ++it;
2039 }
7c673cae 2040
9f95a23c
TL
2041 for (const auto& gid : to_remove) {
2042 auto& info = fsmap.get_info_gid(gid);
2043 const mds_info_t* rep_info = nullptr;
2044 if (info.rank >= 0) {
2045 auto fscid = fsmap.gid_fscid(gid);
2046 rep_info = fsmap.find_replacement_for({fscid, info.rank});
2047 }
2048 bool dropped = drop_mds(fsmap, gid, rep_info, propose_osdmap);
2049 if (dropped) {
2050 mon->clog->info() << "MDS " << info.human_name()
2051 << " is removed because it is dead or otherwise unavailable.";
2052 do_propose = true;
2053 }
2054 }
7c673cae 2055
9f95a23c
TL
2056 if (osdmap_writeable) {
2057 for (auto& [fscid, fs] : fsmap.filesystems) {
2058 if (!fs->mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE) &&
2059 fs->mds_map.is_resizeable()) {
2060 // Check if a rank or standby-replay should be replaced with a stronger
2061 // affinity standby. This looks at ranks and standby-replay:
2062 for (const auto& [gid, info] : fs->mds_map.get_mds_info()) {
2063 const auto join_fscid = info.join_fscid;
2064 if (join_fscid == fscid)
2065 continue;
2066 const auto rank = info.rank;
2067 const auto state = info.state;
2068 const mds_info_t* rep_info = nullptr;
2069 if (state == MDSMap::STATE_STANDBY_REPLAY) {
2070 rep_info = fsmap.get_available_standby(fscid);
2071 } else if (state == MDSMap::STATE_ACTIVE) {
2072 rep_info = fsmap.find_replacement_for({fscid, rank});
2073 } else {
2074 /* N.B. !is_degraded() */
2075 ceph_abort_msg("invalid state in MDSMap");
2076 }
2077 if (!rep_info) {
2078 break;
2079 }
2080 bool better_affinity = false;
2081 if (join_fscid == FS_CLUSTER_ID_NONE) {
2082 better_affinity = (rep_info->join_fscid == fscid);
2083 } else {
2084 better_affinity = (rep_info->join_fscid == fscid) ||
2085 (rep_info->join_fscid == FS_CLUSTER_ID_NONE);
2086 }
2087 if (better_affinity) {
2088 if (state == MDSMap::STATE_STANDBY_REPLAY) {
2089 mon->clog->info() << "Dropping low affinity standby-replay "
2090 << info.human_name()
2091 << " in favor of higher affinity standby.";
2092 *propose_osdmap |= fail_mds_gid(fsmap, gid);
2093 /* Now let maybe_promote_standby do the promotion. */
2094 } else {
2095 mon->clog->info() << "Dropping low affinity active "
2096 << info.human_name()
2097 << " in favor of higher affinity standby.";
2098 do_propose |= drop_mds(fsmap, gid, rep_info, propose_osdmap);
2099 }
2100 break; /* don't replace more than one per tick per fs */
2101 }
2102 }
2103 }
2104 }
7c673cae 2105 }
9f95a23c 2106 return do_propose;
7c673cae
FG
2107}
2108
11fdf7f2 2109bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, Filesystem& fs)
7c673cae 2110{
11fdf7f2
TL
2111 if (fs.mds_map.test_flag(CEPH_MDSMAP_NOT_JOINABLE)) {
2112 return false;
2113 }
7c673cae
FG
2114
2115 bool do_propose = false;
2116
2117 // have a standby take over?
2118 set<mds_rank_t> failed;
11fdf7f2
TL
2119 fs.mds_map.get_failed_mds_set(failed);
2120 for (const auto& rank : failed) {
9f95a23c
TL
2121 auto info = fsmap.find_replacement_for({fs.fscid, rank});
2122 if (info) {
2123 dout(1) << " taking over failed mds." << rank << " with " << info->global_id
2124 << "/" << info->name << " " << info->addrs << dendl;
2125 mon->clog->info() << "Standby " << info->human_name()
11fdf7f2
TL
2126 << " assigned to filesystem " << fs.mds_map.fs_name
2127 << " as rank " << rank;
2128
9f95a23c 2129 fsmap.promote(info->global_id, fs, rank);
11fdf7f2 2130 do_propose = true;
7c673cae 2131 }
11fdf7f2
TL
2132 }
2133
9f95a23c 2134 if (!fs.mds_map.is_degraded() && fs.mds_map.allows_standby_replay()) {
7c673cae 2135 // There were no failures to replace, so try using any available standbys
a8e16298
TL
2136 // as standby-replay daemons. Don't do this when the cluster is degraded
2137 // as a standby-replay daemon may try to read a journal being migrated.
11fdf7f2 2138 for (;;) {
9f95a23c
TL
2139 auto info = fsmap.get_available_standby(fs.fscid);
2140 if (!info) break;
2141 dout(20) << "standby available mds." << info->global_id << dendl;
11fdf7f2
TL
2142 bool changed = false;
2143 for (const auto& rank : fs.mds_map.in) {
9f95a23c 2144 dout(20) << "examining " << rank << dendl;
11fdf7f2 2145 if (fs.mds_map.is_followable(rank)) {
9f95a23c 2146 dout(1) << " setting mds." << info->global_id
11fdf7f2 2147 << " to follow mds rank " << rank << dendl;
9f95a23c 2148 fsmap.assign_standby_replay(info->global_id, fs.fscid, rank);
11fdf7f2
TL
2149 do_propose = true;
2150 changed = true;
2151 break;
7c673cae 2152 }
7c673cae 2153 }
11fdf7f2 2154 if (!changed) break;
7c673cae
FG
2155 }
2156 }
2157
2158 return do_propose;
2159}
2160
2161void MDSMonitor::tick()
2162{
1adf2230 2163 if (!is_active() || !is_leader()) return;
28e407b8
AA
2164
2165 auto &pending = get_pending_fsmap_writeable();
7c673cae 2166
28e407b8 2167 bool do_propose = false;
9f95a23c 2168 bool propose_osdmap = false;
7c673cae 2169
28e407b8 2170 do_propose |= pending.check_health();
7c673cae 2171
9f95a23c
TL
2172 /* Check health and affinity of ranks */
2173 do_propose |= check_health(pending, &propose_osdmap);
7c673cae 2174
9f95a23c
TL
2175 /* Resize the cluster according to max_mds. */
2176 for (auto& p : pending.filesystems) {
2177 do_propose |= maybe_resize_cluster(pending, p.second->fscid);
7c673cae
FG
2178 }
2179
9f95a23c
TL
2180 /* Replace any failed ranks. */
2181 for (auto& p : pending.filesystems) {
2182 do_propose |= maybe_promote_standby(pending, *p.second);
7c673cae
FG
2183 }
2184
c07f9fc5
FG
2185 if (propose_osdmap) {
2186 request_proposal(mon->osdmon());
2187 }
7c673cae 2188
7c673cae
FG
2189 if (do_propose) {
2190 propose_pending();
2191 }
9f95a23c
TL
2192
2193 last_tick = mono_clock::now();
7c673cae
FG
2194}
2195
7c673cae
FG
2196MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
2197 : PaxosService(mn, p, service_name)
2198{
c07f9fc5 2199 handlers = FileSystemCommandHandler::load(p);
7c673cae
FG
2200}
2201
2202void MDSMonitor::on_restart()
2203{
2204 // Clear out the leader-specific state.
1adf2230 2205 last_tick = mono_clock::now();
7c673cae
FG
2206 last_beacon.clear();
2207}
2208