]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MDSMonitor.cc
ecbe9f0400e71ccf9dac9fb00084c4ae5daf194d
[ceph.git] / ceph / src / mon / MDSMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <sstream>
16 #include <boost/utility.hpp>
17 #include <boost/regex.hpp>
18
19 #include "MDSMonitor.h"
20 #include "FSCommands.h"
21 #include "Monitor.h"
22 #include "MonitorDBStore.h"
23 #include "OSDMonitor.h"
24 #include "PGMonitor.h"
25
26 #include "common/strtol.h"
27 #include "common/perf_counters.h"
28 #include "common/config.h"
29 #include "common/cmdparse.h"
30 #include "messages/MMDSMap.h"
31 #include "messages/MFSMap.h"
32 #include "messages/MFSMapUser.h"
33 #include "messages/MMDSLoadTargets.h"
34 #include "messages/MMonCommand.h"
35 #include "messages/MGenericMessage.h"
36
37 #include "include/assert.h"
38 #include "include/str_list.h"
39 #include "include/stringify.h"
40 #include "mds/mdstypes.h"
41 #include "Session.h"
42
43 #define dout_subsys ceph_subsys_mon
44 #undef dout_prefix
45 #define dout_prefix _prefix(_dout, mon, fsmap)
46 static ostream& _prefix(std::ostream *_dout, Monitor *mon, FSMap const& fsmap) {
47 return *_dout << "mon." << mon->name << "@" << mon->rank
48 << "(" << mon->get_state_name()
49 << ").mds e" << fsmap.get_epoch() << " ";
50 }
51
52 /*
53 * Specialized implementation of cmd_getval to allow us to parse
54 * out strongly-typedef'd types
55 */
56 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
57 const std::string& k, mds_gid_t &val)
58 {
59 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
60 }
61
62 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
63 const std::string& k, mds_rank_t &val)
64 {
65 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
66 }
67
68 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
69 const std::string& k, MDSMap::DaemonState &val)
70 {
71 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
72 }
73
74 static const string MDS_METADATA_PREFIX("mds_metadata");
75
76
77 // my methods
78
79 void MDSMonitor::print_map(FSMap &m, int dbl)
80 {
81 dout(dbl) << "print_map\n";
82 m.print(*_dout);
83 *_dout << dendl;
84 }
85
86 // service methods
87 void MDSMonitor::create_initial()
88 {
89 dout(10) << "create_initial" << dendl;
90 }
91
92
93 void MDSMonitor::update_from_paxos(bool *need_bootstrap)
94 {
95 version_t version = get_last_committed();
96 if (version == fsmap.epoch)
97 return;
98
99 dout(10) << __func__ << " version " << version
100 << ", my e " << fsmap.epoch << dendl;
101 assert(version > fsmap.epoch);
102
103 load_health();
104
105 // read and decode
106 bufferlist fsmap_bl;
107 fsmap_bl.clear();
108 int err = get_version(version, fsmap_bl);
109 assert(err == 0);
110
111 assert(fsmap_bl.length() > 0);
112 dout(10) << __func__ << " got " << version << dendl;
113 fsmap.decode(fsmap_bl);
114
115 // new map
116 dout(4) << "new map" << dendl;
117 print_map(fsmap, 0);
118 if (!g_conf->mon_mds_skip_sanity) {
119 fsmap.sanity();
120 }
121
122 check_subs();
123 update_logger();
124 }
125
126 void MDSMonitor::init()
127 {
128 (void)load_metadata(pending_metadata);
129 }
130
131 void MDSMonitor::create_pending()
132 {
133 pending_fsmap = fsmap;
134 pending_fsmap.epoch++;
135
136 dout(10) << "create_pending e" << pending_fsmap.epoch << dendl;
137 }
138
139 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
140 {
141 dout(10) << "encode_pending e" << pending_fsmap.epoch << dendl;
142
143
144 // print map iff 'debug mon = 30' or higher
145 print_map(pending_fsmap, 30);
146 if (!g_conf->mon_mds_skip_sanity) {
147 pending_fsmap.sanity();
148 }
149
150 // Set 'modified' on maps modified this epoch
151 for (auto &i : fsmap.filesystems) {
152 if (i.second->mds_map.epoch == fsmap.epoch) {
153 i.second->mds_map.modified = ceph_clock_now();
154 }
155 }
156
157 // apply to paxos
158 assert(get_last_committed() + 1 == pending_fsmap.epoch);
159 bufferlist fsmap_bl;
160 pending_fsmap.encode(fsmap_bl, mon->get_quorum_con_features());
161
162 /* put everything in the transaction */
163 put_version(t, pending_fsmap.epoch, fsmap_bl);
164 put_last_committed(t, pending_fsmap.epoch);
165
166 // Encode MDSHealth data
167 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
168 i != pending_daemon_health.end(); ++i) {
169 bufferlist bl;
170 i->second.encode(bl);
171 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
172 }
173
174 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
175 i != pending_daemon_health_rm.end(); ++i) {
176 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
177 }
178 pending_daemon_health_rm.clear();
179 remove_from_metadata(t);
180
181 // health
182 health_check_map_t new_checks;
183 const auto info_map = pending_fsmap.get_mds_info();
184 for (const auto &i : info_map) {
185 const auto &gid = i.first;
186 const auto &info = i.second;
187 if (pending_daemon_health_rm.count(gid)) {
188 continue;
189 }
190 MDSHealth health;
191 auto p = pending_daemon_health.find(gid);
192 if (p != pending_daemon_health.end()) {
193 health = p->second;
194 } else {
195 bufferlist bl;
196 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
197 if (!bl.length()) {
198 derr << "Missing health data for MDS " << gid << dendl;
199 continue;
200 }
201 bufferlist::iterator bl_i = bl.begin();
202 health.decode(bl_i);
203 }
204 for (const auto &metric : health.metrics) {
205 int const rank = info.rank;
206 health_check_t *check = &new_checks.get_or_add(
207 mds_metric_name(metric.type),
208 metric.sev,
209 mds_metric_summary(metric.type));
210 ostringstream ss;
211 ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
212 for (auto p = metric.metadata.begin();
213 p != metric.metadata.end();
214 ++p) {
215 if (p != metric.metadata.begin()) {
216 ss << ", ";
217 }
218 ss << p->first << ": " << p->second;
219 }
220 check->detail.push_back(ss.str());
221 }
222 }
223 pending_fsmap.get_health_checks(&new_checks);
224 for (auto& p : new_checks.checks) {
225 p.second.summary = boost::regex_replace(
226 p.second.summary,
227 boost::regex("%num%"),
228 stringify(p.second.detail.size()));
229 p.second.summary = boost::regex_replace(
230 p.second.summary,
231 boost::regex("%plurals%"),
232 p.second.detail.size() > 1 ? "s" : "");
233 p.second.summary = boost::regex_replace(
234 p.second.summary,
235 boost::regex("%isorare%"),
236 p.second.detail.size() > 1 ? "are" : "is");
237 }
238 encode_health(new_checks, t);
239 }
240
241 version_t MDSMonitor::get_trim_to()
242 {
243 version_t floor = 0;
244 if (g_conf->mon_mds_force_trim_to > 0 &&
245 g_conf->mon_mds_force_trim_to < (int)get_last_committed()) {
246 floor = g_conf->mon_mds_force_trim_to;
247 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
248 << floor << dendl;
249 }
250
251 unsigned max = g_conf->mon_max_mdsmap_epochs;
252 version_t last = get_last_committed();
253
254 if (last - get_first_committed() > max && floor < last - max)
255 return last - max;
256 return floor;
257 }
258
259 void MDSMonitor::update_logger()
260 {
261 dout(10) << "update_logger" << dendl;
262
263 uint64_t up = 0;
264 uint64_t in = 0;
265 uint64_t failed = 0;
266 for (const auto &i : fsmap.filesystems) {
267 const MDSMap &mds_map = i.second->mds_map;
268
269 up += mds_map.get_num_up_mds();
270 in += mds_map.get_num_in_mds();
271 failed += mds_map.get_num_failed_mds();
272 }
273 mon->cluster_logger->set(l_cluster_num_mds_up, up);
274 mon->cluster_logger->set(l_cluster_num_mds_in, in);
275 mon->cluster_logger->set(l_cluster_num_mds_failed, failed);
276 mon->cluster_logger->set(l_cluster_mds_epoch, fsmap.get_epoch());
277 }
278
279 bool MDSMonitor::preprocess_query(MonOpRequestRef op)
280 {
281 op->mark_mdsmon_event(__func__);
282 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
283 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
284
285 switch (m->get_type()) {
286
287 case MSG_MDS_BEACON:
288 return preprocess_beacon(op);
289
290 case MSG_MON_COMMAND:
291 return preprocess_command(op);
292
293 case MSG_MDS_OFFLOAD_TARGETS:
294 return preprocess_offload_targets(op);
295
296 default:
297 ceph_abort();
298 return true;
299 }
300 }
301
302 void MDSMonitor::_note_beacon(MMDSBeacon *m)
303 {
304 mds_gid_t gid = mds_gid_t(m->get_global_id());
305 version_t seq = m->get_seq();
306
307 dout(15) << "_note_beacon " << *m << " noting time" << dendl;
308 last_beacon[gid].stamp = ceph_clock_now();
309 last_beacon[gid].seq = seq;
310 }
311
312 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
313 {
314 op->mark_mdsmon_event(__func__);
315 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
316 MDSMap::DaemonState state = m->get_state();
317 mds_gid_t gid = m->get_global_id();
318 version_t seq = m->get_seq();
319 MDSMap::mds_info_t info;
320 epoch_t effective_epoch = 0;
321
322 // check privileges, ignore if fails
323 MonSession *session = m->get_session();
324 assert(session);
325 if (!session->is_capable("mds", MON_CAP_X)) {
326 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
327 << session->caps << dendl;
328 goto ignore;
329 }
330
331 if (m->get_fsid() != mon->monmap->fsid) {
332 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
333 goto ignore;
334 }
335
336 dout(12) << "preprocess_beacon " << *m
337 << " from " << m->get_orig_source_inst()
338 << " " << m->get_compat()
339 << dendl;
340
341 // make sure the address has a port
342 if (m->get_orig_source_addr().get_port() == 0) {
343 dout(1) << " ignoring boot message without a port" << dendl;
344 goto ignore;
345 }
346
347 // check compat
348 if (!m->get_compat().writeable(fsmap.compat)) {
349 dout(1) << " mds " << m->get_source_inst() << " can't write to fsmap " << fsmap.compat << dendl;
350 goto ignore;
351 }
352
353 // fw to leader?
354 if (!mon->is_leader())
355 return false;
356
357 // booted, but not in map?
358 if (!pending_fsmap.gid_exists(gid)) {
359 if (state != MDSMap::STATE_BOOT) {
360 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
361 << ceph_mds_state_name(state) << ")" << dendl;
362
363 MDSMap null_map;
364 null_map.epoch = fsmap.epoch;
365 null_map.compat = fsmap.compat;
366 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
367 return true;
368 } else {
369 return false; // not booted yet.
370 }
371 }
372 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
373 info = pending_fsmap.get_info_gid(gid);
374
375 // old seq?
376 if (info.state_seq > seq) {
377 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
378 goto ignore;
379 }
380
381 // Work out the latest epoch that this daemon should have seen
382 {
383 fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
384 if (fscid == FS_CLUSTER_ID_NONE) {
385 effective_epoch = pending_fsmap.standby_epochs.at(gid);
386 } else {
387 effective_epoch = pending_fsmap.get_filesystem(fscid)->mds_map.epoch;
388 }
389 if (effective_epoch != m->get_last_epoch_seen()) {
390 dout(10) << "mds_beacon " << *m
391 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
392 goto reply;
393 }
394 }
395
396 if (info.laggy()) {
397 _note_beacon(m);
398 return false; // no longer laggy, need to update map.
399 }
400 if (state == MDSMap::STATE_BOOT) {
401 // ignore, already booted.
402 goto ignore;
403 }
404 // is there a state change here?
405 if (info.state != state) {
406 // legal state change?
407 if ((info.state == MDSMap::STATE_STANDBY ||
408 info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
409 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
410 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
411 goto reply;
412 }
413
414 if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
415 && info.rank != MDS_RANK_NONE)
416 {
417 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
418 "held rank " << info.rank << " while requesting state "
419 << ceph_mds_state_name(state) << dendl;
420 goto reply;
421 }
422
423 _note_beacon(m);
424 return false;
425 }
426
427 // Comparing known daemon health with m->get_health()
428 // and return false (i.e. require proposal) if they
429 // do not match, to update our stored
430 if (!(pending_daemon_health[gid] == m->get_health())) {
431 dout(20) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
432 _note_beacon(m);
433 return false;
434 }
435
436 reply:
437 // note time and reply
438 assert(effective_epoch > 0);
439 _note_beacon(m);
440 mon->send_reply(op,
441 new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
442 effective_epoch, state, seq,
443 CEPH_FEATURES_SUPPORTED_DEFAULT));
444 return true;
445
446 ignore:
447 // I won't reply this beacon, drop it.
448 mon->no_reply(op);
449 return true;
450 }
451
452 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
453 {
454 op->mark_mdsmon_event(__func__);
455 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
456 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
457
458 // check privileges, ignore message if fails
459 MonSession *session = m->get_session();
460 if (!session)
461 goto done;
462 if (!session->is_capable("mds", MON_CAP_X)) {
463 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
464 << session->caps << dendl;
465 goto done;
466 }
467
468 if (fsmap.gid_exists(m->global_id) &&
469 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
470 goto done;
471
472 return false;
473
474 done:
475 return true;
476 }
477
478
479 bool MDSMonitor::prepare_update(MonOpRequestRef op)
480 {
481 op->mark_mdsmon_event(__func__);
482 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
483 dout(7) << "prepare_update " << *m << dendl;
484
485 switch (m->get_type()) {
486
487 case MSG_MDS_BEACON:
488 return prepare_beacon(op);
489
490 case MSG_MON_COMMAND:
491 return prepare_command(op);
492
493 case MSG_MDS_OFFLOAD_TARGETS:
494 return prepare_offload_targets(op);
495
496 default:
497 ceph_abort();
498 }
499
500 return true;
501 }
502
503 bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
504 {
505 op->mark_mdsmon_event(__func__);
506 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
507 // -- this is an update --
508 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source_inst() << dendl;
509 entity_addr_t addr = m->get_orig_source_inst().addr;
510 mds_gid_t gid = m->get_global_id();
511 MDSMap::DaemonState state = m->get_state();
512 version_t seq = m->get_seq();
513
514 dout(20) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
515
516 // Calculate deltas of health metrics created and removed
517 // Do this by type rather than MDSHealthMetric equality, because messages can
518 // change a lot when they include e.g. a number of items.
519 const auto &old_health = pending_daemon_health[gid].metrics;
520 const auto &new_health = m->get_health().metrics;
521
522 std::set<mds_metric_t> old_types;
523 for (const auto &i : old_health) {
524 old_types.insert(i.type);
525 }
526
527 std::set<mds_metric_t> new_types;
528 for (const auto &i : new_health) {
529 new_types.insert(i.type);
530 }
531
532 for (const auto &new_metric: new_health) {
533 if (old_types.count(new_metric.type) == 0) {
534 std::stringstream msg;
535 msg << "MDS health message (" << m->get_orig_source_inst().name << "): "
536 << new_metric.message;
537 if (new_metric.sev == HEALTH_ERR) {
538 mon->clog->error() << msg.str();
539 } else if (new_metric.sev == HEALTH_WARN) {
540 mon->clog->warn() << msg.str();
541 } else {
542 mon->clog->info() << msg.str();
543 }
544 }
545 }
546
547 // Log the disappearance of health messages at INFO
548 for (const auto &old_metric : old_health) {
549 if (new_types.count(old_metric.type) == 0) {
550 mon->clog->info() << "MDS health message cleared ("
551 << m->get_orig_source_inst().name << "): " << old_metric.message;
552 }
553 }
554
555 // Store health
556 pending_daemon_health[gid] = m->get_health();
557
558 // boot?
559 if (state == MDSMap::STATE_BOOT) {
560 // zap previous instance of this name?
561 if (g_conf->mds_enforce_unique_name) {
562 bool failed_mds = false;
563 while (mds_gid_t existing = pending_fsmap.find_mds_gid_by_name(m->get_name())) {
564 if (!mon->osdmon()->is_writeable()) {
565 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
566 return false;
567 }
568 mon->clog->info() << "MDS daemon '" << m->get_name() << "' restarted";
569 fail_mds_gid(existing);
570 failed_mds = true;
571 }
572 if (failed_mds) {
573 assert(mon->osdmon()->is_writeable());
574 request_proposal(mon->osdmon());
575 }
576 }
577
578 // Add this daemon to the map
579 if (pending_fsmap.mds_roles.count(gid) == 0) {
580 MDSMap::mds_info_t new_info;
581 new_info.global_id = gid;
582 new_info.name = m->get_name();
583 new_info.addr = addr;
584 new_info.mds_features = m->get_mds_features();
585 new_info.state = MDSMap::STATE_STANDBY;
586 new_info.state_seq = seq;
587 new_info.standby_for_rank = m->get_standby_for_rank();
588 new_info.standby_for_name = m->get_standby_for_name();
589 new_info.standby_for_fscid = m->get_standby_for_fscid();
590 new_info.standby_replay = m->get_standby_replay();
591 pending_fsmap.insert(new_info);
592 }
593
594 // Resolve standby_for_name to a rank
595 const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
596 if (!info.standby_for_name.empty()) {
597 const MDSMap::mds_info_t *leaderinfo = fsmap.find_by_name(
598 info.standby_for_name);
599 if (leaderinfo && (leaderinfo->rank >= 0)) {
600 auto fscid = pending_fsmap.mds_roles.at(leaderinfo->global_id);
601 auto fs = pending_fsmap.get_filesystem(fscid);
602
603 pending_fsmap.modify_daemon(gid, [fscid, leaderinfo](
604 MDSMap::mds_info_t *info) {
605 info->standby_for_rank = leaderinfo->rank;
606 info->standby_for_fscid = fscid;
607 });
608 }
609 }
610
611 // initialize the beacon timer
612 last_beacon[gid].stamp = ceph_clock_now();
613 last_beacon[gid].seq = seq;
614
615 // new incompat?
616 if (!pending_fsmap.compat.writeable(m->get_compat())) {
617 dout(10) << " fsmap " << pending_fsmap.compat
618 << " can't write to new mds' " << m->get_compat()
619 << ", updating fsmap and killing old mds's"
620 << dendl;
621 pending_fsmap.update_compat(m->get_compat());
622 }
623
624 update_metadata(m->get_global_id(), m->get_sys_info());
625 } else {
626 // state update
627 const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
628 // Old MDS daemons don't mention that they're standby replay until
629 // after they've sent their boot beacon, so update this field.
630 if (info.standby_replay != m->get_standby_replay()) {
631 pending_fsmap.modify_daemon(info.global_id, [&m](
632 MDSMap::mds_info_t *i)
633 {
634 i->standby_replay = m->get_standby_replay();
635 });
636 }
637
638 if (info.state == MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPED ) {
639 // we can't transition to any other states from STOPPING
640 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
641 << dendl;
642 _note_beacon(m);
643 return true;
644 }
645
646 if (info.laggy()) {
647 dout(10) << "prepare_beacon clearing laggy flag on " << addr << dendl;
648 pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
649 {
650 info->clear_laggy();
651 }
652 );
653 }
654
655 dout(10) << "prepare_beacon mds." << info.rank
656 << " " << ceph_mds_state_name(info.state)
657 << " -> " << ceph_mds_state_name(state)
658 << " standby_for_rank=" << m->get_standby_for_rank()
659 << dendl;
660 if (state == MDSMap::STATE_STOPPED) {
661 auto erased = pending_fsmap.stop(gid);
662 erased.push_back(gid);
663
664 for (const auto &erased_gid : erased) {
665 last_beacon.erase(erased_gid);
666 if (pending_daemon_health.count(erased_gid)) {
667 pending_daemon_health.erase(erased_gid);
668 pending_daemon_health_rm.insert(erased_gid);
669 }
670 }
671 } else if (state == MDSMap::STATE_DAMAGED) {
672 if (!mon->osdmon()->is_writeable()) {
673 dout(4) << __func__ << ": DAMAGED from rank " << info.rank
674 << " waiting for osdmon writeable to blacklist it" << dendl;
675 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
676 return false;
677 }
678
679 // Record this MDS rank as damaged, so that other daemons
680 // won't try to run it.
681 dout(4) << __func__ << ": marking rank "
682 << info.rank << " damaged" << dendl;
683
684 utime_t until = ceph_clock_now();
685 until += g_conf->mds_blacklist_interval;
686 const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
687 request_proposal(mon->osdmon());
688 pending_fsmap.damaged(gid, blacklist_epoch);
689 last_beacon.erase(gid);
690
691 // Respond to MDS, so that it knows it can continue to shut down
692 mon->send_reply(op,
693 new MMDSBeacon(
694 mon->monmap->fsid, m->get_global_id(),
695 m->get_name(), fsmap.get_epoch(), state, seq,
696 CEPH_FEATURES_SUPPORTED_DEFAULT));
697 } else if (state == MDSMap::STATE_DNE) {
698 if (!mon->osdmon()->is_writeable()) {
699 dout(4) << __func__ << ": DNE from rank " << info.rank
700 << " waiting for osdmon writeable to blacklist it" << dendl;
701 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
702 return false;
703 }
704
705 fail_mds_gid(gid);
706 assert(mon->osdmon()->is_writeable());
707 request_proposal(mon->osdmon());
708
709 // Respond to MDS, so that it knows it can continue to shut down
710 mon->send_reply(op,
711 new MMDSBeacon(
712 mon->monmap->fsid, m->get_global_id(),
713 m->get_name(), fsmap.get_epoch(), state, seq,
714 CEPH_FEATURES_SUPPORTED_DEFAULT));
715 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
716 // Standby daemons should never modify their own
717 // state. Reject any attempts to do so.
718 derr << "standby " << gid << " attempted to change state to "
719 << ceph_mds_state_name(state) << ", rejecting" << dendl;
720 return true;
721 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
722 !MDSMap::state_transition_valid(info.state, state)) {
723 // Validate state transitions for daemons that hold a rank
724 derr << "daemon " << gid << " (rank " << info.rank << ") "
725 << "reported invalid state transition "
726 << ceph_mds_state_name(info.state) << " -> "
727 << ceph_mds_state_name(state) << dendl;
728 return true;
729 } else {
730 // Made it through special cases and validations, record the
731 // daemon's reported state to the FSMap.
732 pending_fsmap.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
733 info->state = state;
734 info->state_seq = seq;
735 });
736 }
737 }
738
739 dout(7) << "prepare_beacon pending map now:" << dendl;
740 print_map(pending_fsmap);
741
742 wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
743 if (r >= 0)
744 _updated(op); // success
745 else if (r == -ECANCELED) {
746 mon->no_reply(op);
747 } else {
748 dispatch(op); // try again
749 }
750 }));
751
752 return true;
753 }
754
755 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
756 {
757 op->mark_mdsmon_event(__func__);
758 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
759 mds_gid_t gid = m->global_id;
760 if (pending_fsmap.gid_has_rank(gid)) {
761 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
762 pending_fsmap.update_export_targets(gid, m->targets);
763 } else {
764 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
765 }
766 return true;
767 }
768
769 bool MDSMonitor::should_propose(double& delay)
770 {
771 // delegate to PaxosService to assess whether we should propose
772 return PaxosService::should_propose(delay);
773 }
774
775 void MDSMonitor::_updated(MonOpRequestRef op)
776 {
777 op->mark_mdsmon_event(__func__);
778 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
779 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
780 mon->clog->info() << m->get_orig_source_inst() << " "
781 << ceph_mds_state_name(m->get_state());
782
783 if (m->get_state() == MDSMap::STATE_STOPPED) {
784 // send the map manually (they're out of the map, so they won't get it automatic)
785 MDSMap null_map;
786 null_map.epoch = fsmap.epoch;
787 null_map.compat = fsmap.compat;
788 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
789 } else {
790 mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid,
791 m->get_global_id(),
792 m->get_name(),
793 fsmap.get_epoch(),
794 m->get_state(),
795 m->get_seq(),
796 CEPH_FEATURES_SUPPORTED_DEFAULT));
797 }
798 }
799
800 void MDSMonitor::on_active()
801 {
802 tick();
803 update_logger();
804
805 if (mon->is_leader()) {
806 mon->clog->debug() << "fsmap " << fsmap;
807 }
808 }
809
810 void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
811 list<pair<health_status_t, string> > *detail,
812 CephContext* cct) const
813 {
814 fsmap.get_health(summary, detail);
815
816 // For each MDS GID...
817 const auto info_map = fsmap.get_mds_info();
818 for (const auto &i : info_map) {
819 const auto &gid = i.first;
820 const auto &info = i.second;
821
822 // Decode MDSHealth
823 bufferlist bl;
824 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
825 if (!bl.length()) {
826 derr << "Missing health data for MDS " << gid << dendl;
827 continue;
828 }
829 MDSHealth health;
830 bufferlist::iterator bl_i = bl.begin();
831 health.decode(bl_i);
832
833 for (const auto &metric : health.metrics) {
834 int const rank = info.rank;
835 std::ostringstream message;
836 message << "mds" << rank << ": " << metric.message;
837 summary.push_back(std::make_pair(metric.sev, message.str()));
838
839 if (detail) {
840 // There is no way for us to clealy associate detail entries with summary entries (#7192), so
841 // we duplicate the summary message in the detail string and tag the metadata on.
842 std::ostringstream detail_message;
843 detail_message << message.str();
844 if (metric.metadata.size()) {
845 detail_message << "(";
846 auto k = metric.metadata.begin();
847 while (k != metric.metadata.end()) {
848 detail_message << k->first << ": " << k->second;
849 if (boost::next(k) != metric.metadata.end()) {
850 detail_message << ", ";
851 }
852 ++k;
853 }
854 detail_message << ")";
855 }
856 detail->push_back(std::make_pair(metric.sev, detail_message.str()));
857 }
858 }
859 }
860 }
861
862 void MDSMonitor::dump_info(Formatter *f)
863 {
864 f->open_object_section("fsmap");
865 fsmap.dump(f);
866 f->close_section();
867
868 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
869 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
870 }
871
872 bool MDSMonitor::preprocess_command(MonOpRequestRef op)
873 {
874 op->mark_mdsmon_event(__func__);
875 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
876 int r = -1;
877 bufferlist rdata;
878 stringstream ss, ds;
879
880 map<string, cmd_vartype> cmdmap;
881 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
882 // ss has reason for failure
883 string rs = ss.str();
884 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
885 return true;
886 }
887
888 string prefix;
889 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
890 string format;
891 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
892 boost::scoped_ptr<Formatter> f(Formatter::create(format));
893
894 MonSession *session = m->get_session();
895 if (!session) {
896 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
897 return true;
898 }
899
900 if (prefix == "mds stat") {
901 if (f) {
902 f->open_object_section("mds_stat");
903 dump_info(f.get());
904 f->close_section();
905 f->flush(ds);
906 } else {
907 ds << fsmap;
908 }
909 r = 0;
910 } else if (prefix == "mds dump") {
911 int64_t epocharg;
912 epoch_t epoch;
913
914 FSMap *p = &fsmap;
915 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
916 epoch = epocharg;
917 bufferlist b;
918 int err = get_version(epoch, b);
919 if (err == -ENOENT) {
920 p = 0;
921 r = -ENOENT;
922 } else {
923 assert(err == 0);
924 assert(b.length());
925 p = new FSMap;
926 p->decode(b);
927 }
928 }
929 if (p) {
930 stringstream ds;
931 const MDSMap *mdsmap = nullptr;
932 MDSMap blank;
933 blank.epoch = fsmap.epoch;
934 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
935 mdsmap = &(fsmap.filesystems[fsmap.legacy_client_fscid]->mds_map);
936 } else {
937 mdsmap = &blank;
938 }
939 if (f != NULL) {
940 f->open_object_section("mdsmap");
941 mdsmap->dump(f.get());
942 f->close_section();
943 f->flush(ds);
944 r = 0;
945 } else {
946 mdsmap->print(ds);
947 r = 0;
948 }
949
950 rdata.append(ds);
951 ss << "dumped fsmap epoch " << p->get_epoch();
952
953 if (p != &fsmap) {
954 delete p;
955 }
956 }
957 } else if (prefix == "fs dump") {
958 int64_t epocharg;
959 epoch_t epoch;
960
961 FSMap *p = &fsmap;
962 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
963 epoch = epocharg;
964 bufferlist b;
965 int err = get_version(epoch, b);
966 if (err == -ENOENT) {
967 p = 0;
968 r = -ENOENT;
969 } else {
970 assert(err == 0);
971 assert(b.length());
972 p = new FSMap;
973 p->decode(b);
974 }
975 }
976 if (p) {
977 stringstream ds;
978 if (f != NULL) {
979 f->open_object_section("fsmap");
980 p->dump(f.get());
981 f->close_section();
982 f->flush(ds);
983 r = 0;
984 } else {
985 p->print(ds);
986 r = 0;
987 }
988
989 rdata.append(ds);
990 ss << "dumped fsmap epoch " << p->get_epoch();
991
992 if (p != &fsmap)
993 delete p;
994 }
995 } else if (prefix == "mds metadata") {
996 if (!f)
997 f.reset(Formatter::create("json-pretty"));
998
999 string who;
1000 bool all = !cmd_getval(g_ceph_context, cmdmap, "who", who);
1001 dout(1) << "all = " << all << dendl;
1002 if (all) {
1003 r = 0;
1004 // Dump all MDSs' metadata
1005 const auto all_info = fsmap.get_mds_info();
1006
1007 f->open_array_section("mds_metadata");
1008 for(const auto &i : all_info) {
1009 const auto &info = i.second;
1010
1011 f->open_object_section("mds");
1012 f->dump_string("name", info.name);
1013 std::ostringstream get_err;
1014 r = dump_metadata(info.name, f.get(), get_err);
1015 if (r == -EINVAL || r == -ENOENT) {
1016 // Drop error, list what metadata we do have
1017 dout(1) << get_err.str() << dendl;
1018 r = 0;
1019 } else if (r != 0) {
1020 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1021 << dendl;
1022 ss << get_err.str();
1023 f->close_section();
1024 break;
1025 }
1026 f->close_section();
1027 }
1028 f->close_section();
1029 } else {
1030 // Dump a single daemon's metadata
1031 f->open_object_section("mds_metadata");
1032 r = dump_metadata(who, f.get(), ss);
1033 f->close_section();
1034 }
1035 f->flush(ds);
1036 } else if (prefix == "mds versions") {
1037 if (!f)
1038 f.reset(Formatter::create("json-pretty"));
1039 count_metadata("ceph_version", f.get());
1040 f->flush(ds);
1041 r = 0;
1042 } else if (prefix == "mds count-metadata") {
1043 if (!f)
1044 f.reset(Formatter::create("json-pretty"));
1045 string field;
1046 cmd_getval(g_ceph_context, cmdmap, "property", field);
1047 count_metadata(field, f.get());
1048 f->flush(ds);
1049 r = 0;
1050 } else if (prefix == "mds getmap") {
1051 epoch_t e;
1052 int64_t epocharg;
1053 bufferlist b;
1054 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
1055 e = epocharg;
1056 int err = get_version(e, b);
1057 if (err == -ENOENT) {
1058 r = -ENOENT;
1059 } else {
1060 assert(err == 0);
1061 assert(b.length());
1062 FSMap mm;
1063 mm.decode(b);
1064 mm.encode(rdata, m->get_connection()->get_features());
1065 ss << "got fsmap epoch " << mm.get_epoch();
1066 r = 0;
1067 }
1068 } else {
1069 fsmap.encode(rdata, m->get_connection()->get_features());
1070 ss << "got fsmap epoch " << fsmap.get_epoch();
1071 r = 0;
1072 }
1073 } else if (prefix == "mds compat show") {
1074 if (f) {
1075 f->open_object_section("mds_compat");
1076 fsmap.compat.dump(f.get());
1077 f->close_section();
1078 f->flush(ds);
1079 } else {
1080 ds << fsmap.compat;
1081 }
1082 r = 0;
1083 } else if (prefix == "fs get") {
1084 string fs_name;
1085 cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
1086 auto fs = fsmap.get_filesystem(fs_name);
1087 if (fs == nullptr) {
1088 ss << "filesystem '" << fs_name << "' not found";
1089 r = -ENOENT;
1090 } else {
1091 if (f != nullptr) {
1092 f->open_object_section("filesystem");
1093 fs->dump(f.get());
1094 f->close_section();
1095 f->flush(ds);
1096 r = 0;
1097 } else {
1098 fs->print(ds);
1099 r = 0;
1100 }
1101 }
1102 } else if (prefix == "fs ls") {
1103 if (f) {
1104 f->open_array_section("filesystems");
1105 {
1106 for (const auto i : fsmap.filesystems) {
1107 const auto fs = i.second;
1108 f->open_object_section("filesystem");
1109 {
1110 const MDSMap &mds_map = fs->mds_map;
1111 f->dump_string("name", mds_map.fs_name);
1112 /* Output both the names and IDs of pools, for use by
1113 * humans and machines respectively */
1114 f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
1115 mds_map.metadata_pool));
1116 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1117 f->open_array_section("data_pool_ids");
1118 {
1119 for (auto dpi = mds_map.data_pools.begin();
1120 dpi != mds_map.data_pools.end(); ++dpi) {
1121 f->dump_int("data_pool_id", *dpi);
1122 }
1123 }
1124 f->close_section();
1125
1126 f->open_array_section("data_pools");
1127 {
1128 for (auto dpi = mds_map.data_pools.begin();
1129 dpi != mds_map.data_pools.end(); ++dpi) {
1130 const auto &name = mon->osdmon()->osdmap.get_pool_name(
1131 *dpi);
1132 f->dump_string("data_pool", name);
1133 }
1134 }
1135
1136 f->close_section();
1137 }
1138 f->close_section();
1139 }
1140 }
1141 f->close_section();
1142 f->flush(ds);
1143 } else {
1144 for (const auto i : fsmap.filesystems) {
1145 const auto fs = i.second;
1146 const MDSMap &mds_map = fs->mds_map;
1147 const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
1148 mds_map.metadata_pool);
1149
1150 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1151 << md_pool_name << ", data pools: [";
1152 for (auto dpi : mds_map.data_pools) {
1153 const string &pool_name = mon->osdmon()->osdmap.get_pool_name(dpi);
1154 ds << pool_name << " ";
1155 }
1156 ds << "]" << std::endl;
1157 }
1158
1159 if (fsmap.filesystems.empty()) {
1160 ds << "No filesystems enabled" << std::endl;
1161 }
1162 }
1163 r = 0;
1164 }
1165
1166 if (r != -1) {
1167 rdata.append(ds);
1168 string rs;
1169 getline(ss, rs);
1170 mon->reply_command(op, r, rs, rdata, get_last_committed());
1171 return true;
1172 } else
1173 return false;
1174 }
1175
1176 bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
1177 {
1178 const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
1179 dout(10) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
1180
1181 epoch_t blacklist_epoch = 0;
1182 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1183 utime_t until = ceph_clock_now();
1184 until += g_conf->mds_blacklist_interval;
1185 blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
1186 }
1187
1188 pending_fsmap.erase(gid, blacklist_epoch);
1189 last_beacon.erase(gid);
1190 if (pending_daemon_health.count(gid)) {
1191 pending_daemon_health.erase(gid);
1192 pending_daemon_health_rm.insert(gid);
1193 }
1194
1195 return blacklist_epoch != 0;
1196 }
1197
1198 mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
1199 {
1200 const FSMap *relevant_fsmap = mon->is_leader() ? &pending_fsmap : &fsmap;
1201
1202 // Try parsing as a role
1203 mds_role_t role;
1204 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1205 int r = parse_role(arg, &role, ignore_err);
1206 if (r == 0) {
1207 // See if a GID is assigned to this role
1208 auto fs = relevant_fsmap->get_filesystem(role.fscid);
1209 assert(fs != nullptr); // parse_role ensures it exists
1210 if (fs->mds_map.is_up(role.rank)) {
1211 dout(10) << __func__ << ": validated rank/GID " << role
1212 << " as a rank" << dendl;
1213 return fs->mds_map.get_mds_info(role.rank).global_id;
1214 }
1215 }
1216
1217 // Try parsing as a gid
1218 std::string err;
1219 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1220 if (!err.empty()) {
1221 // Not a role or a GID, try as a daemon name
1222 const MDSMap::mds_info_t *mds_info = relevant_fsmap->find_by_name(arg);
1223 if (!mds_info) {
1224 ss << "MDS named '" << arg
1225 << "' does not exist, or is not up";
1226 return MDS_GID_NONE;
1227 }
1228 dout(10) << __func__ << ": resolved MDS name '" << arg
1229 << "' to GID " << mds_info->global_id << dendl;
1230 return mds_info->global_id;
1231 } else {
1232 // Not a role, but parses as a an integer, might be a GID
1233 dout(10) << __func__ << ": treating MDS reference '" << arg
1234 << "' as an integer " << maybe_gid << dendl;
1235
1236 if (relevant_fsmap->gid_exists(mds_gid_t(maybe_gid))) {
1237 return mds_gid_t(maybe_gid);
1238 }
1239 }
1240
1241 dout(1) << __func__ << ": rank/GID " << arg
1242 << " not a existent rank or GID" << dendl;
1243 return MDS_GID_NONE;
1244 }
1245
1246 int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
1247 {
1248 mds_gid_t gid = gid_from_arg(arg, ss);
1249 if (gid == MDS_GID_NONE) {
1250 return 0;
1251 }
1252 if (!mon->osdmon()->is_writeable()) {
1253 return -EAGAIN;
1254 }
1255 fail_mds_gid(gid);
1256 ss << "failed mds gid " << gid;
1257 assert(mon->osdmon()->is_writeable());
1258 request_proposal(mon->osdmon());
1259 return 0;
1260 }
1261
1262 bool MDSMonitor::prepare_command(MonOpRequestRef op)
1263 {
1264 op->mark_mdsmon_event(__func__);
1265 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
1266 int r = -EINVAL;
1267 stringstream ss;
1268 bufferlist rdata;
1269
1270 map<string, cmd_vartype> cmdmap;
1271 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1272 string rs = ss.str();
1273 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1274 return true;
1275 }
1276
1277 string prefix;
1278 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
1279
1280 /* Refuse access if message not associated with a valid session */
1281 MonSession *session = m->get_session();
1282 if (!session) {
1283 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1284 return true;
1285 }
1286
1287 bool batched_propose = false;
1288 for (auto h : handlers) {
1289 if (h->can_handle(prefix)) {
1290 batched_propose = h->batched_propose();
1291 if (batched_propose) {
1292 paxos->plug();
1293 }
1294 r = h->handle(mon, pending_fsmap, op, cmdmap, ss);
1295 if (batched_propose) {
1296 paxos->unplug();
1297 }
1298
1299 if (r == -EAGAIN) {
1300 // message has been enqueued for retry; return.
1301 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1302 return false;
1303 } else {
1304 if (r == 0) {
1305 // On successful updates, print the updated map
1306 print_map(pending_fsmap);
1307 }
1308 // Successful or not, we're done: respond.
1309 goto out;
1310 }
1311 }
1312 }
1313
1314 r = filesystem_command(op, prefix, cmdmap, ss);
1315 if (r >= 0) {
1316 goto out;
1317 } else if (r == -EAGAIN) {
1318 // Do not reply, the message has been enqueued for retry
1319 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1320 return false;
1321 } else if (r != -ENOSYS) {
1322 goto out;
1323 }
1324
1325 // Only handle legacy commands if there is a filesystem configured
1326 if (pending_fsmap.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1327 if (pending_fsmap.filesystems.size() == 0) {
1328 ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
1329 } else {
1330 ss << "No filesystem set for use with legacy commands";
1331 }
1332 r = -EINVAL;
1333 goto out;
1334 }
1335
1336 r = legacy_filesystem_command(op, prefix, cmdmap, ss);
1337
1338 if (r == -ENOSYS && ss.str().empty()) {
1339 ss << "unrecognized command";
1340 }
1341
1342 out:
1343 dout(4) << __func__ << " done, r=" << r << dendl;
1344 /* Compose response */
1345 string rs;
1346 getline(ss, rs);
1347
1348 if (r >= 0) {
1349 // success.. delay reply
1350 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1351 get_last_committed() + 1));
1352 if (batched_propose) {
1353 force_immediate_propose();
1354 }
1355 return true;
1356 } else {
1357 // reply immediately
1358 mon->reply_command(op, r, rs, rdata, get_last_committed());
1359 return false;
1360 }
1361 }
1362
1363
1364 /**
1365 * Given one of the following forms:
1366 * <fs name>:<rank>
1367 * <fs id>:<rank>
1368 * <rank>
1369 *
1370 * Parse into a mds_role_t. The rank-only form is only valid
1371 * if legacy_client_ns is set.
1372 */
1373 int MDSMonitor::parse_role(
1374 const std::string &role_str,
1375 mds_role_t *role,
1376 std::ostream &ss)
1377 {
1378 const FSMap *relevant_fsmap = &fsmap;
1379 if (mon->is_leader()) {
1380 relevant_fsmap = &pending_fsmap;
1381 }
1382 return relevant_fsmap->parse_role(role_str, role, ss);
1383 }
1384
1385 int MDSMonitor::filesystem_command(
1386 MonOpRequestRef op,
1387 std::string const &prefix,
1388 map<string, cmd_vartype> &cmdmap,
1389 std::stringstream &ss)
1390 {
1391 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1392 op->mark_mdsmon_event(__func__);
1393 int r = 0;
1394 string whostr;
1395 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1396
1397 if (prefix == "mds stop" ||
1398 prefix == "mds deactivate") {
1399
1400 mds_role_t role;
1401 r = parse_role(whostr, &role, ss);
1402 if (r < 0 ) {
1403 return r;
1404 }
1405 auto fs = pending_fsmap.get_filesystem(role.fscid);
1406
1407 if (!fs->mds_map.is_active(role.rank)) {
1408 r = -EEXIST;
1409 ss << "mds." << role << " not active ("
1410 << ceph_mds_state_name(fs->mds_map.get_state(role.rank)) << ")";
1411 } else if (fs->mds_map.get_root() == role.rank ||
1412 fs->mds_map.get_tableserver() == role.rank) {
1413 r = -EINVAL;
1414 ss << "can't tell the root (" << fs->mds_map.get_root()
1415 << ") or tableserver (" << fs->mds_map.get_tableserver()
1416 << ") to deactivate";
1417 } else if (role.rank != fs->mds_map.get_last_in_mds()) {
1418 r = -EINVAL;
1419 ss << "mds." << role << " doesn't have the max rank ("
1420 << fs->mds_map.get_last_in_mds() << ")";
1421 } else if (fs->mds_map.get_num_in_mds() <= size_t(fs->mds_map.get_max_mds())) {
1422 r = -EBUSY;
1423 ss << "must decrease max_mds or else MDS will immediately reactivate";
1424 } else {
1425 r = 0;
1426 mds_gid_t gid = fs->mds_map.up.at(role.rank);
1427 ss << "telling mds." << role << " "
1428 << pending_fsmap.get_info_gid(gid).addr << " to deactivate";
1429
1430 pending_fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
1431 info->state = MDSMap::STATE_STOPPING;
1432 });
1433 }
1434 } else if (prefix == "mds set_state") {
1435 mds_gid_t gid;
1436 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1437 ss << "error parsing 'gid' value '"
1438 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1439 return -EINVAL;
1440 }
1441 MDSMap::DaemonState state;
1442 if (!cmd_getval(g_ceph_context, cmdmap, "state", state)) {
1443 ss << "error parsing 'state' string value '"
1444 << cmd_vartype_stringify(cmdmap["state"]) << "'";
1445 return -EINVAL;
1446 }
1447 if (pending_fsmap.gid_exists(gid)) {
1448 pending_fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
1449 info->state = state;
1450 });
1451 ss << "set mds gid " << gid << " to state " << state << " "
1452 << ceph_mds_state_name(state);
1453 return 0;
1454 }
1455 } else if (prefix == "mds fail") {
1456 string who;
1457 cmd_getval(g_ceph_context, cmdmap, "who", who);
1458 r = fail_mds(ss, who);
1459 if (r < 0 && r == -EAGAIN) {
1460 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1461 return -EAGAIN; // don't propose yet; wait for message to be retried
1462 }
1463 } else if (prefix == "mds rm") {
1464 mds_gid_t gid;
1465 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1466 ss << "error parsing 'gid' value '"
1467 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1468 return -EINVAL;
1469 }
1470 if (!pending_fsmap.gid_exists(gid)) {
1471 ss << "mds gid " << gid << " dne";
1472 r = 0;
1473 } else {
1474 MDSMap::DaemonState state = pending_fsmap.get_info_gid(gid).state;
1475 if (state > 0) {
1476 ss << "cannot remove active mds." << pending_fsmap.get_info_gid(gid).name
1477 << " rank " << pending_fsmap.get_info_gid(gid).rank;
1478 return -EBUSY;
1479 } else {
1480 pending_fsmap.erase(gid, {});
1481 ss << "removed mds gid " << gid;
1482 return 0;
1483 }
1484 }
1485 } else if (prefix == "mds rmfailed") {
1486 string confirm;
1487 if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
1488 confirm != "--yes-i-really-mean-it") {
1489 ss << "WARNING: this can make your filesystem inaccessible! "
1490 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1491 return -EPERM;
1492 }
1493
1494 std::string role_str;
1495 cmd_getval(g_ceph_context, cmdmap, "who", role_str);
1496 mds_role_t role;
1497 int r = parse_role(role_str, &role, ss);
1498 if (r < 0) {
1499 ss << "invalid role '" << role_str << "'";
1500 return -EINVAL;
1501 }
1502
1503 pending_fsmap.modify_filesystem(
1504 role.fscid,
1505 [role](std::shared_ptr<Filesystem> fs)
1506 {
1507 fs->mds_map.failed.erase(role.rank);
1508 });
1509
1510 ss << "removed failed mds." << role;
1511 return 0;
1512 } else if (prefix == "mds compat rm_compat") {
1513 int64_t f;
1514 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1515 ss << "error parsing feature value '"
1516 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1517 return -EINVAL;
1518 }
1519 if (pending_fsmap.compat.compat.contains(f)) {
1520 ss << "removing compat feature " << f;
1521 CompatSet modified = pending_fsmap.compat;
1522 modified.compat.remove(f);
1523 pending_fsmap.update_compat(modified);
1524 } else {
1525 ss << "compat feature " << f << " not present in " << pending_fsmap.compat;
1526 }
1527 r = 0;
1528 } else if (prefix == "mds compat rm_incompat") {
1529 int64_t f;
1530 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1531 ss << "error parsing feature value '"
1532 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1533 return -EINVAL;
1534 }
1535 if (pending_fsmap.compat.incompat.contains(f)) {
1536 ss << "removing incompat feature " << f;
1537 CompatSet modified = pending_fsmap.compat;
1538 modified.incompat.remove(f);
1539 pending_fsmap.update_compat(modified);
1540 } else {
1541 ss << "incompat feature " << f << " not present in " << pending_fsmap.compat;
1542 }
1543 r = 0;
1544 } else if (prefix == "mds repaired") {
1545 std::string role_str;
1546 cmd_getval(g_ceph_context, cmdmap, "rank", role_str);
1547 mds_role_t role;
1548 r = parse_role(role_str, &role, ss);
1549 if (r < 0) {
1550 return r;
1551 }
1552
1553 bool modified = pending_fsmap.undamaged(role.fscid, role.rank);
1554 if (modified) {
1555 dout(4) << "repaired: restoring rank " << role << dendl;
1556 } else {
1557 dout(4) << "repaired: no-op on rank " << role << dendl;
1558 }
1559
1560 r = 0;
1561 } else {
1562 return -ENOSYS;
1563 }
1564
1565 return r;
1566 }
1567
1568 /**
1569 * Helper to legacy_filesystem_command
1570 */
1571 void MDSMonitor::modify_legacy_filesystem(
1572 std::function<void(std::shared_ptr<Filesystem> )> fn)
1573 {
1574 pending_fsmap.modify_filesystem(
1575 pending_fsmap.legacy_client_fscid,
1576 fn
1577 );
1578 }
1579
1580
1581
1582 /**
1583 * Handle a command that affects the filesystem (i.e. a filesystem
1584 * must exist for the command to act upon).
1585 *
1586 * @retval 0 Command was successfully handled and has side effects
1587 * @retval -EAGAIN Messages has been requeued for retry
1588 * @retval -ENOSYS Unknown command
1589 * @retval < 0 An error has occurred; **ss** may have been set.
1590 */
1591 int MDSMonitor::legacy_filesystem_command(
1592 MonOpRequestRef op,
1593 std::string const &prefix,
1594 map<string, cmd_vartype> &cmdmap,
1595 std::stringstream &ss)
1596 {
1597 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1598 op->mark_mdsmon_event(__func__);
1599 int r = 0;
1600 string whostr;
1601 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1602
1603 assert (pending_fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
1604
1605 if (prefix == "mds set_max_mds") {
1606 // NOTE: deprecated by "fs set max_mds"
1607 int64_t maxmds;
1608 if (!cmd_getval(g_ceph_context, cmdmap, "maxmds", maxmds) || maxmds <= 0) {
1609 return -EINVAL;
1610 }
1611
1612 const MDSMap& mdsmap =
1613 pending_fsmap.filesystems.at(pending_fsmap.legacy_client_fscid)->mds_map;
1614
1615 if (!mdsmap.allows_multimds() &&
1616 maxmds > mdsmap.get_max_mds() &&
1617 maxmds > 1) {
1618 ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
1619 return -EINVAL;
1620 }
1621
1622 if (maxmds > MAX_MDS) {
1623 ss << "may not have more than " << MAX_MDS << " MDS ranks";
1624 return -EINVAL;
1625 }
1626
1627 modify_legacy_filesystem(
1628 [maxmds](std::shared_ptr<Filesystem> fs)
1629 {
1630 fs->mds_map.set_max_mds(maxmds);
1631 });
1632
1633 r = 0;
1634 ss << "max_mds = " << maxmds;
1635 } else if (prefix == "mds cluster_down") {
1636 // NOTE: deprecated by "fs set cluster_down"
1637 modify_legacy_filesystem(
1638 [](std::shared_ptr<Filesystem> fs)
1639 {
1640 fs->mds_map.set_flag(CEPH_MDSMAP_DOWN);
1641 });
1642 ss << "marked fsmap DOWN";
1643 r = 0;
1644 } else if (prefix == "mds cluster_up") {
1645 // NOTE: deprecated by "fs set cluster_up"
1646 modify_legacy_filesystem(
1647 [](std::shared_ptr<Filesystem> fs)
1648 {
1649 fs->mds_map.clear_flag(CEPH_MDSMAP_DOWN);
1650 });
1651 ss << "unmarked fsmap DOWN";
1652 r = 0;
1653 } else {
1654 return -ENOSYS;
1655 }
1656
1657 return r;
1658 }
1659
1660
1661 void MDSMonitor::check_subs()
1662 {
1663 std::list<std::string> types;
1664
1665 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1666 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1667 // filesystems. Build a list of all the types we service
1668 // subscriptions for.
1669 types.push_back("fsmap");
1670 types.push_back("fsmap.user");
1671 types.push_back("mdsmap");
1672 for (const auto &i : fsmap.filesystems) {
1673 auto fscid = i.first;
1674 std::ostringstream oss;
1675 oss << "mdsmap." << fscid;
1676 types.push_back(oss.str());
1677 }
1678
1679 for (const auto &type : types) {
1680 if (mon->session_map.subs.count(type) == 0)
1681 continue;
1682 xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
1683 while (!p.end()) {
1684 Subscription *sub = *p;
1685 ++p;
1686 check_sub(sub);
1687 }
1688 }
1689 }
1690
1691
1692 void MDSMonitor::check_sub(Subscription *sub)
1693 {
1694 dout(20) << __func__ << ": " << sub->type << dendl;
1695
1696 if (sub->type == "fsmap") {
1697 if (sub->next <= fsmap.get_epoch()) {
1698 sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
1699 if (sub->onetime) {
1700 mon->session_map.remove_sub(sub);
1701 } else {
1702 sub->next = fsmap.get_epoch() + 1;
1703 }
1704 }
1705 } else if (sub->type == "fsmap.user") {
1706 if (sub->next <= fsmap.get_epoch()) {
1707 FSMapUser fsmap_u;
1708 fsmap_u.epoch = fsmap.get_epoch();
1709 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1710 for (auto p = fsmap.filesystems.begin();
1711 p != fsmap.filesystems.end();
1712 ++p) {
1713 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p->first];
1714 fs_info.cid = p->first;
1715 fs_info.name= p->second->mds_map.fs_name;
1716 }
1717 sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
1718 if (sub->onetime) {
1719 mon->session_map.remove_sub(sub);
1720 } else {
1721 sub->next = fsmap.get_epoch() + 1;
1722 }
1723 }
1724 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
1725 if (sub->next > fsmap.get_epoch()) {
1726 return;
1727 }
1728
1729 const bool is_mds = sub->session->inst.name.is_mds();
1730 mds_gid_t mds_gid = MDS_GID_NONE;
1731 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1732 if (is_mds) {
1733 // What (if any) namespace are you assigned to?
1734 auto mds_info = fsmap.get_mds_info();
1735 for (const auto &i : mds_info) {
1736 if (i.second.addr == sub->session->inst.addr) {
1737 mds_gid = i.first;
1738 fscid = fsmap.mds_roles.at(mds_gid);
1739 }
1740 }
1741 } else {
1742 // You're a client. Did you request a particular
1743 // namespace?
1744 if (sub->type.find("mdsmap.") == 0) {
1745 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1746 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1747 std::string err;
1748 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1749 if (!err.empty()) {
1750 // Client asked for a non-existent namespace, send them nothing
1751 dout(1) << "Invalid client subscription '" << sub->type
1752 << "'" << dendl;
1753 return;
1754 }
1755 if (fsmap.filesystems.count(fscid) == 0) {
1756 // Client asked for a non-existent namespace, send them nothing
1757 // TODO: something more graceful for when a client has a filesystem
1758 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1759 // flag to MMDSMap?
1760 dout(1) << "Client subscribed to non-existent namespace '" <<
1761 fscid << "'" << dendl;
1762 return;
1763 }
1764 } else {
1765 // Unqualified request for "mdsmap": give it the one marked
1766 // for use by legacy clients.
1767 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1768 fscid = fsmap.legacy_client_fscid;
1769 } else {
1770 dout(1) << "Client subscribed for legacy filesystem but "
1771 "none is configured" << dendl;
1772 return;
1773 }
1774 }
1775 }
1776 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
1777
1778 // Work out the effective latest epoch
1779 MDSMap *mds_map = nullptr;
1780 MDSMap null_map;
1781 null_map.compat = fsmap.compat;
1782 if (fscid == FS_CLUSTER_ID_NONE) {
1783 // For a client, we should have already dropped out
1784 assert(is_mds);
1785
1786 if (fsmap.standby_daemons.count(mds_gid)) {
1787 // For an MDS, we need to feed it an MDSMap with its own state in
1788 null_map.mds_info[mds_gid] = fsmap.standby_daemons[mds_gid];
1789 null_map.epoch = fsmap.standby_epochs[mds_gid];
1790 } else {
1791 null_map.epoch = fsmap.epoch;
1792 }
1793 mds_map = &null_map;
1794 } else {
1795 // Check the effective epoch
1796 mds_map = &(fsmap.filesystems.at(fscid)->mds_map);
1797 }
1798
1799 assert(mds_map != nullptr);
1800 dout(10) << __func__ << " selected MDS map epoch " <<
1801 mds_map->epoch << " for namespace " << fscid << " for subscriber "
1802 << sub->session->inst.name << " who wants epoch " << sub->next << dendl;
1803
1804 if (sub->next > mds_map->epoch) {
1805 return;
1806 }
1807 auto msg = new MMDSMap(mon->monmap->fsid, mds_map);
1808
1809 sub->session->con->send_message(msg);
1810 if (sub->onetime) {
1811 mon->session_map.remove_sub(sub);
1812 } else {
1813 sub->next = mds_map->get_epoch() + 1;
1814 }
1815 }
1816 }
1817
1818
1819 void MDSMonitor::update_metadata(mds_gid_t gid,
1820 const map<string, string>& metadata)
1821 {
1822 if (metadata.empty()) {
1823 return;
1824 }
1825 pending_metadata[gid] = metadata;
1826
1827 MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
1828 bufferlist bl;
1829 ::encode(pending_metadata, bl);
1830 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1831 paxos->trigger_propose();
1832 }
1833
1834 void MDSMonitor::remove_from_metadata(MonitorDBStore::TransactionRef t)
1835 {
1836 bool update = false;
1837 for (map<mds_gid_t, Metadata>::iterator i = pending_metadata.begin();
1838 i != pending_metadata.end(); ) {
1839 if (!pending_fsmap.gid_exists(i->first)) {
1840 pending_metadata.erase(i++);
1841 update = true;
1842 } else {
1843 ++i;
1844 }
1845 }
1846 if (!update)
1847 return;
1848 bufferlist bl;
1849 ::encode(pending_metadata, bl);
1850 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1851 }
1852
1853 int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1854 {
1855 bufferlist bl;
1856 int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1857 if (r) {
1858 dout(1) << "Unable to load 'last_metadata'" << dendl;
1859 return r;
1860 }
1861
1862 bufferlist::iterator it = bl.begin();
1863 ::decode(m, it);
1864 return 0;
1865 }
1866
1867 void MDSMonitor::count_metadata(const string& field, map<string,int> *out)
1868 {
1869 map<mds_gid_t,Metadata> meta;
1870 load_metadata(meta);
1871 for (auto& p : meta) {
1872 auto q = p.second.find(field);
1873 if (q == p.second.end()) {
1874 (*out)["unknown"]++;
1875 } else {
1876 (*out)[q->second]++;
1877 }
1878 }
1879 }
1880
1881 void MDSMonitor::count_metadata(const string& field, Formatter *f)
1882 {
1883 map<string,int> by_val;
1884 count_metadata(field, &by_val);
1885 f->open_object_section(field.c_str());
1886 for (auto& p : by_val) {
1887 f->dump_int(p.first.c_str(), p.second);
1888 }
1889 f->close_section();
1890 }
1891
1892 int MDSMonitor::dump_metadata(const std::string &who, Formatter *f, ostream& err)
1893 {
1894 assert(f);
1895
1896 mds_gid_t gid = gid_from_arg(who, err);
1897 if (gid == MDS_GID_NONE) {
1898 return -EINVAL;
1899 }
1900
1901 map<mds_gid_t, Metadata> metadata;
1902 if (int r = load_metadata(metadata)) {
1903 err << "Unable to load 'last_metadata'";
1904 return r;
1905 }
1906
1907 if (!metadata.count(gid)) {
1908 return -ENOENT;
1909 }
1910 const Metadata& m = metadata[gid];
1911 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1912 f->dump_string(p->first.c_str(), p->second);
1913 }
1914 return 0;
1915 }
1916
1917 int MDSMonitor::print_nodes(Formatter *f)
1918 {
1919 assert(f);
1920
1921 map<mds_gid_t, Metadata> metadata;
1922 if (int r = load_metadata(metadata)) {
1923 return r;
1924 }
1925
1926 map<string, list<int> > mdses; // hostname => rank
1927 for (map<mds_gid_t, Metadata>::iterator it = metadata.begin();
1928 it != metadata.end(); ++it) {
1929 const Metadata& m = it->second;
1930 Metadata::const_iterator hostname = m.find("hostname");
1931 if (hostname == m.end()) {
1932 // not likely though
1933 continue;
1934 }
1935 const mds_gid_t gid = it->first;
1936 if (!fsmap.gid_exists(gid)) {
1937 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1938 continue;
1939 }
1940 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
1941 // FIXME: include filesystem name with rank here
1942 mdses[hostname->second].push_back(mds_info.rank);
1943 }
1944
1945 dump_services(f, mdses, "mds");
1946 return 0;
1947 }
1948
1949 /**
1950 * If a cluster is undersized (with respect to max_mds), then
1951 * attempt to find daemons to grow it.
1952 */
1953 bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
1954 {
1955 bool do_propose = false;
1956
1957 if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
1958 return do_propose;
1959 }
1960
1961 while (fs->mds_map.get_num_in_mds() < size_t(fs->mds_map.get_max_mds()) &&
1962 !fs->mds_map.is_degraded()) {
1963 mds_rank_t mds = mds_rank_t(0);
1964 string name;
1965 while (fs->mds_map.is_in(mds)) {
1966 mds++;
1967 }
1968 mds_gid_t newgid = pending_fsmap.find_replacement_for({fs->fscid, mds},
1969 name, g_conf->mon_force_standby_active);
1970 if (newgid == MDS_GID_NONE) {
1971 break;
1972 }
1973
1974 dout(1) << "adding standby " << pending_fsmap.get_info_gid(newgid).addr
1975 << " as mds." << mds << dendl;
1976 pending_fsmap.promote(newgid, fs, mds);
1977 do_propose = true;
1978 }
1979
1980 return do_propose;
1981 }
1982
1983
1984 /**
1985 * If a daemon is laggy, and a suitable replacement
1986 * is available, fail this daemon (remove from map) and pass its
1987 * role to another daemon.
1988 */
1989 void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
1990 bool *mds_propose, bool *osd_propose)
1991 {
1992 assert(mds_propose != nullptr);
1993 assert(osd_propose != nullptr);
1994
1995 const auto fscid = pending_fsmap.mds_roles.at(gid);
1996
1997 // We will only take decisive action (replacing/removing a daemon)
1998 // if we have some indicating that some other daemon(s) are successfully
1999 // getting beacons through recently.
2000 utime_t latest_beacon;
2001 for (const auto & i : last_beacon) {
2002 latest_beacon = MAX(i.second.stamp, latest_beacon);
2003 }
2004 const bool may_replace = latest_beacon >
2005 (ceph_clock_now() -
2006 MAX(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5));
2007
2008 // are we in?
2009 // and is there a non-laggy standby that can take over for us?
2010 mds_gid_t sgid;
2011 if (info.rank >= 0 &&
2012 info.state != MDSMap::STATE_STANDBY &&
2013 info.state != MDSMap::STATE_STANDBY_REPLAY &&
2014 may_replace &&
2015 !pending_fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
2016 (sgid = pending_fsmap.find_replacement_for({fscid, info.rank}, info.name,
2017 g_conf->mon_force_standby_active)) != MDS_GID_NONE)
2018 {
2019
2020 MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
2021 dout(10) << " replacing " << gid << " " << info.addr << " mds."
2022 << info.rank << "." << info.inc
2023 << " " << ceph_mds_state_name(info.state)
2024 << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
2025
2026 mon->clog->warn() << "MDS daemon '" << info.name << "'"
2027 << " is not responding, replacing it "
2028 << "as rank " << info.rank
2029 << " with standby '" << si.name << "'";
2030
2031 // Remember what NS the old one was in
2032 const fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
2033
2034 // Remove the old one
2035 *osd_propose |= fail_mds_gid(gid);
2036
2037 // Promote the replacement
2038 auto fs = pending_fsmap.filesystems.at(fscid);
2039 pending_fsmap.promote(sgid, fs, info.rank);
2040
2041 *mds_propose = true;
2042 } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
2043 info.state == MDSMap::STATE_STANDBY) && may_replace) {
2044 dout(10) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
2045 << "." << info.inc << " " << ceph_mds_state_name(info.state)
2046 << dendl;
2047 mon->clog->info() << "MDS standby '" << info.name
2048 << "' is not responding, removing it from the set of "
2049 << "standbys";
2050 fail_mds_gid(gid);
2051 *mds_propose = true;
2052 } else if (!info.laggy()) {
2053 dout(10) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
2054 << " " << ceph_mds_state_name(info.state)
2055 << " laggy" << dendl;
2056 pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
2057 info->laggy_since = ceph_clock_now();
2058 });
2059 *mds_propose = true;
2060 }
2061 }
2062
2063 bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
2064 {
2065 assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
2066
2067 bool do_propose = false;
2068
2069 // have a standby take over?
2070 set<mds_rank_t> failed;
2071 fs->mds_map.get_failed_mds_set(failed);
2072 if (!failed.empty()) {
2073 set<mds_rank_t>::iterator p = failed.begin();
2074 while (p != failed.end()) {
2075 mds_rank_t f = *p++;
2076 mds_gid_t sgid = pending_fsmap.find_replacement_for({fs->fscid, f}, {},
2077 g_conf->mon_force_standby_active);
2078 if (sgid) {
2079 const MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
2080 dout(0) << " taking over failed mds." << f << " with " << sgid
2081 << "/" << si.name << " " << si.addr << dendl;
2082 pending_fsmap.promote(sgid, fs, f);
2083 do_propose = true;
2084 }
2085 }
2086 } else {
2087 // There were no failures to replace, so try using any available standbys
2088 // as standby-replay daemons.
2089
2090 // Take a copy of the standby GIDs so that we can iterate over
2091 // them while perhaps-modifying standby_daemons during the loop
2092 // (if we promote anyone they are removed from standby_daemons)
2093 std::vector<mds_gid_t> standby_gids;
2094 for (const auto &j : pending_fsmap.standby_daemons) {
2095 standby_gids.push_back(j.first);
2096 }
2097
2098 for (const auto &gid : standby_gids) {
2099 const auto &info = pending_fsmap.standby_daemons.at(gid);
2100 assert(info.state == MDSMap::STATE_STANDBY);
2101
2102 if (!info.standby_replay) {
2103 continue;
2104 }
2105
2106 /*
2107 * This mds is standby but has no rank assigned.
2108 * See if we can find it somebody to shadow
2109 */
2110 dout(20) << "gid " << gid << " is standby and following nobody" << dendl;
2111
2112 // standby for someone specific?
2113 if (info.standby_for_rank >= 0) {
2114 // The mds_info_t may or may not tell us exactly which filesystem
2115 // the standby_for_rank refers to: lookup via legacy_client_fscid
2116 mds_role_t target_role = {
2117 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
2118 pending_fsmap.legacy_client_fscid : info.standby_for_fscid,
2119 info.standby_for_rank};
2120
2121 // It is possible that the map contains a standby_for_fscid
2122 // that doesn't correspond to an existing filesystem, especially
2123 // if we loaded from a version with a bug (#17466)
2124 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
2125 && !pending_fsmap.filesystem_exists(info.standby_for_fscid)) {
2126 derr << "gid " << gid << " has invalid standby_for_fscid "
2127 << info.standby_for_fscid << dendl;
2128 continue;
2129 }
2130
2131 // If we managed to resolve a full target role
2132 if (target_role.fscid != FS_CLUSTER_ID_NONE) {
2133 auto fs = pending_fsmap.get_filesystem(target_role.fscid);
2134 if (fs->mds_map.is_followable(target_role.rank)) {
2135 do_propose |= try_standby_replay(
2136 info,
2137 *fs,
2138 fs->mds_map.get_info(target_role.rank));
2139 }
2140 }
2141
2142 continue;
2143 }
2144
2145 // check everyone
2146 for (auto fs_i : pending_fsmap.filesystems) {
2147 const MDSMap &mds_map = fs_i.second->mds_map;
2148 for (auto mds_i : mds_map.mds_info) {
2149 MDSMap::mds_info_t &cand_info = mds_i.second;
2150 if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
2151 if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
2152 info.standby_for_rank != MDS_RANK_NONE) {
2153 continue; // we're supposed to follow someone else
2154 }
2155
2156 if (try_standby_replay(info, *(fs_i.second), cand_info)) {
2157 do_propose = true;
2158 break;
2159 }
2160 continue;
2161 }
2162 }
2163 }
2164 }
2165 }
2166
2167 return do_propose;
2168 }
2169
2170 void MDSMonitor::tick()
2171 {
2172 // make sure mds's are still alive
2173 // ...if i am an active leader
2174 if (!is_active()) return;
2175
2176 dout(10) << fsmap << dendl;
2177
2178 bool do_propose = false;
2179
2180 if (!mon->is_leader()) return;
2181
2182 do_propose |= pending_fsmap.check_health();
2183
2184 // expand mds cluster (add new nodes to @in)?
2185 for (auto i : pending_fsmap.filesystems) {
2186 do_propose |= maybe_expand_cluster(i.second);
2187 }
2188
2189 const auto now = ceph_clock_now();
2190 if (last_tick.is_zero()) {
2191 last_tick = now;
2192 }
2193
2194 if (now - last_tick > (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
2195 // This case handles either local slowness (calls being delayed
2196 // for whatever reason) or cluster election slowness (a long gap
2197 // between calls while an election happened)
2198 dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
2199 "(slow election?) of " << now - last_tick << " seconds" << dendl;
2200 for (auto &i : last_beacon) {
2201 i.second.stamp = now;
2202 }
2203 }
2204
2205 last_tick = now;
2206
2207 // check beacon timestamps
2208 utime_t cutoff = now;
2209 cutoff -= g_conf->mds_beacon_grace;
2210
2211 // make sure last_beacon is fully populated
2212 for (const auto &p : pending_fsmap.mds_roles) {
2213 auto &gid = p.first;
2214 if (last_beacon.count(gid) == 0) {
2215 last_beacon[gid].stamp = now;
2216 last_beacon[gid].seq = 0;
2217 }
2218 }
2219
2220 bool propose_osdmap = false;
2221 bool osdmap_writeable = mon->osdmon()->is_writeable();
2222 auto p = last_beacon.begin();
2223 while (p != last_beacon.end()) {
2224 mds_gid_t gid = p->first;
2225 auto beacon_info = p->second;
2226 ++p;
2227
2228 if (!pending_fsmap.gid_exists(gid)) {
2229 // clean it out
2230 last_beacon.erase(gid);
2231 continue;
2232 }
2233
2234 if (beacon_info.stamp < cutoff) {
2235 auto &info = pending_fsmap.get_info_gid(gid);
2236 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2237 << " (gid: " << gid << " addr: " << info.addr
2238 << " state: " << ceph_mds_state_name(info.state) << ")"
2239 << " since " << beacon_info.stamp << dendl;
2240 // If the OSDMap is writeable, we can blacklist things, so we can
2241 // try failing any laggy MDS daemons. Consider each one for failure.
2242 if (osdmap_writeable) {
2243 maybe_replace_gid(gid, info, &do_propose, &propose_osdmap);
2244 }
2245 }
2246 }
2247 if (propose_osdmap) {
2248 request_proposal(mon->osdmon());
2249 }
2250
2251 for (auto i : pending_fsmap.filesystems) {
2252 auto fs = i.second;
2253 if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
2254 do_propose |= maybe_promote_standby(fs);
2255 }
2256 }
2257
2258 if (do_propose) {
2259 propose_pending();
2260 }
2261 }
2262
2263 /**
2264 * finfo: the would-be follower
2265 * leader_fs: the Filesystem containing the would-be leader
2266 * ainfo: the would-be leader
2267 */
2268 bool MDSMonitor::try_standby_replay(
2269 const MDSMap::mds_info_t& finfo,
2270 const Filesystem &leader_fs,
2271 const MDSMap::mds_info_t& ainfo)
2272 {
2273 // someone else already following?
2274 if (leader_fs.has_standby_replay(ainfo.global_id)) {
2275 dout(20) << " mds." << ainfo.rank << " already has a follower" << dendl;
2276 return false;
2277 } else {
2278 // Assign the new role to the standby
2279 dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
2280 pending_fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
2281 return true;
2282 }
2283 }
2284
2285 MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
2286 : PaxosService(mn, p, service_name)
2287 {
2288 handlers = FileSystemCommandHandler::load(p);
2289 }
2290
2291 void MDSMonitor::on_restart()
2292 {
2293 // Clear out the leader-specific state.
2294 last_tick = utime_t();
2295 last_beacon.clear();
2296 }
2297