]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MDSMonitor.cc
update sources to v12.2.1
[ceph.git] / ceph / src / mon / MDSMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <sstream>
16 #include <boost/utility.hpp>
17 #include <boost/regex.hpp>
18
19 #include "MDSMonitor.h"
20 #include "FSCommands.h"
21 #include "Monitor.h"
22 #include "MonitorDBStore.h"
23 #include "OSDMonitor.h"
24 #include "PGMonitor.h"
25
26 #include "common/strtol.h"
27 #include "common/perf_counters.h"
28 #include "common/config.h"
29 #include "common/cmdparse.h"
30 #include "messages/MMDSMap.h"
31 #include "messages/MFSMap.h"
32 #include "messages/MFSMapUser.h"
33 #include "messages/MMDSLoadTargets.h"
34 #include "messages/MMonCommand.h"
35 #include "messages/MGenericMessage.h"
36
37 #include "include/assert.h"
38 #include "include/str_list.h"
39 #include "include/stringify.h"
40 #include "mds/mdstypes.h"
41 #include "Session.h"
42
43 #define dout_subsys ceph_subsys_mon
44 #undef dout_prefix
45 #define dout_prefix _prefix(_dout, mon, fsmap)
46 static ostream& _prefix(std::ostream *_dout, Monitor *mon, FSMap const& fsmap) {
47 return *_dout << "mon." << mon->name << "@" << mon->rank
48 << "(" << mon->get_state_name()
49 << ").mds e" << fsmap.get_epoch() << " ";
50 }
51
52 /*
53 * Specialized implementation of cmd_getval to allow us to parse
54 * out strongly-typedef'd types
55 */
56 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
57 const std::string& k, mds_gid_t &val)
58 {
59 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
60 }
61
62 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
63 const std::string& k, mds_rank_t &val)
64 {
65 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
66 }
67
68 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
69 const std::string& k, MDSMap::DaemonState &val)
70 {
71 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
72 }
73
74 static const string MDS_METADATA_PREFIX("mds_metadata");
75
76
77 // my methods
78
79 void MDSMonitor::print_map(FSMap &m, int dbl)
80 {
81 dout(dbl) << "print_map\n";
82 m.print(*_dout);
83 *_dout << dendl;
84 }
85
86 // service methods
87 void MDSMonitor::create_initial()
88 {
89 dout(10) << "create_initial" << dendl;
90 }
91
92
93 void MDSMonitor::update_from_paxos(bool *need_bootstrap)
94 {
95 version_t version = get_last_committed();
96 if (version == fsmap.epoch)
97 return;
98
99 dout(10) << __func__ << " version " << version
100 << ", my e " << fsmap.epoch << dendl;
101 assert(version > fsmap.epoch);
102
103 load_health();
104
105 // read and decode
106 bufferlist fsmap_bl;
107 fsmap_bl.clear();
108 int err = get_version(version, fsmap_bl);
109 assert(err == 0);
110
111 assert(fsmap_bl.length() > 0);
112 dout(10) << __func__ << " got " << version << dendl;
113 fsmap.decode(fsmap_bl);
114
115 // new map
116 dout(4) << "new map" << dendl;
117 print_map(fsmap, 0);
118 if (!g_conf->mon_mds_skip_sanity) {
119 fsmap.sanity();
120 }
121
122 check_subs();
123 update_logger();
124 }
125
126 void MDSMonitor::init()
127 {
128 (void)load_metadata(pending_metadata);
129 }
130
131 void MDSMonitor::create_pending()
132 {
133 pending_fsmap = fsmap;
134 pending_fsmap.epoch++;
135
136 dout(10) << "create_pending e" << pending_fsmap.epoch << dendl;
137 }
138
139 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
140 {
141 dout(10) << "encode_pending e" << pending_fsmap.epoch << dendl;
142
143
144 // print map iff 'debug mon = 30' or higher
145 print_map(pending_fsmap, 30);
146 if (!g_conf->mon_mds_skip_sanity) {
147 pending_fsmap.sanity();
148 }
149
150 // Set 'modified' on maps modified this epoch
151 for (auto &i : fsmap.filesystems) {
152 if (i.second->mds_map.epoch == fsmap.epoch) {
153 i.second->mds_map.modified = ceph_clock_now();
154 }
155 }
156
157 // apply to paxos
158 assert(get_last_committed() + 1 == pending_fsmap.epoch);
159 bufferlist fsmap_bl;
160 pending_fsmap.encode(fsmap_bl, mon->get_quorum_con_features());
161
162 /* put everything in the transaction */
163 put_version(t, pending_fsmap.epoch, fsmap_bl);
164 put_last_committed(t, pending_fsmap.epoch);
165
166 // Encode MDSHealth data
167 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
168 i != pending_daemon_health.end(); ++i) {
169 bufferlist bl;
170 i->second.encode(bl);
171 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
172 }
173
174 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
175 i != pending_daemon_health_rm.end(); ++i) {
176 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
177 }
178 pending_daemon_health_rm.clear();
179 remove_from_metadata(t);
180
181 // health
182 health_check_map_t new_checks;
183 const auto info_map = pending_fsmap.get_mds_info();
184 for (const auto &i : info_map) {
185 const auto &gid = i.first;
186 const auto &info = i.second;
187 if (pending_daemon_health_rm.count(gid)) {
188 continue;
189 }
190 MDSHealth health;
191 auto p = pending_daemon_health.find(gid);
192 if (p != pending_daemon_health.end()) {
193 health = p->second;
194 } else {
195 bufferlist bl;
196 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
197 if (!bl.length()) {
198 derr << "Missing health data for MDS " << gid << dendl;
199 continue;
200 }
201 bufferlist::iterator bl_i = bl.begin();
202 health.decode(bl_i);
203 }
204 for (const auto &metric : health.metrics) {
205 const int rank = info.rank;
206 health_check_t *check = &new_checks.get_or_add(
207 mds_metric_name(metric.type),
208 metric.sev,
209 mds_metric_summary(metric.type));
210 ostringstream ss;
211 ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
212 for (auto p = metric.metadata.begin();
213 p != metric.metadata.end();
214 ++p) {
215 if (p != metric.metadata.begin()) {
216 ss << ", ";
217 }
218 ss << p->first << ": " << p->second;
219 }
220 check->detail.push_back(ss.str());
221 }
222 }
223 pending_fsmap.get_health_checks(&new_checks);
224 for (auto& p : new_checks.checks) {
225 p.second.summary = boost::regex_replace(
226 p.second.summary,
227 boost::regex("%num%"),
228 stringify(p.second.detail.size()));
229 p.second.summary = boost::regex_replace(
230 p.second.summary,
231 boost::regex("%plurals%"),
232 p.second.detail.size() > 1 ? "s" : "");
233 p.second.summary = boost::regex_replace(
234 p.second.summary,
235 boost::regex("%isorare%"),
236 p.second.detail.size() > 1 ? "are" : "is");
237 p.second.summary = boost::regex_replace(
238 p.second.summary,
239 boost::regex("%hasorhave%"),
240 p.second.detail.size() > 1 ? "have" : "has");
241 }
242 encode_health(new_checks, t);
243 }
244
245 version_t MDSMonitor::get_trim_to()
246 {
247 version_t floor = 0;
248 if (g_conf->mon_mds_force_trim_to > 0 &&
249 g_conf->mon_mds_force_trim_to < (int)get_last_committed()) {
250 floor = g_conf->mon_mds_force_trim_to;
251 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
252 << floor << dendl;
253 }
254
255 unsigned max = g_conf->mon_max_mdsmap_epochs;
256 version_t last = get_last_committed();
257
258 if (last - get_first_committed() > max && floor < last - max)
259 return last - max;
260 return floor;
261 }
262
263 void MDSMonitor::update_logger()
264 {
265 dout(10) << "update_logger" << dendl;
266
267 uint64_t up = 0;
268 uint64_t in = 0;
269 uint64_t failed = 0;
270 for (const auto &i : fsmap.filesystems) {
271 const MDSMap &mds_map = i.second->mds_map;
272
273 up += mds_map.get_num_up_mds();
274 in += mds_map.get_num_in_mds();
275 failed += mds_map.get_num_failed_mds();
276 }
277 mon->cluster_logger->set(l_cluster_num_mds_up, up);
278 mon->cluster_logger->set(l_cluster_num_mds_in, in);
279 mon->cluster_logger->set(l_cluster_num_mds_failed, failed);
280 mon->cluster_logger->set(l_cluster_mds_epoch, fsmap.get_epoch());
281 }
282
283 bool MDSMonitor::preprocess_query(MonOpRequestRef op)
284 {
285 op->mark_mdsmon_event(__func__);
286 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
287 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
288
289 switch (m->get_type()) {
290
291 case MSG_MDS_BEACON:
292 return preprocess_beacon(op);
293
294 case MSG_MON_COMMAND:
295 return preprocess_command(op);
296
297 case MSG_MDS_OFFLOAD_TARGETS:
298 return preprocess_offload_targets(op);
299
300 default:
301 ceph_abort();
302 return true;
303 }
304 }
305
306 void MDSMonitor::_note_beacon(MMDSBeacon *m)
307 {
308 mds_gid_t gid = mds_gid_t(m->get_global_id());
309 version_t seq = m->get_seq();
310
311 dout(15) << "_note_beacon " << *m << " noting time" << dendl;
312 last_beacon[gid].stamp = ceph_clock_now();
313 last_beacon[gid].seq = seq;
314 }
315
316 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
317 {
318 op->mark_mdsmon_event(__func__);
319 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
320 MDSMap::DaemonState state = m->get_state();
321 mds_gid_t gid = m->get_global_id();
322 version_t seq = m->get_seq();
323 MDSMap::mds_info_t info;
324 epoch_t effective_epoch = 0;
325
326 // check privileges, ignore if fails
327 MonSession *session = m->get_session();
328 assert(session);
329 if (!session->is_capable("mds", MON_CAP_X)) {
330 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
331 << session->caps << dendl;
332 goto ignore;
333 }
334
335 if (m->get_fsid() != mon->monmap->fsid) {
336 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
337 goto ignore;
338 }
339
340 dout(12) << "preprocess_beacon " << *m
341 << " from " << m->get_orig_source_inst()
342 << " " << m->get_compat()
343 << dendl;
344
345 // make sure the address has a port
346 if (m->get_orig_source_addr().get_port() == 0) {
347 dout(1) << " ignoring boot message without a port" << dendl;
348 goto ignore;
349 }
350
351 // check compat
352 if (!m->get_compat().writeable(fsmap.compat)) {
353 dout(1) << " mds " << m->get_source_inst() << " can't write to fsmap " << fsmap.compat << dendl;
354 goto ignore;
355 }
356
357 // fw to leader?
358 if (!mon->is_leader())
359 return false;
360
361 // booted, but not in map?
362 if (!pending_fsmap.gid_exists(gid)) {
363 if (state != MDSMap::STATE_BOOT) {
364 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
365 << ceph_mds_state_name(state) << ")" << dendl;
366
367 MDSMap null_map;
368 null_map.epoch = fsmap.epoch;
369 null_map.compat = fsmap.compat;
370 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
371 return true;
372 } else {
373 return false; // not booted yet.
374 }
375 }
376 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
377 info = pending_fsmap.get_info_gid(gid);
378
379 // old seq?
380 if (info.state_seq > seq) {
381 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
382 goto ignore;
383 }
384
385 // Work out the latest epoch that this daemon should have seen
386 {
387 fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
388 if (fscid == FS_CLUSTER_ID_NONE) {
389 effective_epoch = pending_fsmap.standby_epochs.at(gid);
390 } else {
391 effective_epoch = pending_fsmap.get_filesystem(fscid)->mds_map.epoch;
392 }
393 if (effective_epoch != m->get_last_epoch_seen()) {
394 dout(10) << "mds_beacon " << *m
395 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
396 goto reply;
397 }
398 }
399
400 if (info.laggy()) {
401 _note_beacon(m);
402 return false; // no longer laggy, need to update map.
403 }
404 if (state == MDSMap::STATE_BOOT) {
405 // ignore, already booted.
406 goto ignore;
407 }
408 // is there a state change here?
409 if (info.state != state) {
410 // legal state change?
411 if ((info.state == MDSMap::STATE_STANDBY ||
412 info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
413 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
414 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
415 goto reply;
416 }
417
418 if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
419 && info.rank != MDS_RANK_NONE)
420 {
421 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
422 "held rank " << info.rank << " while requesting state "
423 << ceph_mds_state_name(state) << dendl;
424 goto reply;
425 }
426
427 _note_beacon(m);
428 return false;
429 }
430
431 // Comparing known daemon health with m->get_health()
432 // and return false (i.e. require proposal) if they
433 // do not match, to update our stored
434 if (!(pending_daemon_health[gid] == m->get_health())) {
435 dout(20) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
436 _note_beacon(m);
437 return false;
438 }
439
440 reply:
441 // note time and reply
442 assert(effective_epoch > 0);
443 _note_beacon(m);
444 mon->send_reply(op,
445 new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
446 effective_epoch, state, seq,
447 CEPH_FEATURES_SUPPORTED_DEFAULT));
448 return true;
449
450 ignore:
451 // I won't reply this beacon, drop it.
452 mon->no_reply(op);
453 return true;
454 }
455
456 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
457 {
458 op->mark_mdsmon_event(__func__);
459 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
460 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
461
462 // check privileges, ignore message if fails
463 MonSession *session = m->get_session();
464 if (!session)
465 goto done;
466 if (!session->is_capable("mds", MON_CAP_X)) {
467 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
468 << session->caps << dendl;
469 goto done;
470 }
471
472 if (fsmap.gid_exists(m->global_id) &&
473 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
474 goto done;
475
476 return false;
477
478 done:
479 return true;
480 }
481
482
483 bool MDSMonitor::prepare_update(MonOpRequestRef op)
484 {
485 op->mark_mdsmon_event(__func__);
486 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
487 dout(7) << "prepare_update " << *m << dendl;
488
489 switch (m->get_type()) {
490
491 case MSG_MDS_BEACON:
492 return prepare_beacon(op);
493
494 case MSG_MON_COMMAND:
495 return prepare_command(op);
496
497 case MSG_MDS_OFFLOAD_TARGETS:
498 return prepare_offload_targets(op);
499
500 default:
501 ceph_abort();
502 }
503
504 return true;
505 }
506
507 bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
508 {
509 op->mark_mdsmon_event(__func__);
510 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
511 // -- this is an update --
512 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source_inst() << dendl;
513 entity_addr_t addr = m->get_orig_source_inst().addr;
514 mds_gid_t gid = m->get_global_id();
515 MDSMap::DaemonState state = m->get_state();
516 version_t seq = m->get_seq();
517
518 dout(20) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
519
520 // Calculate deltas of health metrics created and removed
521 // Do this by type rather than MDSHealthMetric equality, because messages can
522 // change a lot when they include e.g. a number of items.
523 const auto &old_health = pending_daemon_health[gid].metrics;
524 const auto &new_health = m->get_health().metrics;
525
526 std::set<mds_metric_t> old_types;
527 for (const auto &i : old_health) {
528 old_types.insert(i.type);
529 }
530
531 std::set<mds_metric_t> new_types;
532 for (const auto &i : new_health) {
533 new_types.insert(i.type);
534 }
535
536 for (const auto &new_metric: new_health) {
537 if (old_types.count(new_metric.type) == 0) {
538 std::stringstream msg;
539 msg << "MDS health message (" << m->get_orig_source_inst().name << "): "
540 << new_metric.message;
541 if (new_metric.sev == HEALTH_ERR) {
542 mon->clog->error() << msg.str();
543 } else if (new_metric.sev == HEALTH_WARN) {
544 mon->clog->warn() << msg.str();
545 } else {
546 mon->clog->info() << msg.str();
547 }
548 }
549 }
550
551 // Log the disappearance of health messages at INFO
552 for (const auto &old_metric : old_health) {
553 if (new_types.count(old_metric.type) == 0) {
554 mon->clog->info() << "MDS health message cleared ("
555 << m->get_orig_source_inst().name << "): " << old_metric.message;
556 }
557 }
558
559 // Store health
560 pending_daemon_health[gid] = m->get_health();
561
562 // boot?
563 if (state == MDSMap::STATE_BOOT) {
564 // zap previous instance of this name?
565 if (g_conf->mds_enforce_unique_name) {
566 bool failed_mds = false;
567 while (mds_gid_t existing = pending_fsmap.find_mds_gid_by_name(m->get_name())) {
568 if (!mon->osdmon()->is_writeable()) {
569 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
570 return false;
571 }
572 const MDSMap::mds_info_t &existing_info =
573 pending_fsmap.get_info_gid(existing);
574 mon->clog->info() << existing_info.human_name() << " restarted";
575 fail_mds_gid(existing);
576 failed_mds = true;
577 }
578 if (failed_mds) {
579 assert(mon->osdmon()->is_writeable());
580 request_proposal(mon->osdmon());
581 }
582 }
583
584 // Add this daemon to the map
585 if (pending_fsmap.mds_roles.count(gid) == 0) {
586 MDSMap::mds_info_t new_info;
587 new_info.global_id = gid;
588 new_info.name = m->get_name();
589 new_info.addr = addr;
590 new_info.mds_features = m->get_mds_features();
591 new_info.state = MDSMap::STATE_STANDBY;
592 new_info.state_seq = seq;
593 new_info.standby_for_rank = m->get_standby_for_rank();
594 new_info.standby_for_name = m->get_standby_for_name();
595 new_info.standby_for_fscid = m->get_standby_for_fscid();
596 new_info.standby_replay = m->get_standby_replay();
597 pending_fsmap.insert(new_info);
598 }
599
600 // Resolve standby_for_name to a rank
601 const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
602 if (!info.standby_for_name.empty()) {
603 const MDSMap::mds_info_t *leaderinfo = fsmap.find_by_name(
604 info.standby_for_name);
605 if (leaderinfo && (leaderinfo->rank >= 0)) {
606 auto fscid = pending_fsmap.mds_roles.at(leaderinfo->global_id);
607 auto fs = pending_fsmap.get_filesystem(fscid);
608
609 pending_fsmap.modify_daemon(gid, [fscid, leaderinfo](
610 MDSMap::mds_info_t *info) {
611 info->standby_for_rank = leaderinfo->rank;
612 info->standby_for_fscid = fscid;
613 });
614 }
615 }
616
617 // initialize the beacon timer
618 last_beacon[gid].stamp = ceph_clock_now();
619 last_beacon[gid].seq = seq;
620
621 // new incompat?
622 if (!pending_fsmap.compat.writeable(m->get_compat())) {
623 dout(10) << " fsmap " << pending_fsmap.compat
624 << " can't write to new mds' " << m->get_compat()
625 << ", updating fsmap and killing old mds's"
626 << dendl;
627 pending_fsmap.update_compat(m->get_compat());
628 }
629
630 update_metadata(m->get_global_id(), m->get_sys_info());
631 } else {
632 // state update
633 const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
634 // Old MDS daemons don't mention that they're standby replay until
635 // after they've sent their boot beacon, so update this field.
636 if (info.standby_replay != m->get_standby_replay()) {
637 pending_fsmap.modify_daemon(info.global_id, [&m](
638 MDSMap::mds_info_t *i)
639 {
640 i->standby_replay = m->get_standby_replay();
641 });
642 }
643
644 if (info.state == MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPED ) {
645 // we can't transition to any other states from STOPPING
646 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
647 << dendl;
648 _note_beacon(m);
649 return true;
650 }
651
652 if (info.laggy()) {
653 dout(10) << "prepare_beacon clearing laggy flag on " << addr << dendl;
654 pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
655 {
656 info->clear_laggy();
657 }
658 );
659 }
660
661 dout(10) << "prepare_beacon mds." << info.rank
662 << " " << ceph_mds_state_name(info.state)
663 << " -> " << ceph_mds_state_name(state)
664 << " standby_for_rank=" << m->get_standby_for_rank()
665 << dendl;
666 if (state == MDSMap::STATE_STOPPED) {
667 const auto fscid = pending_fsmap.mds_roles.at(gid);
668 auto fs = pending_fsmap.get_filesystem(fscid);
669
670 mon->clog->info() << info.human_name() << " finished "
671 << "deactivating rank " << info.rank << " in filesystem "
672 << fs->mds_map.fs_name << " (now has "
673 << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
674
675 auto erased = pending_fsmap.stop(gid);
676 erased.push_back(gid);
677
678 for (const auto &erased_gid : erased) {
679 last_beacon.erase(erased_gid);
680 if (pending_daemon_health.count(erased_gid)) {
681 pending_daemon_health.erase(erased_gid);
682 pending_daemon_health_rm.insert(erased_gid);
683 }
684 }
685
686
687 } else if (state == MDSMap::STATE_DAMAGED) {
688 if (!mon->osdmon()->is_writeable()) {
689 dout(4) << __func__ << ": DAMAGED from rank " << info.rank
690 << " waiting for osdmon writeable to blacklist it" << dendl;
691 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
692 return false;
693 }
694
695 // Record this MDS rank as damaged, so that other daemons
696 // won't try to run it.
697 dout(4) << __func__ << ": marking rank "
698 << info.rank << " damaged" << dendl;
699
700 utime_t until = ceph_clock_now();
701 until += g_conf->mds_blacklist_interval;
702 const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
703 request_proposal(mon->osdmon());
704 pending_fsmap.damaged(gid, blacklist_epoch);
705 last_beacon.erase(gid);
706
707 // Respond to MDS, so that it knows it can continue to shut down
708 mon->send_reply(op,
709 new MMDSBeacon(
710 mon->monmap->fsid, m->get_global_id(),
711 m->get_name(), fsmap.get_epoch(), state, seq,
712 CEPH_FEATURES_SUPPORTED_DEFAULT));
713 } else if (state == MDSMap::STATE_DNE) {
714 if (!mon->osdmon()->is_writeable()) {
715 dout(4) << __func__ << ": DNE from rank " << info.rank
716 << " waiting for osdmon writeable to blacklist it" << dendl;
717 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
718 return false;
719 }
720
721 fail_mds_gid(gid);
722 assert(mon->osdmon()->is_writeable());
723 request_proposal(mon->osdmon());
724
725 // Respond to MDS, so that it knows it can continue to shut down
726 mon->send_reply(op,
727 new MMDSBeacon(
728 mon->monmap->fsid, m->get_global_id(),
729 m->get_name(), fsmap.get_epoch(), state, seq,
730 CEPH_FEATURES_SUPPORTED_DEFAULT));
731 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
732 // Standby daemons should never modify their own
733 // state. Reject any attempts to do so.
734 derr << "standby " << gid << " attempted to change state to "
735 << ceph_mds_state_name(state) << ", rejecting" << dendl;
736 return true;
737 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
738 !MDSMap::state_transition_valid(info.state, state)) {
739 // Validate state transitions for daemons that hold a rank
740 derr << "daemon " << gid << " (rank " << info.rank << ") "
741 << "reported invalid state transition "
742 << ceph_mds_state_name(info.state) << " -> "
743 << ceph_mds_state_name(state) << dendl;
744 return true;
745 } else {
746 // Made it through special cases and validations, record the
747 // daemon's reported state to the FSMap.
748 pending_fsmap.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
749 info->state = state;
750 info->state_seq = seq;
751 });
752
753 if (state == MDSMap::STATE_ACTIVE) {
754 auto fscid = pending_fsmap.mds_roles.at(gid);
755 auto fs = pending_fsmap.get_filesystem(fscid);
756 mon->clog->info() << info.human_name() << " is now active in "
757 << "filesystem " << fs->mds_map.fs_name << " as rank "
758 << info.rank;
759 }
760 }
761 }
762
763 dout(7) << "prepare_beacon pending map now:" << dendl;
764 print_map(pending_fsmap);
765
766 wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
767 if (r >= 0)
768 _updated(op); // success
769 else if (r == -ECANCELED) {
770 mon->no_reply(op);
771 } else {
772 dispatch(op); // try again
773 }
774 }));
775
776 return true;
777 }
778
779 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
780 {
781 op->mark_mdsmon_event(__func__);
782 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
783 mds_gid_t gid = m->global_id;
784 if (pending_fsmap.gid_has_rank(gid)) {
785 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
786 pending_fsmap.update_export_targets(gid, m->targets);
787 } else {
788 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
789 }
790 return true;
791 }
792
793 bool MDSMonitor::should_propose(double& delay)
794 {
795 // delegate to PaxosService to assess whether we should propose
796 return PaxosService::should_propose(delay);
797 }
798
799 void MDSMonitor::_updated(MonOpRequestRef op)
800 {
801 op->mark_mdsmon_event(__func__);
802 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
803 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
804 mon->clog->debug() << m->get_orig_source_inst() << " "
805 << ceph_mds_state_name(m->get_state());
806
807 if (m->get_state() == MDSMap::STATE_STOPPED) {
808 // send the map manually (they're out of the map, so they won't get it automatic)
809 MDSMap null_map;
810 null_map.epoch = fsmap.epoch;
811 null_map.compat = fsmap.compat;
812 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
813 } else {
814 mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid,
815 m->get_global_id(),
816 m->get_name(),
817 fsmap.get_epoch(),
818 m->get_state(),
819 m->get_seq(),
820 CEPH_FEATURES_SUPPORTED_DEFAULT));
821 }
822 }
823
824 void MDSMonitor::on_active()
825 {
826 tick();
827 update_logger();
828
829 if (mon->is_leader()) {
830 mon->clog->debug() << "fsmap " << fsmap;
831 }
832 }
833
834 void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
835 list<pair<health_status_t, string> > *detail,
836 CephContext* cct) const
837 {
838 fsmap.get_health(summary, detail);
839
840 // For each MDS GID...
841 const auto info_map = fsmap.get_mds_info();
842 for (const auto &i : info_map) {
843 const auto &gid = i.first;
844 const auto &info = i.second;
845
846 // Decode MDSHealth
847 bufferlist bl;
848 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
849 if (!bl.length()) {
850 derr << "Missing health data for MDS " << gid << dendl;
851 continue;
852 }
853 MDSHealth health;
854 bufferlist::iterator bl_i = bl.begin();
855 health.decode(bl_i);
856
857 for (const auto &metric : health.metrics) {
858 const int rank = info.rank;
859 std::ostringstream message;
860 message << "mds" << rank << ": " << metric.message;
861 summary.push_back(std::make_pair(metric.sev, message.str()));
862
863 if (detail) {
864 // There is no way for us to clealy associate detail entries with summary entries (#7192), so
865 // we duplicate the summary message in the detail string and tag the metadata on.
866 std::ostringstream detail_message;
867 detail_message << message.str();
868 if (metric.metadata.size()) {
869 detail_message << "(";
870 auto k = metric.metadata.begin();
871 while (k != metric.metadata.end()) {
872 detail_message << k->first << ": " << k->second;
873 if (boost::next(k) != metric.metadata.end()) {
874 detail_message << ", ";
875 }
876 ++k;
877 }
878 detail_message << ")";
879 }
880 detail->push_back(std::make_pair(metric.sev, detail_message.str()));
881 }
882 }
883 }
884 }
885
886 void MDSMonitor::dump_info(Formatter *f)
887 {
888 f->open_object_section("fsmap");
889 fsmap.dump(f);
890 f->close_section();
891
892 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
893 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
894 }
895
896 bool MDSMonitor::preprocess_command(MonOpRequestRef op)
897 {
898 op->mark_mdsmon_event(__func__);
899 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
900 int r = -1;
901 bufferlist rdata;
902 stringstream ss, ds;
903
904 map<string, cmd_vartype> cmdmap;
905 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
906 // ss has reason for failure
907 string rs = ss.str();
908 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
909 return true;
910 }
911
912 string prefix;
913 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
914 string format;
915 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
916 boost::scoped_ptr<Formatter> f(Formatter::create(format));
917
918 MonSession *session = m->get_session();
919 if (!session) {
920 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
921 return true;
922 }
923
924 if (prefix == "mds stat") {
925 if (f) {
926 f->open_object_section("mds_stat");
927 dump_info(f.get());
928 f->close_section();
929 f->flush(ds);
930 } else {
931 ds << fsmap;
932 }
933 r = 0;
934 } else if (prefix == "mds dump") {
935 int64_t epocharg;
936 epoch_t epoch;
937
938 FSMap *p = &fsmap;
939 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
940 epoch = epocharg;
941 bufferlist b;
942 int err = get_version(epoch, b);
943 if (err == -ENOENT) {
944 p = 0;
945 r = -ENOENT;
946 } else {
947 assert(err == 0);
948 assert(b.length());
949 p = new FSMap;
950 p->decode(b);
951 }
952 }
953 if (p) {
954 stringstream ds;
955 const MDSMap *mdsmap = nullptr;
956 MDSMap blank;
957 blank.epoch = fsmap.epoch;
958 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
959 mdsmap = &(fsmap.filesystems[fsmap.legacy_client_fscid]->mds_map);
960 } else {
961 mdsmap = &blank;
962 }
963 if (f != NULL) {
964 f->open_object_section("mdsmap");
965 mdsmap->dump(f.get());
966 f->close_section();
967 f->flush(ds);
968 r = 0;
969 } else {
970 mdsmap->print(ds);
971 r = 0;
972 }
973
974 rdata.append(ds);
975 ss << "dumped fsmap epoch " << p->get_epoch();
976
977 if (p != &fsmap) {
978 delete p;
979 }
980 }
981 } else if (prefix == "fs dump") {
982 int64_t epocharg;
983 epoch_t epoch;
984
985 FSMap *p = &fsmap;
986 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
987 epoch = epocharg;
988 bufferlist b;
989 int err = get_version(epoch, b);
990 if (err == -ENOENT) {
991 p = 0;
992 r = -ENOENT;
993 } else {
994 assert(err == 0);
995 assert(b.length());
996 p = new FSMap;
997 p->decode(b);
998 }
999 }
1000 if (p) {
1001 stringstream ds;
1002 if (f != NULL) {
1003 f->open_object_section("fsmap");
1004 p->dump(f.get());
1005 f->close_section();
1006 f->flush(ds);
1007 r = 0;
1008 } else {
1009 p->print(ds);
1010 r = 0;
1011 }
1012
1013 rdata.append(ds);
1014 ss << "dumped fsmap epoch " << p->get_epoch();
1015
1016 if (p != &fsmap)
1017 delete p;
1018 }
1019 } else if (prefix == "mds metadata") {
1020 if (!f)
1021 f.reset(Formatter::create("json-pretty"));
1022
1023 string who;
1024 bool all = !cmd_getval(g_ceph_context, cmdmap, "who", who);
1025 dout(1) << "all = " << all << dendl;
1026 if (all) {
1027 r = 0;
1028 // Dump all MDSs' metadata
1029 const auto all_info = fsmap.get_mds_info();
1030
1031 f->open_array_section("mds_metadata");
1032 for(const auto &i : all_info) {
1033 const auto &info = i.second;
1034
1035 f->open_object_section("mds");
1036 f->dump_string("name", info.name);
1037 std::ostringstream get_err;
1038 r = dump_metadata(info.name, f.get(), get_err);
1039 if (r == -EINVAL || r == -ENOENT) {
1040 // Drop error, list what metadata we do have
1041 dout(1) << get_err.str() << dendl;
1042 r = 0;
1043 } else if (r != 0) {
1044 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1045 << dendl;
1046 ss << get_err.str();
1047 f->close_section();
1048 break;
1049 }
1050 f->close_section();
1051 }
1052 f->close_section();
1053 } else {
1054 // Dump a single daemon's metadata
1055 f->open_object_section("mds_metadata");
1056 r = dump_metadata(who, f.get(), ss);
1057 f->close_section();
1058 }
1059 f->flush(ds);
1060 } else if (prefix == "mds versions") {
1061 if (!f)
1062 f.reset(Formatter::create("json-pretty"));
1063 count_metadata("ceph_version", f.get());
1064 f->flush(ds);
1065 r = 0;
1066 } else if (prefix == "mds count-metadata") {
1067 if (!f)
1068 f.reset(Formatter::create("json-pretty"));
1069 string field;
1070 cmd_getval(g_ceph_context, cmdmap, "property", field);
1071 count_metadata(field, f.get());
1072 f->flush(ds);
1073 r = 0;
1074 } else if (prefix == "mds getmap") {
1075 epoch_t e;
1076 int64_t epocharg;
1077 bufferlist b;
1078 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
1079 e = epocharg;
1080 int err = get_version(e, b);
1081 if (err == -ENOENT) {
1082 r = -ENOENT;
1083 } else {
1084 assert(err == 0);
1085 assert(b.length());
1086 FSMap mm;
1087 mm.decode(b);
1088 mm.encode(rdata, m->get_connection()->get_features());
1089 ss << "got fsmap epoch " << mm.get_epoch();
1090 r = 0;
1091 }
1092 } else {
1093 fsmap.encode(rdata, m->get_connection()->get_features());
1094 ss << "got fsmap epoch " << fsmap.get_epoch();
1095 r = 0;
1096 }
1097 } else if (prefix == "mds compat show") {
1098 if (f) {
1099 f->open_object_section("mds_compat");
1100 fsmap.compat.dump(f.get());
1101 f->close_section();
1102 f->flush(ds);
1103 } else {
1104 ds << fsmap.compat;
1105 }
1106 r = 0;
1107 } else if (prefix == "fs get") {
1108 string fs_name;
1109 cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
1110 auto fs = fsmap.get_filesystem(fs_name);
1111 if (fs == nullptr) {
1112 ss << "filesystem '" << fs_name << "' not found";
1113 r = -ENOENT;
1114 } else {
1115 if (f != nullptr) {
1116 f->open_object_section("filesystem");
1117 fs->dump(f.get());
1118 f->close_section();
1119 f->flush(ds);
1120 r = 0;
1121 } else {
1122 fs->print(ds);
1123 r = 0;
1124 }
1125 }
1126 } else if (prefix == "fs ls") {
1127 if (f) {
1128 f->open_array_section("filesystems");
1129 {
1130 for (const auto i : fsmap.filesystems) {
1131 const auto fs = i.second;
1132 f->open_object_section("filesystem");
1133 {
1134 const MDSMap &mds_map = fs->mds_map;
1135 f->dump_string("name", mds_map.fs_name);
1136 /* Output both the names and IDs of pools, for use by
1137 * humans and machines respectively */
1138 f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
1139 mds_map.metadata_pool));
1140 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1141 f->open_array_section("data_pool_ids");
1142 {
1143 for (auto dpi = mds_map.data_pools.begin();
1144 dpi != mds_map.data_pools.end(); ++dpi) {
1145 f->dump_int("data_pool_id", *dpi);
1146 }
1147 }
1148 f->close_section();
1149
1150 f->open_array_section("data_pools");
1151 {
1152 for (auto dpi = mds_map.data_pools.begin();
1153 dpi != mds_map.data_pools.end(); ++dpi) {
1154 const auto &name = mon->osdmon()->osdmap.get_pool_name(
1155 *dpi);
1156 f->dump_string("data_pool", name);
1157 }
1158 }
1159
1160 f->close_section();
1161 }
1162 f->close_section();
1163 }
1164 }
1165 f->close_section();
1166 f->flush(ds);
1167 } else {
1168 for (const auto i : fsmap.filesystems) {
1169 const auto fs = i.second;
1170 const MDSMap &mds_map = fs->mds_map;
1171 const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
1172 mds_map.metadata_pool);
1173
1174 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1175 << md_pool_name << ", data pools: [";
1176 for (auto dpi : mds_map.data_pools) {
1177 const string &pool_name = mon->osdmon()->osdmap.get_pool_name(dpi);
1178 ds << pool_name << " ";
1179 }
1180 ds << "]" << std::endl;
1181 }
1182
1183 if (fsmap.filesystems.empty()) {
1184 ds << "No filesystems enabled" << std::endl;
1185 }
1186 }
1187 r = 0;
1188 }
1189
1190 if (r != -1) {
1191 rdata.append(ds);
1192 string rs;
1193 getline(ss, rs);
1194 mon->reply_command(op, r, rs, rdata, get_last_committed());
1195 return true;
1196 } else
1197 return false;
1198 }
1199
1200 bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
1201 {
1202 const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
1203 dout(10) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
1204
1205 epoch_t blacklist_epoch = 0;
1206 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1207 utime_t until = ceph_clock_now();
1208 until += g_conf->mds_blacklist_interval;
1209 blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
1210 }
1211
1212 pending_fsmap.erase(gid, blacklist_epoch);
1213 last_beacon.erase(gid);
1214 if (pending_daemon_health.count(gid)) {
1215 pending_daemon_health.erase(gid);
1216 pending_daemon_health_rm.insert(gid);
1217 }
1218
1219 return blacklist_epoch != 0;
1220 }
1221
1222 mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
1223 {
1224 const FSMap *relevant_fsmap = mon->is_leader() ? &pending_fsmap : &fsmap;
1225
1226 // Try parsing as a role
1227 mds_role_t role;
1228 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1229 int r = parse_role(arg, &role, ignore_err);
1230 if (r == 0) {
1231 // See if a GID is assigned to this role
1232 auto fs = relevant_fsmap->get_filesystem(role.fscid);
1233 assert(fs != nullptr); // parse_role ensures it exists
1234 if (fs->mds_map.is_up(role.rank)) {
1235 dout(10) << __func__ << ": validated rank/GID " << role
1236 << " as a rank" << dendl;
1237 return fs->mds_map.get_mds_info(role.rank).global_id;
1238 }
1239 }
1240
1241 // Try parsing as a gid
1242 std::string err;
1243 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1244 if (!err.empty()) {
1245 // Not a role or a GID, try as a daemon name
1246 const MDSMap::mds_info_t *mds_info = relevant_fsmap->find_by_name(arg);
1247 if (!mds_info) {
1248 ss << "MDS named '" << arg
1249 << "' does not exist, or is not up";
1250 return MDS_GID_NONE;
1251 }
1252 dout(10) << __func__ << ": resolved MDS name '" << arg
1253 << "' to GID " << mds_info->global_id << dendl;
1254 return mds_info->global_id;
1255 } else {
1256 // Not a role, but parses as a an integer, might be a GID
1257 dout(10) << __func__ << ": treating MDS reference '" << arg
1258 << "' as an integer " << maybe_gid << dendl;
1259
1260 if (relevant_fsmap->gid_exists(mds_gid_t(maybe_gid))) {
1261 return mds_gid_t(maybe_gid);
1262 }
1263 }
1264
1265 dout(1) << __func__ << ": rank/GID " << arg
1266 << " not a existent rank or GID" << dendl;
1267 return MDS_GID_NONE;
1268 }
1269
1270 int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg,
1271 MDSMap::mds_info_t *failed_info)
1272 {
1273 assert(failed_info != nullptr);
1274
1275 mds_gid_t gid = gid_from_arg(arg, ss);
1276 if (gid == MDS_GID_NONE) {
1277 return 0;
1278 }
1279 if (!mon->osdmon()->is_writeable()) {
1280 return -EAGAIN;
1281 }
1282
1283 // Take a copy of the info before removing the MDS from the map,
1284 // so that the caller knows which mds (if any) they ended up removing.
1285 *failed_info = pending_fsmap.get_info_gid(gid);
1286
1287 fail_mds_gid(gid);
1288 ss << "failed mds gid " << gid;
1289 assert(mon->osdmon()->is_writeable());
1290 request_proposal(mon->osdmon());
1291 return 0;
1292 }
1293
1294 bool MDSMonitor::prepare_command(MonOpRequestRef op)
1295 {
1296 op->mark_mdsmon_event(__func__);
1297 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
1298 int r = -EINVAL;
1299 stringstream ss;
1300 bufferlist rdata;
1301
1302 map<string, cmd_vartype> cmdmap;
1303 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1304 string rs = ss.str();
1305 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1306 return true;
1307 }
1308
1309 string prefix;
1310 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
1311
1312 /* Refuse access if message not associated with a valid session */
1313 MonSession *session = m->get_session();
1314 if (!session) {
1315 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1316 return true;
1317 }
1318
1319 bool batched_propose = false;
1320 for (auto h : handlers) {
1321 if (h->can_handle(prefix)) {
1322 batched_propose = h->batched_propose();
1323 if (batched_propose) {
1324 paxos->plug();
1325 }
1326 r = h->handle(mon, pending_fsmap, op, cmdmap, ss);
1327 if (batched_propose) {
1328 paxos->unplug();
1329 }
1330
1331 if (r == -EAGAIN) {
1332 // message has been enqueued for retry; return.
1333 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1334 return false;
1335 } else {
1336 if (r == 0) {
1337 // On successful updates, print the updated map
1338 print_map(pending_fsmap);
1339 }
1340 // Successful or not, we're done: respond.
1341 goto out;
1342 }
1343 }
1344 }
1345
1346 r = filesystem_command(op, prefix, cmdmap, ss);
1347 if (r >= 0) {
1348 goto out;
1349 } else if (r == -EAGAIN) {
1350 // Do not reply, the message has been enqueued for retry
1351 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1352 return false;
1353 } else if (r != -ENOSYS) {
1354 goto out;
1355 }
1356
1357 // Only handle legacy commands if there is a filesystem configured
1358 if (pending_fsmap.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1359 if (pending_fsmap.filesystems.size() == 0) {
1360 ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
1361 } else {
1362 ss << "No filesystem set for use with legacy commands";
1363 }
1364 r = -EINVAL;
1365 goto out;
1366 }
1367
1368 r = legacy_filesystem_command(op, prefix, cmdmap, ss);
1369
1370 if (r == -ENOSYS && ss.str().empty()) {
1371 ss << "unrecognized command";
1372 }
1373
1374 out:
1375 dout(4) << __func__ << " done, r=" << r << dendl;
1376 /* Compose response */
1377 string rs;
1378 getline(ss, rs);
1379
1380 if (r >= 0) {
1381 // success.. delay reply
1382 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1383 get_last_committed() + 1));
1384 if (batched_propose) {
1385 force_immediate_propose();
1386 }
1387 return true;
1388 } else {
1389 // reply immediately
1390 mon->reply_command(op, r, rs, rdata, get_last_committed());
1391 return false;
1392 }
1393 }
1394
1395
1396 /**
1397 * Given one of the following forms:
1398 * <fs name>:<rank>
1399 * <fs id>:<rank>
1400 * <rank>
1401 *
1402 * Parse into a mds_role_t. The rank-only form is only valid
1403 * if legacy_client_ns is set.
1404 */
1405 int MDSMonitor::parse_role(
1406 const std::string &role_str,
1407 mds_role_t *role,
1408 std::ostream &ss)
1409 {
1410 const FSMap *relevant_fsmap = &fsmap;
1411 if (mon->is_leader()) {
1412 relevant_fsmap = &pending_fsmap;
1413 }
1414 return relevant_fsmap->parse_role(role_str, role, ss);
1415 }
1416
1417 int MDSMonitor::filesystem_command(
1418 MonOpRequestRef op,
1419 std::string const &prefix,
1420 map<string, cmd_vartype> &cmdmap,
1421 std::stringstream &ss)
1422 {
1423 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1424 op->mark_mdsmon_event(__func__);
1425 int r = 0;
1426 string whostr;
1427 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1428
1429 if (prefix == "mds stop" ||
1430 prefix == "mds deactivate") {
1431
1432 mds_role_t role;
1433 r = parse_role(whostr, &role, ss);
1434 if (r < 0 ) {
1435 return r;
1436 }
1437 auto fs = pending_fsmap.get_filesystem(role.fscid);
1438
1439 if (!fs->mds_map.is_active(role.rank)) {
1440 r = -EEXIST;
1441 ss << "mds." << role << " not active ("
1442 << ceph_mds_state_name(fs->mds_map.get_state(role.rank)) << ")";
1443 } else if (fs->mds_map.get_root() == role.rank ||
1444 fs->mds_map.get_tableserver() == role.rank) {
1445 r = -EINVAL;
1446 ss << "can't tell the root (" << fs->mds_map.get_root()
1447 << ") or tableserver (" << fs->mds_map.get_tableserver()
1448 << ") to deactivate";
1449 } else if (role.rank != fs->mds_map.get_last_in_mds()) {
1450 r = -EINVAL;
1451 ss << "mds." << role << " doesn't have the max rank ("
1452 << fs->mds_map.get_last_in_mds() << ")";
1453 } else if (fs->mds_map.get_num_in_mds() <= size_t(fs->mds_map.get_max_mds())) {
1454 r = -EBUSY;
1455 ss << "must decrease max_mds or else MDS will immediately reactivate";
1456 } else {
1457 r = 0;
1458 mds_gid_t gid = fs->mds_map.up.at(role.rank);
1459 ss << "telling mds." << role << " "
1460 << pending_fsmap.get_info_gid(gid).addr << " to deactivate";
1461
1462 pending_fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
1463 info->state = MDSMap::STATE_STOPPING;
1464 });
1465 }
1466 } else if (prefix == "mds set_state") {
1467 mds_gid_t gid;
1468 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1469 ss << "error parsing 'gid' value '"
1470 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1471 return -EINVAL;
1472 }
1473 MDSMap::DaemonState state;
1474 if (!cmd_getval(g_ceph_context, cmdmap, "state", state)) {
1475 ss << "error parsing 'state' string value '"
1476 << cmd_vartype_stringify(cmdmap["state"]) << "'";
1477 return -EINVAL;
1478 }
1479 if (pending_fsmap.gid_exists(gid)) {
1480 pending_fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
1481 info->state = state;
1482 });
1483 ss << "set mds gid " << gid << " to state " << state << " "
1484 << ceph_mds_state_name(state);
1485 return 0;
1486 }
1487 } else if (prefix == "mds fail") {
1488 string who;
1489 cmd_getval(g_ceph_context, cmdmap, "who", who);
1490
1491 MDSMap::mds_info_t failed_info;
1492 r = fail_mds(ss, who, &failed_info);
1493 if (r < 0 && r == -EAGAIN) {
1494 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1495 return -EAGAIN; // don't propose yet; wait for message to be retried
1496 } else if (r == 0) {
1497 // Only log if we really did something (not when was already gone)
1498 if (failed_info.global_id != MDS_GID_NONE) {
1499 mon->clog->info() << failed_info.human_name() << " marked failed by "
1500 << op->get_session()->entity_name;
1501 }
1502 }
1503 } else if (prefix == "mds rm") {
1504 mds_gid_t gid;
1505 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1506 ss << "error parsing 'gid' value '"
1507 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1508 return -EINVAL;
1509 }
1510 if (!pending_fsmap.gid_exists(gid)) {
1511 ss << "mds gid " << gid << " dne";
1512 r = 0;
1513 } else {
1514 MDSMap::DaemonState state = pending_fsmap.get_info_gid(gid).state;
1515 if (state > 0) {
1516 ss << "cannot remove active mds." << pending_fsmap.get_info_gid(gid).name
1517 << " rank " << pending_fsmap.get_info_gid(gid).rank;
1518 return -EBUSY;
1519 } else {
1520 pending_fsmap.erase(gid, {});
1521 ss << "removed mds gid " << gid;
1522 return 0;
1523 }
1524 }
1525 } else if (prefix == "mds rmfailed") {
1526 string confirm;
1527 if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
1528 confirm != "--yes-i-really-mean-it") {
1529 ss << "WARNING: this can make your filesystem inaccessible! "
1530 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1531 return -EPERM;
1532 }
1533
1534 std::string role_str;
1535 cmd_getval(g_ceph_context, cmdmap, "who", role_str);
1536 mds_role_t role;
1537 int r = parse_role(role_str, &role, ss);
1538 if (r < 0) {
1539 ss << "invalid role '" << role_str << "'";
1540 return -EINVAL;
1541 }
1542
1543 pending_fsmap.modify_filesystem(
1544 role.fscid,
1545 [role](std::shared_ptr<Filesystem> fs)
1546 {
1547 fs->mds_map.failed.erase(role.rank);
1548 });
1549
1550 ss << "removed failed mds." << role;
1551 return 0;
1552 } else if (prefix == "mds compat rm_compat") {
1553 int64_t f;
1554 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1555 ss << "error parsing feature value '"
1556 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1557 return -EINVAL;
1558 }
1559 if (pending_fsmap.compat.compat.contains(f)) {
1560 ss << "removing compat feature " << f;
1561 CompatSet modified = pending_fsmap.compat;
1562 modified.compat.remove(f);
1563 pending_fsmap.update_compat(modified);
1564 } else {
1565 ss << "compat feature " << f << " not present in " << pending_fsmap.compat;
1566 }
1567 r = 0;
1568 } else if (prefix == "mds compat rm_incompat") {
1569 int64_t f;
1570 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1571 ss << "error parsing feature value '"
1572 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1573 return -EINVAL;
1574 }
1575 if (pending_fsmap.compat.incompat.contains(f)) {
1576 ss << "removing incompat feature " << f;
1577 CompatSet modified = pending_fsmap.compat;
1578 modified.incompat.remove(f);
1579 pending_fsmap.update_compat(modified);
1580 } else {
1581 ss << "incompat feature " << f << " not present in " << pending_fsmap.compat;
1582 }
1583 r = 0;
1584 } else if (prefix == "mds repaired") {
1585 std::string role_str;
1586 cmd_getval(g_ceph_context, cmdmap, "rank", role_str);
1587 mds_role_t role;
1588 r = parse_role(role_str, &role, ss);
1589 if (r < 0) {
1590 return r;
1591 }
1592
1593 bool modified = pending_fsmap.undamaged(role.fscid, role.rank);
1594 if (modified) {
1595 dout(4) << "repaired: restoring rank " << role << dendl;
1596 } else {
1597 dout(4) << "repaired: no-op on rank " << role << dendl;
1598 }
1599
1600 r = 0;
1601 } else {
1602 return -ENOSYS;
1603 }
1604
1605 return r;
1606 }
1607
1608 /**
1609 * Helper to legacy_filesystem_command
1610 */
1611 void MDSMonitor::modify_legacy_filesystem(
1612 std::function<void(std::shared_ptr<Filesystem> )> fn)
1613 {
1614 pending_fsmap.modify_filesystem(
1615 pending_fsmap.legacy_client_fscid,
1616 fn
1617 );
1618 }
1619
1620
1621
1622 /**
1623 * Handle a command that affects the filesystem (i.e. a filesystem
1624 * must exist for the command to act upon).
1625 *
1626 * @retval 0 Command was successfully handled and has side effects
1627 * @retval -EAGAIN Messages has been requeued for retry
1628 * @retval -ENOSYS Unknown command
1629 * @retval < 0 An error has occurred; **ss** may have been set.
1630 */
1631 int MDSMonitor::legacy_filesystem_command(
1632 MonOpRequestRef op,
1633 std::string const &prefix,
1634 map<string, cmd_vartype> &cmdmap,
1635 std::stringstream &ss)
1636 {
1637 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1638 op->mark_mdsmon_event(__func__);
1639 int r = 0;
1640 string whostr;
1641 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1642
1643 assert (pending_fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
1644
1645 if (prefix == "mds set_max_mds") {
1646 // NOTE: deprecated by "fs set max_mds"
1647 int64_t maxmds;
1648 if (!cmd_getval(g_ceph_context, cmdmap, "maxmds", maxmds) || maxmds <= 0) {
1649 return -EINVAL;
1650 }
1651
1652 const MDSMap& mdsmap =
1653 pending_fsmap.filesystems.at(pending_fsmap.legacy_client_fscid)->mds_map;
1654
1655 if (!mdsmap.allows_multimds() &&
1656 maxmds > mdsmap.get_max_mds() &&
1657 maxmds > 1) {
1658 ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
1659 return -EINVAL;
1660 }
1661
1662 if (maxmds > MAX_MDS) {
1663 ss << "may not have more than " << MAX_MDS << " MDS ranks";
1664 return -EINVAL;
1665 }
1666
1667 modify_legacy_filesystem(
1668 [maxmds](std::shared_ptr<Filesystem> fs)
1669 {
1670 fs->mds_map.set_max_mds(maxmds);
1671 });
1672
1673 r = 0;
1674 ss << "max_mds = " << maxmds;
1675 } else if (prefix == "mds cluster_down") {
1676 // NOTE: deprecated by "fs set cluster_down"
1677 modify_legacy_filesystem(
1678 [](std::shared_ptr<Filesystem> fs)
1679 {
1680 fs->mds_map.set_flag(CEPH_MDSMAP_DOWN);
1681 });
1682 ss << "marked fsmap DOWN";
1683 r = 0;
1684 } else if (prefix == "mds cluster_up") {
1685 // NOTE: deprecated by "fs set cluster_up"
1686 modify_legacy_filesystem(
1687 [](std::shared_ptr<Filesystem> fs)
1688 {
1689 fs->mds_map.clear_flag(CEPH_MDSMAP_DOWN);
1690 });
1691 ss << "unmarked fsmap DOWN";
1692 r = 0;
1693 } else {
1694 return -ENOSYS;
1695 }
1696
1697 return r;
1698 }
1699
1700
1701 void MDSMonitor::check_subs()
1702 {
1703 std::list<std::string> types;
1704
1705 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1706 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1707 // filesystems. Build a list of all the types we service
1708 // subscriptions for.
1709 types.push_back("fsmap");
1710 types.push_back("fsmap.user");
1711 types.push_back("mdsmap");
1712 for (const auto &i : fsmap.filesystems) {
1713 auto fscid = i.first;
1714 std::ostringstream oss;
1715 oss << "mdsmap." << fscid;
1716 types.push_back(oss.str());
1717 }
1718
1719 for (const auto &type : types) {
1720 if (mon->session_map.subs.count(type) == 0)
1721 continue;
1722 xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
1723 while (!p.end()) {
1724 Subscription *sub = *p;
1725 ++p;
1726 check_sub(sub);
1727 }
1728 }
1729 }
1730
1731
1732 void MDSMonitor::check_sub(Subscription *sub)
1733 {
1734 dout(20) << __func__ << ": " << sub->type << dendl;
1735
1736 if (sub->type == "fsmap") {
1737 if (sub->next <= fsmap.get_epoch()) {
1738 sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
1739 if (sub->onetime) {
1740 mon->session_map.remove_sub(sub);
1741 } else {
1742 sub->next = fsmap.get_epoch() + 1;
1743 }
1744 }
1745 } else if (sub->type == "fsmap.user") {
1746 if (sub->next <= fsmap.get_epoch()) {
1747 FSMapUser fsmap_u;
1748 fsmap_u.epoch = fsmap.get_epoch();
1749 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1750 for (auto p = fsmap.filesystems.begin();
1751 p != fsmap.filesystems.end();
1752 ++p) {
1753 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p->first];
1754 fs_info.cid = p->first;
1755 fs_info.name= p->second->mds_map.fs_name;
1756 }
1757 sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
1758 if (sub->onetime) {
1759 mon->session_map.remove_sub(sub);
1760 } else {
1761 sub->next = fsmap.get_epoch() + 1;
1762 }
1763 }
1764 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
1765 if (sub->next > fsmap.get_epoch()) {
1766 return;
1767 }
1768
1769 const bool is_mds = sub->session->inst.name.is_mds();
1770 mds_gid_t mds_gid = MDS_GID_NONE;
1771 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1772 if (is_mds) {
1773 // What (if any) namespace are you assigned to?
1774 auto mds_info = fsmap.get_mds_info();
1775 for (const auto &i : mds_info) {
1776 if (i.second.addr == sub->session->inst.addr) {
1777 mds_gid = i.first;
1778 fscid = fsmap.mds_roles.at(mds_gid);
1779 }
1780 }
1781 } else {
1782 // You're a client. Did you request a particular
1783 // namespace?
1784 if (sub->type.find("mdsmap.") == 0) {
1785 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1786 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1787 std::string err;
1788 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1789 if (!err.empty()) {
1790 // Client asked for a non-existent namespace, send them nothing
1791 dout(1) << "Invalid client subscription '" << sub->type
1792 << "'" << dendl;
1793 return;
1794 }
1795 if (fsmap.filesystems.count(fscid) == 0) {
1796 // Client asked for a non-existent namespace, send them nothing
1797 // TODO: something more graceful for when a client has a filesystem
1798 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1799 // flag to MMDSMap?
1800 dout(1) << "Client subscribed to non-existent namespace '" <<
1801 fscid << "'" << dendl;
1802 return;
1803 }
1804 } else {
1805 // Unqualified request for "mdsmap": give it the one marked
1806 // for use by legacy clients.
1807 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1808 fscid = fsmap.legacy_client_fscid;
1809 } else {
1810 dout(1) << "Client subscribed for legacy filesystem but "
1811 "none is configured" << dendl;
1812 return;
1813 }
1814 }
1815 }
1816 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
1817
1818 // Work out the effective latest epoch
1819 MDSMap *mds_map = nullptr;
1820 MDSMap null_map;
1821 null_map.compat = fsmap.compat;
1822 if (fscid == FS_CLUSTER_ID_NONE) {
1823 // For a client, we should have already dropped out
1824 assert(is_mds);
1825
1826 if (fsmap.standby_daemons.count(mds_gid)) {
1827 // For an MDS, we need to feed it an MDSMap with its own state in
1828 null_map.mds_info[mds_gid] = fsmap.standby_daemons[mds_gid];
1829 null_map.epoch = fsmap.standby_epochs[mds_gid];
1830 } else {
1831 null_map.epoch = fsmap.epoch;
1832 }
1833 mds_map = &null_map;
1834 } else {
1835 // Check the effective epoch
1836 mds_map = &(fsmap.filesystems.at(fscid)->mds_map);
1837 }
1838
1839 assert(mds_map != nullptr);
1840 dout(10) << __func__ << " selected MDS map epoch " <<
1841 mds_map->epoch << " for namespace " << fscid << " for subscriber "
1842 << sub->session->inst.name << " who wants epoch " << sub->next << dendl;
1843
1844 if (sub->next > mds_map->epoch) {
1845 return;
1846 }
1847 auto msg = new MMDSMap(mon->monmap->fsid, mds_map);
1848
1849 sub->session->con->send_message(msg);
1850 if (sub->onetime) {
1851 mon->session_map.remove_sub(sub);
1852 } else {
1853 sub->next = mds_map->get_epoch() + 1;
1854 }
1855 }
1856 }
1857
1858
1859 void MDSMonitor::update_metadata(mds_gid_t gid,
1860 const map<string, string>& metadata)
1861 {
1862 if (metadata.empty()) {
1863 return;
1864 }
1865 pending_metadata[gid] = metadata;
1866
1867 MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
1868 bufferlist bl;
1869 ::encode(pending_metadata, bl);
1870 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1871 paxos->trigger_propose();
1872 }
1873
1874 void MDSMonitor::remove_from_metadata(MonitorDBStore::TransactionRef t)
1875 {
1876 bool update = false;
1877 for (map<mds_gid_t, Metadata>::iterator i = pending_metadata.begin();
1878 i != pending_metadata.end(); ) {
1879 if (!pending_fsmap.gid_exists(i->first)) {
1880 pending_metadata.erase(i++);
1881 update = true;
1882 } else {
1883 ++i;
1884 }
1885 }
1886 if (!update)
1887 return;
1888 bufferlist bl;
1889 ::encode(pending_metadata, bl);
1890 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1891 }
1892
1893 int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1894 {
1895 bufferlist bl;
1896 int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1897 if (r) {
1898 dout(1) << "Unable to load 'last_metadata'" << dendl;
1899 return r;
1900 }
1901
1902 bufferlist::iterator it = bl.begin();
1903 ::decode(m, it);
1904 return 0;
1905 }
1906
1907 void MDSMonitor::count_metadata(const string& field, map<string,int> *out)
1908 {
1909 map<mds_gid_t,Metadata> meta;
1910 load_metadata(meta);
1911 for (auto& p : meta) {
1912 auto q = p.second.find(field);
1913 if (q == p.second.end()) {
1914 (*out)["unknown"]++;
1915 } else {
1916 (*out)[q->second]++;
1917 }
1918 }
1919 }
1920
1921 void MDSMonitor::count_metadata(const string& field, Formatter *f)
1922 {
1923 map<string,int> by_val;
1924 count_metadata(field, &by_val);
1925 f->open_object_section(field.c_str());
1926 for (auto& p : by_val) {
1927 f->dump_int(p.first.c_str(), p.second);
1928 }
1929 f->close_section();
1930 }
1931
1932 int MDSMonitor::dump_metadata(const std::string &who, Formatter *f, ostream& err)
1933 {
1934 assert(f);
1935
1936 mds_gid_t gid = gid_from_arg(who, err);
1937 if (gid == MDS_GID_NONE) {
1938 return -EINVAL;
1939 }
1940
1941 map<mds_gid_t, Metadata> metadata;
1942 if (int r = load_metadata(metadata)) {
1943 err << "Unable to load 'last_metadata'";
1944 return r;
1945 }
1946
1947 if (!metadata.count(gid)) {
1948 return -ENOENT;
1949 }
1950 const Metadata& m = metadata[gid];
1951 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1952 f->dump_string(p->first.c_str(), p->second);
1953 }
1954 return 0;
1955 }
1956
1957 int MDSMonitor::print_nodes(Formatter *f)
1958 {
1959 assert(f);
1960
1961 map<mds_gid_t, Metadata> metadata;
1962 if (int r = load_metadata(metadata)) {
1963 return r;
1964 }
1965
1966 map<string, list<int> > mdses; // hostname => rank
1967 for (map<mds_gid_t, Metadata>::iterator it = metadata.begin();
1968 it != metadata.end(); ++it) {
1969 const Metadata& m = it->second;
1970 Metadata::const_iterator hostname = m.find("hostname");
1971 if (hostname == m.end()) {
1972 // not likely though
1973 continue;
1974 }
1975 const mds_gid_t gid = it->first;
1976 if (!fsmap.gid_exists(gid)) {
1977 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1978 continue;
1979 }
1980 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
1981 // FIXME: include filesystem name with rank here
1982 mdses[hostname->second].push_back(mds_info.rank);
1983 }
1984
1985 dump_services(f, mdses, "mds");
1986 return 0;
1987 }
1988
1989 /**
1990 * If a cluster is undersized (with respect to max_mds), then
1991 * attempt to find daemons to grow it.
1992 */
1993 bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
1994 {
1995 bool do_propose = false;
1996
1997 if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
1998 return do_propose;
1999 }
2000
2001 while (fs->mds_map.get_num_in_mds() < size_t(fs->mds_map.get_max_mds()) &&
2002 !fs->mds_map.is_degraded()) {
2003 mds_rank_t mds = mds_rank_t(0);
2004 string name;
2005 while (fs->mds_map.is_in(mds)) {
2006 mds++;
2007 }
2008 mds_gid_t newgid = pending_fsmap.find_replacement_for({fs->fscid, mds},
2009 name, g_conf->mon_force_standby_active);
2010 if (newgid == MDS_GID_NONE) {
2011 break;
2012 }
2013
2014 const auto &new_info = pending_fsmap.get_info_gid(newgid);
2015 dout(1) << "assigned standby " << new_info.addr
2016 << " as mds." << mds << dendl;
2017
2018 mon->clog->info() << new_info.human_name() << " assigned to "
2019 "filesystem " << fs->mds_map.fs_name << " as rank "
2020 << mds << " (now has " << fs->mds_map.get_num_in_mds() + 1
2021 << " ranks)";
2022 pending_fsmap.promote(newgid, fs, mds);
2023 do_propose = true;
2024 }
2025
2026 return do_propose;
2027 }
2028
2029
2030 /**
2031 * If a daemon is laggy, and a suitable replacement
2032 * is available, fail this daemon (remove from map) and pass its
2033 * role to another daemon.
2034 */
2035 void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
2036 bool *mds_propose, bool *osd_propose)
2037 {
2038 assert(mds_propose != nullptr);
2039 assert(osd_propose != nullptr);
2040
2041 const auto fscid = pending_fsmap.mds_roles.at(gid);
2042
2043 // We will only take decisive action (replacing/removing a daemon)
2044 // if we have some indicating that some other daemon(s) are successfully
2045 // getting beacons through recently.
2046 utime_t latest_beacon;
2047 for (const auto & i : last_beacon) {
2048 latest_beacon = MAX(i.second.stamp, latest_beacon);
2049 }
2050 const bool may_replace = latest_beacon >
2051 (ceph_clock_now() -
2052 MAX(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5));
2053
2054 // are we in?
2055 // and is there a non-laggy standby that can take over for us?
2056 mds_gid_t sgid;
2057 if (info.rank >= 0 &&
2058 info.state != MDSMap::STATE_STANDBY &&
2059 info.state != MDSMap::STATE_STANDBY_REPLAY &&
2060 may_replace &&
2061 !pending_fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
2062 (sgid = pending_fsmap.find_replacement_for({fscid, info.rank}, info.name,
2063 g_conf->mon_force_standby_active)) != MDS_GID_NONE)
2064 {
2065
2066 MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
2067 dout(10) << " replacing " << gid << " " << info.addr << " mds."
2068 << info.rank << "." << info.inc
2069 << " " << ceph_mds_state_name(info.state)
2070 << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
2071
2072 mon->clog->warn() << info.human_name()
2073 << " is not responding, replacing it "
2074 << "as rank " << info.rank
2075 << " with standby " << si.human_name();
2076
2077 // Remember what NS the old one was in
2078 const fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
2079
2080 // Remove the old one
2081 *osd_propose |= fail_mds_gid(gid);
2082
2083 // Promote the replacement
2084 auto fs = pending_fsmap.filesystems.at(fscid);
2085 pending_fsmap.promote(sgid, fs, info.rank);
2086
2087 *mds_propose = true;
2088 } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
2089 info.state == MDSMap::STATE_STANDBY) && may_replace) {
2090 dout(10) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
2091 << "." << info.inc << " " << ceph_mds_state_name(info.state)
2092 << dendl;
2093 mon->clog->info() << "Standby " << info.human_name() << " is not "
2094 "responding, dropping it";
2095 fail_mds_gid(gid);
2096 *mds_propose = true;
2097 } else if (!info.laggy()) {
2098 dout(10) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
2099 << " " << ceph_mds_state_name(info.state)
2100 << " laggy" << dendl;
2101 pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
2102 info->laggy_since = ceph_clock_now();
2103 });
2104 *mds_propose = true;
2105 }
2106 }
2107
2108 bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
2109 {
2110 assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
2111
2112 bool do_propose = false;
2113
2114 // have a standby take over?
2115 set<mds_rank_t> failed;
2116 fs->mds_map.get_failed_mds_set(failed);
2117 if (!failed.empty()) {
2118 set<mds_rank_t>::iterator p = failed.begin();
2119 while (p != failed.end()) {
2120 mds_rank_t f = *p++;
2121 mds_gid_t sgid = pending_fsmap.find_replacement_for({fs->fscid, f}, {},
2122 g_conf->mon_force_standby_active);
2123 if (sgid) {
2124 const MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
2125 dout(0) << " taking over failed mds." << f << " with " << sgid
2126 << "/" << si.name << " " << si.addr << dendl;
2127 mon->clog->info() << "Standby " << si.human_name()
2128 << " assigned to filesystem " << fs->mds_map.fs_name
2129 << " as rank " << f;
2130
2131 pending_fsmap.promote(sgid, fs, f);
2132 do_propose = true;
2133 }
2134 }
2135 } else {
2136 // There were no failures to replace, so try using any available standbys
2137 // as standby-replay daemons.
2138
2139 // Take a copy of the standby GIDs so that we can iterate over
2140 // them while perhaps-modifying standby_daemons during the loop
2141 // (if we promote anyone they are removed from standby_daemons)
2142 std::vector<mds_gid_t> standby_gids;
2143 for (const auto &j : pending_fsmap.standby_daemons) {
2144 standby_gids.push_back(j.first);
2145 }
2146
2147 for (const auto &gid : standby_gids) {
2148 const auto &info = pending_fsmap.standby_daemons.at(gid);
2149 assert(info.state == MDSMap::STATE_STANDBY);
2150
2151 if (!info.standby_replay) {
2152 continue;
2153 }
2154
2155 /*
2156 * This mds is standby but has no rank assigned.
2157 * See if we can find it somebody to shadow
2158 */
2159 dout(20) << "gid " << gid << " is standby and following nobody" << dendl;
2160
2161 // standby for someone specific?
2162 if (info.standby_for_rank >= 0) {
2163 // The mds_info_t may or may not tell us exactly which filesystem
2164 // the standby_for_rank refers to: lookup via legacy_client_fscid
2165 mds_role_t target_role = {
2166 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
2167 pending_fsmap.legacy_client_fscid : info.standby_for_fscid,
2168 info.standby_for_rank};
2169
2170 // It is possible that the map contains a standby_for_fscid
2171 // that doesn't correspond to an existing filesystem, especially
2172 // if we loaded from a version with a bug (#17466)
2173 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
2174 && !pending_fsmap.filesystem_exists(info.standby_for_fscid)) {
2175 derr << "gid " << gid << " has invalid standby_for_fscid "
2176 << info.standby_for_fscid << dendl;
2177 continue;
2178 }
2179
2180 // If we managed to resolve a full target role
2181 if (target_role.fscid != FS_CLUSTER_ID_NONE) {
2182 auto fs = pending_fsmap.get_filesystem(target_role.fscid);
2183 if (fs->mds_map.is_followable(target_role.rank)) {
2184 do_propose |= try_standby_replay(
2185 info,
2186 *fs,
2187 fs->mds_map.get_info(target_role.rank));
2188 }
2189 }
2190
2191 continue;
2192 }
2193
2194 // check everyone
2195 for (auto fs_i : pending_fsmap.filesystems) {
2196 const MDSMap &mds_map = fs_i.second->mds_map;
2197 for (auto mds_i : mds_map.mds_info) {
2198 MDSMap::mds_info_t &cand_info = mds_i.second;
2199 if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
2200 if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
2201 info.standby_for_rank != MDS_RANK_NONE) {
2202 continue; // we're supposed to follow someone else
2203 }
2204
2205 if (try_standby_replay(info, *(fs_i.second), cand_info)) {
2206 do_propose = true;
2207 break;
2208 }
2209 continue;
2210 }
2211 }
2212 }
2213 }
2214 }
2215
2216 return do_propose;
2217 }
2218
2219 void MDSMonitor::tick()
2220 {
2221 // make sure mds's are still alive
2222 // ...if i am an active leader
2223 if (!is_active()) return;
2224
2225 dout(10) << fsmap << dendl;
2226
2227 bool do_propose = false;
2228
2229 if (!mon->is_leader()) return;
2230
2231 do_propose |= pending_fsmap.check_health();
2232
2233 // expand mds cluster (add new nodes to @in)?
2234 for (auto i : pending_fsmap.filesystems) {
2235 do_propose |= maybe_expand_cluster(i.second);
2236 }
2237
2238 const auto now = ceph_clock_now();
2239 if (last_tick.is_zero()) {
2240 last_tick = now;
2241 }
2242
2243 if (now - last_tick > (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
2244 // This case handles either local slowness (calls being delayed
2245 // for whatever reason) or cluster election slowness (a long gap
2246 // between calls while an election happened)
2247 dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
2248 "(slow election?) of " << now - last_tick << " seconds" << dendl;
2249 for (auto &i : last_beacon) {
2250 i.second.stamp = now;
2251 }
2252 }
2253
2254 last_tick = now;
2255
2256 // check beacon timestamps
2257 utime_t cutoff = now;
2258 cutoff -= g_conf->mds_beacon_grace;
2259
2260 // make sure last_beacon is fully populated
2261 for (const auto &p : pending_fsmap.mds_roles) {
2262 auto &gid = p.first;
2263 if (last_beacon.count(gid) == 0) {
2264 last_beacon[gid].stamp = now;
2265 last_beacon[gid].seq = 0;
2266 }
2267 }
2268
2269 bool propose_osdmap = false;
2270 bool osdmap_writeable = mon->osdmon()->is_writeable();
2271 auto p = last_beacon.begin();
2272 while (p != last_beacon.end()) {
2273 mds_gid_t gid = p->first;
2274 auto beacon_info = p->second;
2275 ++p;
2276
2277 if (!pending_fsmap.gid_exists(gid)) {
2278 // clean it out
2279 last_beacon.erase(gid);
2280 continue;
2281 }
2282
2283 if (beacon_info.stamp < cutoff) {
2284 auto &info = pending_fsmap.get_info_gid(gid);
2285 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2286 << " (gid: " << gid << " addr: " << info.addr
2287 << " state: " << ceph_mds_state_name(info.state) << ")"
2288 << " since " << beacon_info.stamp << dendl;
2289 // If the OSDMap is writeable, we can blacklist things, so we can
2290 // try failing any laggy MDS daemons. Consider each one for failure.
2291 if (osdmap_writeable) {
2292 maybe_replace_gid(gid, info, &do_propose, &propose_osdmap);
2293 }
2294 }
2295 }
2296 if (propose_osdmap) {
2297 request_proposal(mon->osdmon());
2298 }
2299
2300 for (auto i : pending_fsmap.filesystems) {
2301 auto fs = i.second;
2302 if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
2303 do_propose |= maybe_promote_standby(fs);
2304 }
2305 }
2306
2307 if (do_propose) {
2308 propose_pending();
2309 }
2310 }
2311
2312 /**
2313 * finfo: the would-be follower
2314 * leader_fs: the Filesystem containing the would-be leader
2315 * ainfo: the would-be leader
2316 */
2317 bool MDSMonitor::try_standby_replay(
2318 const MDSMap::mds_info_t& finfo,
2319 const Filesystem &leader_fs,
2320 const MDSMap::mds_info_t& ainfo)
2321 {
2322 // someone else already following?
2323 if (leader_fs.has_standby_replay(ainfo.global_id)) {
2324 dout(20) << " mds." << ainfo.rank << " already has a follower" << dendl;
2325 return false;
2326 } else {
2327 // Assign the new role to the standby
2328 dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
2329 pending_fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
2330 return true;
2331 }
2332 }
2333
2334 MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
2335 : PaxosService(mn, p, service_name)
2336 {
2337 handlers = FileSystemCommandHandler::load(p);
2338 }
2339
2340 void MDSMonitor::on_restart()
2341 {
2342 // Clear out the leader-specific state.
2343 last_tick = utime_t();
2344 last_beacon.clear();
2345 }
2346