]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MDSMonitor.cc
update sources to 12.2.8
[ceph.git] / ceph / src / mon / MDSMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <sstream>
16 #include <boost/utility.hpp>
17 #include <boost/regex.hpp>
18
19 #include "MDSMonitor.h"
20 #include "FSCommands.h"
21 #include "Monitor.h"
22 #include "MonitorDBStore.h"
23 #include "OSDMonitor.h"
24 #include "PGMonitor.h"
25
26 #include "common/strtol.h"
27 #include "common/perf_counters.h"
28 #include "common/config.h"
29 #include "common/cmdparse.h"
30 #include "messages/MMDSMap.h"
31 #include "messages/MFSMap.h"
32 #include "messages/MFSMapUser.h"
33 #include "messages/MMDSLoadTargets.h"
34 #include "messages/MMonCommand.h"
35 #include "messages/MGenericMessage.h"
36
37 #include "include/assert.h"
38 #include "include/str_list.h"
39 #include "include/stringify.h"
40 #include "mds/mdstypes.h"
41 #include "Session.h"
42
43 #define dout_subsys ceph_subsys_mon
44 #undef dout_prefix
45 #define dout_prefix _prefix(_dout, mon, get_fsmap())
46 static ostream& _prefix(std::ostream *_dout, Monitor *mon, const FSMap& fsmap) {
47 return *_dout << "mon." << mon->name << "@" << mon->rank
48 << "(" << mon->get_state_name()
49 << ").mds e" << fsmap.get_epoch() << " ";
50 }
51
52 static const string MDS_METADATA_PREFIX("mds_metadata");
53 static const string MDS_HEALTH_PREFIX("mds_health");
54
55
56 /*
57 * Specialized implementation of cmd_getval to allow us to parse
58 * out strongly-typedef'd types
59 */
60 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
61 const std::string& k, mds_gid_t &val)
62 {
63 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
64 }
65
66 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
67 const std::string& k, mds_rank_t &val)
68 {
69 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
70 }
71
72 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
73 const std::string& k, MDSMap::DaemonState &val)
74 {
75 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
76 }
77
78 // my methods
79
80 void MDSMonitor::print_map(const FSMap &m, int dbl)
81 {
82 dout(dbl) << "print_map\n";
83 m.print(*_dout);
84 *_dout << dendl;
85 }
86
87 // service methods
88 void MDSMonitor::create_initial()
89 {
90 dout(10) << "create_initial" << dendl;
91 }
92
93 void MDSMonitor::get_store_prefixes(std::set<string>& s)
94 {
95 s.insert(service_name);
96 s.insert(MDS_METADATA_PREFIX);
97 s.insert(MDS_HEALTH_PREFIX);
98 }
99
100 void MDSMonitor::update_from_paxos(bool *need_bootstrap)
101 {
102 version_t version = get_last_committed();
103 if (version == get_fsmap().epoch)
104 return;
105
106 dout(10) << __func__ << " version " << version
107 << ", my e " << get_fsmap().epoch << dendl;
108 assert(version > get_fsmap().epoch);
109
110 load_health();
111
112 // read and decode
113 bufferlist fsmap_bl;
114 fsmap_bl.clear();
115 int err = get_version(version, fsmap_bl);
116 assert(err == 0);
117
118 assert(fsmap_bl.length() > 0);
119 dout(10) << __func__ << " got " << version << dendl;
120 PaxosFSMap::decode(fsmap_bl);
121
122 // new map
123 dout(4) << "new map" << dendl;
124 print_map(get_fsmap(), 0);
125 if (!g_conf->mon_mds_skip_sanity) {
126 get_fsmap().sanity();
127 }
128
129 check_subs();
130 update_logger();
131 }
132
133 void MDSMonitor::init()
134 {
135 (void)load_metadata(pending_metadata);
136 }
137
138 void MDSMonitor::create_pending()
139 {
140 auto &fsmap = PaxosFSMap::create_pending();
141
142 if (mon->osdmon()->is_readable()) {
143 const auto &osdmap = mon->osdmon()->osdmap;
144 fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
145 }
146
147 dout(10) << "create_pending e" << fsmap.epoch << dendl;
148 }
149
150 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
151 {
152 auto &pending = get_pending_fsmap_writeable();
153 auto &epoch = pending.epoch;
154
155 dout(10) << "encode_pending e" << epoch << dendl;
156
157 // print map iff 'debug mon = 30' or higher
158 print_map(get_pending_fsmap(), 30);
159 if (!g_conf->mon_mds_skip_sanity) {
160 pending.sanity();
161 }
162
163 // Set 'modified' on maps modified this epoch
164 for (auto &p : pending.filesystems) {
165 if (p.second->mds_map.epoch == epoch) {
166 p.second->mds_map.modified = ceph_clock_now();
167 }
168 }
169
170 // apply to paxos
171 assert(get_last_committed() + 1 == pending.epoch);
172 bufferlist pending_bl;
173 pending.encode(pending_bl, mon->get_quorum_con_features());
174
175 /* put everything in the transaction */
176 put_version(t, pending.epoch, pending_bl);
177 put_last_committed(t, pending.epoch);
178
179 // Encode MDSHealth data
180 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
181 i != pending_daemon_health.end(); ++i) {
182 bufferlist bl;
183 i->second.encode(bl);
184 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
185 }
186
187 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
188 i != pending_daemon_health_rm.end(); ++i) {
189 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
190 }
191 pending_daemon_health_rm.clear();
192 remove_from_metadata(pending, t);
193
194 // health
195 health_check_map_t new_checks;
196 const auto &info_map = pending.get_mds_info();
197 for (const auto &i : info_map) {
198 const auto &gid = i.first;
199 const auto &info = i.second;
200 if (pending_daemon_health_rm.count(gid)) {
201 continue;
202 }
203 MDSHealth health;
204 auto p = pending_daemon_health.find(gid);
205 if (p != pending_daemon_health.end()) {
206 health = p->second;
207 } else {
208 bufferlist bl;
209 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
210 if (!bl.length()) {
211 derr << "Missing health data for MDS " << gid << dendl;
212 continue;
213 }
214 bufferlist::iterator bl_i = bl.begin();
215 health.decode(bl_i);
216 }
217 for (const auto &metric : health.metrics) {
218 const int rank = info.rank;
219 health_check_t *check = &new_checks.get_or_add(
220 mds_metric_name(metric.type),
221 metric.sev,
222 mds_metric_summary(metric.type));
223 ostringstream ss;
224 ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
225 bool first = true;
226 for (auto &p : metric.metadata) {
227 if (first) {
228 ss << " ";
229 } else {
230 ss << ", ";
231 }
232 ss << p.first << ": " << p.second;
233 first = false;
234 }
235 check->detail.push_back(ss.str());
236 }
237 }
238 pending.get_health_checks(&new_checks);
239 for (auto& p : new_checks.checks) {
240 p.second.summary = boost::regex_replace(
241 p.second.summary,
242 boost::regex("%num%"),
243 stringify(p.second.detail.size()));
244 p.second.summary = boost::regex_replace(
245 p.second.summary,
246 boost::regex("%plurals%"),
247 p.second.detail.size() > 1 ? "s" : "");
248 p.second.summary = boost::regex_replace(
249 p.second.summary,
250 boost::regex("%isorare%"),
251 p.second.detail.size() > 1 ? "are" : "is");
252 p.second.summary = boost::regex_replace(
253 p.second.summary,
254 boost::regex("%hasorhave%"),
255 p.second.detail.size() > 1 ? "have" : "has");
256 }
257 encode_health(new_checks, t);
258 }
259
260 version_t MDSMonitor::get_trim_to()
261 {
262 version_t floor = 0;
263 if (g_conf->mon_mds_force_trim_to > 0 &&
264 g_conf->mon_mds_force_trim_to < (int)get_last_committed()) {
265 floor = g_conf->mon_mds_force_trim_to;
266 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
267 << floor << dendl;
268 }
269
270 unsigned max = g_conf->mon_max_mdsmap_epochs;
271 version_t last = get_last_committed();
272
273 if (last - get_first_committed() > max && floor < last - max)
274 return last - max;
275 return floor;
276 }
277
278 void MDSMonitor::update_logger()
279 {
280 dout(10) << "update_logger" << dendl;
281
282 const auto &fsmap = get_fsmap();
283
284 uint64_t up = 0;
285 uint64_t in = 0;
286 uint64_t failed = 0;
287 for (const auto &i : fsmap.filesystems) {
288 const MDSMap &mds_map = i.second->mds_map;
289
290 up += mds_map.get_num_up_mds();
291 in += mds_map.get_num_in_mds();
292 failed += mds_map.get_num_failed_mds();
293 }
294 mon->cluster_logger->set(l_cluster_num_mds_up, up);
295 mon->cluster_logger->set(l_cluster_num_mds_in, in);
296 mon->cluster_logger->set(l_cluster_num_mds_failed, failed);
297 mon->cluster_logger->set(l_cluster_mds_epoch, fsmap.get_epoch());
298 }
299
300 bool MDSMonitor::preprocess_query(MonOpRequestRef op)
301 {
302 op->mark_mdsmon_event(__func__);
303 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
304 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
305
306 switch (m->get_type()) {
307
308 case MSG_MDS_BEACON:
309 return preprocess_beacon(op);
310
311 case MSG_MON_COMMAND:
312 return preprocess_command(op);
313
314 case MSG_MDS_OFFLOAD_TARGETS:
315 return preprocess_offload_targets(op);
316
317 default:
318 ceph_abort();
319 return true;
320 }
321 }
322
323 void MDSMonitor::_note_beacon(MMDSBeacon *m)
324 {
325 mds_gid_t gid = mds_gid_t(m->get_global_id());
326 version_t seq = m->get_seq();
327
328 dout(15) << "_note_beacon " << *m << " noting time" << dendl;
329 auto &beacon = last_beacon[gid];
330 beacon.stamp = mono_clock::now();
331 beacon.seq = seq;
332 }
333
334 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
335 {
336 op->mark_mdsmon_event(__func__);
337 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
338 MDSMap::DaemonState state = m->get_state();
339 mds_gid_t gid = m->get_global_id();
340 version_t seq = m->get_seq();
341 MDSMap::mds_info_t info;
342 epoch_t effective_epoch = 0;
343
344 const auto &fsmap = get_fsmap();
345
346 // check privileges, ignore if fails
347 MonSession *session = m->get_session();
348 assert(session);
349 if (!session->is_capable("mds", MON_CAP_X)) {
350 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
351 << session->caps << dendl;
352 goto ignore;
353 }
354
355 if (m->get_fsid() != mon->monmap->fsid) {
356 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
357 goto ignore;
358 }
359
360 dout(12) << "preprocess_beacon " << *m
361 << " from " << m->get_orig_source_inst()
362 << " " << m->get_compat()
363 << dendl;
364
365 // make sure the address has a port
366 if (m->get_orig_source_addr().get_port() == 0) {
367 dout(1) << " ignoring boot message without a port" << dendl;
368 goto ignore;
369 }
370
371 // check compat
372 if (!m->get_compat().writeable(fsmap.compat)) {
373 dout(1) << " mds " << m->get_source_inst() << " can't write to fsmap " << fsmap.compat << dendl;
374 goto ignore;
375 }
376
377 // fw to leader?
378 if (!is_leader())
379 return false;
380
381 // booted, but not in map?
382 if (!fsmap.gid_exists(gid)) {
383 if (state != MDSMap::STATE_BOOT) {
384 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
385 << ceph_mds_state_name(state) << ")" << dendl;
386
387 /* We can't send an MDSMap this MDS was a part of because we no longer
388 * know which FS it was part of. Nor does this matter. Sending an empty
389 * MDSMap is sufficient for getting the MDS to respawn.
390 */
391 MDSMap null_map;
392 null_map.epoch = fsmap.epoch;
393 null_map.compat = fsmap.compat;
394 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
395 return true;
396 } else {
397 return false; // not booted yet.
398 }
399 }
400 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
401 info = fsmap.get_info_gid(gid);
402
403 // old seq?
404 if (info.state_seq > seq) {
405 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
406 goto ignore;
407 }
408
409 // Work out the latest epoch that this daemon should have seen
410 {
411 fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
412 if (fscid == FS_CLUSTER_ID_NONE) {
413 effective_epoch = fsmap.standby_epochs.at(gid);
414 } else {
415 effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
416 }
417 if (effective_epoch != m->get_last_epoch_seen()) {
418 dout(10) << "mds_beacon " << *m
419 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
420 goto reply;
421 }
422 }
423
424 if (info.laggy()) {
425 _note_beacon(m);
426 return false; // no longer laggy, need to update map.
427 }
428 if (state == MDSMap::STATE_BOOT) {
429 // ignore, already booted.
430 goto ignore;
431 }
432 // is there a state change here?
433 if (info.state != state) {
434 // legal state change?
435 if ((info.state == MDSMap::STATE_STANDBY ||
436 info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
437 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
438 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
439 goto reply;
440 }
441
442 if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
443 && info.rank != MDS_RANK_NONE)
444 {
445 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
446 "held rank " << info.rank << " while requesting state "
447 << ceph_mds_state_name(state) << dendl;
448 goto reply;
449 }
450
451 _note_beacon(m);
452 return false;
453 }
454
455 // Comparing known daemon health with m->get_health()
456 // and return false (i.e. require proposal) if they
457 // do not match, to update our stored
458 if (!(pending_daemon_health[gid] == m->get_health())) {
459 dout(20) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
460 _note_beacon(m);
461 return false;
462 }
463
464 reply:
465 // note time and reply
466 assert(effective_epoch > 0);
467 _note_beacon(m);
468 mon->send_reply(op,
469 new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
470 effective_epoch, state, seq,
471 CEPH_FEATURES_SUPPORTED_DEFAULT));
472 return true;
473
474 ignore:
475 // I won't reply this beacon, drop it.
476 mon->no_reply(op);
477 return true;
478 }
479
480 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
481 {
482 op->mark_mdsmon_event(__func__);
483 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
484 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
485
486 const auto &fsmap = get_fsmap();
487
488 // check privileges, ignore message if fails
489 MonSession *session = m->get_session();
490 if (!session)
491 goto ignore;
492 if (!session->is_capable("mds", MON_CAP_X)) {
493 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
494 << session->caps << dendl;
495 goto ignore;
496 }
497
498 if (fsmap.gid_exists(m->global_id) &&
499 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
500 goto ignore;
501
502 return false;
503
504 ignore:
505 mon->no_reply(op);
506 return true;
507 }
508
509
510 bool MDSMonitor::prepare_update(MonOpRequestRef op)
511 {
512 op->mark_mdsmon_event(__func__);
513 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
514 dout(7) << "prepare_update " << *m << dendl;
515
516 switch (m->get_type()) {
517
518 case MSG_MDS_BEACON:
519 return prepare_beacon(op);
520
521 case MSG_MON_COMMAND:
522 return prepare_command(op);
523
524 case MSG_MDS_OFFLOAD_TARGETS:
525 return prepare_offload_targets(op);
526
527 default:
528 ceph_abort();
529 }
530
531 return true;
532 }
533
534 bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
535 {
536 op->mark_mdsmon_event(__func__);
537 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
538 // -- this is an update --
539 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source_inst() << dendl;
540 entity_addr_t addr = m->get_orig_source_inst().addr;
541 mds_gid_t gid = m->get_global_id();
542 MDSMap::DaemonState state = m->get_state();
543 version_t seq = m->get_seq();
544
545 auto &pending = get_pending_fsmap_writeable();
546
547 dout(20) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
548
549 // Calculate deltas of health metrics created and removed
550 // Do this by type rather than MDSHealthMetric equality, because messages can
551 // change a lot when they include e.g. a number of items.
552 const auto &old_health = pending_daemon_health[gid].metrics;
553 const auto &new_health = m->get_health().metrics;
554
555 std::set<mds_metric_t> old_types;
556 for (const auto &i : old_health) {
557 old_types.insert(i.type);
558 }
559
560 std::set<mds_metric_t> new_types;
561 for (const auto &i : new_health) {
562 new_types.insert(i.type);
563 }
564
565 for (const auto &new_metric: new_health) {
566 if (old_types.count(new_metric.type) == 0) {
567 dout(10) << "MDS health message (" << m->get_orig_source_inst().name
568 << "): " << new_metric.sev << " " << new_metric.message << dendl;
569 }
570 }
571
572 // Log the disappearance of health messages at INFO
573 for (const auto &old_metric : old_health) {
574 if (new_types.count(old_metric.type) == 0) {
575 mon->clog->info() << "MDS health message cleared ("
576 << m->get_orig_source_inst().name << "): " << old_metric.message;
577 }
578 }
579
580 // Store health
581 pending_daemon_health[gid] = m->get_health();
582
583 // boot?
584 if (state == MDSMap::STATE_BOOT) {
585 // zap previous instance of this name?
586 if (g_conf->mds_enforce_unique_name) {
587 bool failed_mds = false;
588 while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
589 if (!mon->osdmon()->is_writeable()) {
590 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
591 return false;
592 }
593 const MDSMap::mds_info_t &existing_info =
594 pending.get_info_gid(existing);
595 mon->clog->info() << existing_info.human_name() << " restarted";
596 fail_mds_gid(pending, existing);
597 failed_mds = true;
598 }
599 if (failed_mds) {
600 assert(mon->osdmon()->is_writeable());
601 request_proposal(mon->osdmon());
602 }
603 }
604
605 // Add this daemon to the map
606 if (pending.mds_roles.count(gid) == 0) {
607 MDSMap::mds_info_t new_info;
608 new_info.global_id = gid;
609 new_info.name = m->get_name();
610 new_info.addr = addr;
611 new_info.mds_features = m->get_mds_features();
612 new_info.state = MDSMap::STATE_STANDBY;
613 new_info.state_seq = seq;
614 new_info.standby_for_rank = m->get_standby_for_rank();
615 new_info.standby_for_name = m->get_standby_for_name();
616 new_info.standby_for_fscid = m->get_standby_for_fscid();
617 new_info.standby_replay = m->get_standby_replay();
618 pending.insert(new_info);
619 }
620
621 // Resolve standby_for_name to a rank
622 const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
623 if (!info.standby_for_name.empty()) {
624 const MDSMap::mds_info_t *leaderinfo = pending.find_by_name(
625 info.standby_for_name);
626 if (leaderinfo && (leaderinfo->rank >= 0)) {
627 const auto &fscid = pending.mds_roles.at(leaderinfo->global_id);
628
629 pending.modify_daemon(gid, [fscid, leaderinfo](
630 MDSMap::mds_info_t *info) {
631 info->standby_for_rank = leaderinfo->rank;
632 info->standby_for_fscid = fscid;
633 });
634 }
635 }
636
637 // initialize the beacon timer
638 auto &beacon = last_beacon[gid];
639 beacon.stamp = mono_clock::now();
640 beacon.seq = seq;
641
642 // new incompat?
643 if (!pending.compat.writeable(m->get_compat())) {
644 dout(10) << " fsmap " << pending.compat
645 << " can't write to new mds' " << m->get_compat()
646 << ", updating fsmap and killing old mds's"
647 << dendl;
648 pending.update_compat(m->get_compat());
649 }
650
651 update_metadata(m->get_global_id(), m->get_sys_info());
652 } else {
653 // state update
654 const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
655 // Old MDS daemons don't mention that they're standby replay until
656 // after they've sent their boot beacon, so update this field.
657 if (info.standby_replay != m->get_standby_replay()) {
658 pending.modify_daemon(info.global_id, [&m](
659 MDSMap::mds_info_t *i)
660 {
661 i->standby_replay = m->get_standby_replay();
662 });
663 }
664
665 if (info.state == MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPED ) {
666 // we can't transition to any other states from STOPPING
667 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
668 << dendl;
669 _note_beacon(m);
670 return true;
671 }
672
673 if (info.laggy()) {
674 dout(10) << "prepare_beacon clearing laggy flag on " << addr << dendl;
675 pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
676 {
677 info->clear_laggy();
678 }
679 );
680 }
681
682 dout(10) << "prepare_beacon mds." << info.rank
683 << " " << ceph_mds_state_name(info.state)
684 << " -> " << ceph_mds_state_name(state)
685 << " standby_for_rank=" << m->get_standby_for_rank()
686 << dendl;
687 if (state == MDSMap::STATE_STOPPED) {
688 const auto fscid = pending.mds_roles.at(gid);
689 const auto &fs = pending.get_filesystem(fscid);
690
691 mon->clog->info() << info.human_name() << " finished "
692 << "deactivating rank " << info.rank << " in filesystem "
693 << fs->mds_map.fs_name << " (now has "
694 << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
695
696 auto erased = pending.stop(gid);
697 erased.push_back(gid);
698
699 for (const auto &erased_gid : erased) {
700 last_beacon.erase(erased_gid);
701 if (pending_daemon_health.count(erased_gid)) {
702 pending_daemon_health.erase(erased_gid);
703 pending_daemon_health_rm.insert(erased_gid);
704 }
705 }
706
707
708 } else if (state == MDSMap::STATE_DAMAGED) {
709 if (!mon->osdmon()->is_writeable()) {
710 dout(4) << __func__ << ": DAMAGED from rank " << info.rank
711 << " waiting for osdmon writeable to blacklist it" << dendl;
712 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
713 return false;
714 }
715
716 // Record this MDS rank as damaged, so that other daemons
717 // won't try to run it.
718 dout(4) << __func__ << ": marking rank "
719 << info.rank << " damaged" << dendl;
720
721 utime_t until = ceph_clock_now();
722 until += g_conf->get_val<double>("mon_mds_blacklist_interval");
723 const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
724 request_proposal(mon->osdmon());
725 pending.damaged(gid, blacklist_epoch);
726 last_beacon.erase(gid);
727
728 // Respond to MDS, so that it knows it can continue to shut down
729 mon->send_reply(op,
730 new MMDSBeacon(
731 mon->monmap->fsid, m->get_global_id(),
732 m->get_name(), pending.get_epoch(), state, seq,
733 CEPH_FEATURES_SUPPORTED_DEFAULT));
734 } else if (state == MDSMap::STATE_DNE) {
735 if (!mon->osdmon()->is_writeable()) {
736 dout(4) << __func__ << ": DNE from rank " << info.rank
737 << " waiting for osdmon writeable to blacklist it" << dendl;
738 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
739 return false;
740 }
741
742 fail_mds_gid(pending, gid);
743 assert(mon->osdmon()->is_writeable());
744 request_proposal(mon->osdmon());
745
746 // Respond to MDS, so that it knows it can continue to shut down
747 mon->send_reply(op,
748 new MMDSBeacon(
749 mon->monmap->fsid, m->get_global_id(),
750 m->get_name(), pending.get_epoch(), state, seq,
751 CEPH_FEATURES_SUPPORTED_DEFAULT));
752 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
753 // Standby daemons should never modify their own
754 // state. Reject any attempts to do so.
755 derr << "standby " << gid << " attempted to change state to "
756 << ceph_mds_state_name(state) << ", rejecting" << dendl;
757 return true;
758 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
759 !MDSMap::state_transition_valid(info.state, state)) {
760 // Validate state transitions for daemons that hold a rank
761 derr << "daemon " << gid << " (rank " << info.rank << ") "
762 << "reported invalid state transition "
763 << ceph_mds_state_name(info.state) << " -> "
764 << ceph_mds_state_name(state) << dendl;
765 return true;
766 } else {
767 if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
768 const auto &fscid = pending.mds_roles.at(gid);
769 const auto &fs = pending.get_filesystem(fscid);
770 mon->clog->info() << info.human_name() << " is now active in "
771 << "filesystem " << fs->mds_map.fs_name << " as rank "
772 << info.rank;
773 }
774
775 // Made it through special cases and validations, record the
776 // daemon's reported state to the FSMap.
777 pending.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
778 info->state = state;
779 info->state_seq = seq;
780 });
781 }
782 }
783
784 dout(7) << "prepare_beacon pending map now:" << dendl;
785 print_map(pending);
786
787 wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
788 if (r >= 0)
789 _updated(op); // success
790 else if (r == -ECANCELED) {
791 mon->no_reply(op);
792 } else {
793 dispatch(op); // try again
794 }
795 }));
796
797 return true;
798 }
799
800 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
801 {
802 auto &pending = get_pending_fsmap_writeable();
803
804 op->mark_mdsmon_event(__func__);
805 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
806 mds_gid_t gid = m->global_id;
807 if (pending.gid_has_rank(gid)) {
808 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
809 pending.update_export_targets(gid, m->targets);
810 } else {
811 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
812 }
813 return true;
814 }
815
816 bool MDSMonitor::should_propose(double& delay)
817 {
818 // delegate to PaxosService to assess whether we should propose
819 return PaxosService::should_propose(delay);
820 }
821
822 void MDSMonitor::_updated(MonOpRequestRef op)
823 {
824 const auto &fsmap = get_fsmap();
825 op->mark_mdsmon_event(__func__);
826 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
827 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
828 mon->clog->debug() << m->get_orig_source_inst() << " "
829 << ceph_mds_state_name(m->get_state());
830
831 if (m->get_state() == MDSMap::STATE_STOPPED) {
832 // send the map manually (they're out of the map, so they won't get it automatic)
833 MDSMap null_map;
834 null_map.epoch = fsmap.epoch;
835 null_map.compat = fsmap.compat;
836 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
837 } else {
838 mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid,
839 m->get_global_id(),
840 m->get_name(),
841 fsmap.get_epoch(),
842 m->get_state(),
843 m->get_seq(),
844 CEPH_FEATURES_SUPPORTED_DEFAULT));
845 }
846 }
847
848 void MDSMonitor::on_active()
849 {
850 tick();
851 update_logger();
852
853 if (is_leader()) {
854 mon->clog->debug() << "fsmap " << get_fsmap();
855 }
856 }
857
858 void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
859 list<pair<health_status_t, string> > *detail,
860 CephContext* cct) const
861 {
862 const auto &fsmap = get_fsmap();
863
864 fsmap.get_health(summary, detail);
865
866 // For each MDS GID...
867 const auto &info_map = fsmap.get_mds_info();
868 for (const auto &i : info_map) {
869 const auto &gid = i.first;
870 const auto &info = i.second;
871
872 // Decode MDSHealth
873 bufferlist bl;
874 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
875 if (!bl.length()) {
876 derr << "Missing health data for MDS " << gid << dendl;
877 continue;
878 }
879 MDSHealth health;
880 bufferlist::iterator bl_i = bl.begin();
881 health.decode(bl_i);
882
883 for (const auto &metric : health.metrics) {
884 const int rank = info.rank;
885 std::ostringstream message;
886 message << "mds" << rank << ": " << metric.message;
887 summary.push_back(std::make_pair(metric.sev, message.str()));
888
889 if (detail) {
890 // There is no way for us to clealy associate detail entries with summary entries (#7192), so
891 // we duplicate the summary message in the detail string and tag the metadata on.
892 std::ostringstream detail_message;
893 detail_message << message.str();
894 if (metric.metadata.size()) {
895 detail_message << "(";
896 auto k = metric.metadata.begin();
897 while (k != metric.metadata.end()) {
898 detail_message << k->first << ": " << k->second;
899 if (boost::next(k) != metric.metadata.end()) {
900 detail_message << ", ";
901 }
902 ++k;
903 }
904 detail_message << ")";
905 }
906 detail->push_back(std::make_pair(metric.sev, detail_message.str()));
907 }
908 }
909 }
910 }
911
912 void MDSMonitor::dump_info(Formatter *f)
913 {
914 f->open_object_section("fsmap");
915 get_fsmap().dump(f);
916 f->close_section();
917
918 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
919 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
920 }
921
922 bool MDSMonitor::preprocess_command(MonOpRequestRef op)
923 {
924 op->mark_mdsmon_event(__func__);
925 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
926 int r = -1;
927 bufferlist rdata;
928 stringstream ss, ds;
929
930 map<string, cmd_vartype> cmdmap;
931 const auto &fsmap = get_fsmap();
932
933 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
934 // ss has reason for failure
935 string rs = ss.str();
936 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
937 return true;
938 }
939
940 string prefix;
941 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
942 string format;
943 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
944 std::unique_ptr<Formatter> f(Formatter::create(format));
945
946 MonSession *session = m->get_session();
947 if (!session) {
948 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
949 return true;
950 }
951
952 if (prefix == "mds stat") {
953 if (f) {
954 f->open_object_section("mds_stat");
955 dump_info(f.get());
956 f->close_section();
957 f->flush(ds);
958 } else {
959 ds << fsmap;
960 }
961 r = 0;
962 } else if (prefix == "mds dump") {
963 int64_t epocharg;
964 epoch_t epoch;
965
966 const FSMap *fsmapp = &get_fsmap();
967 FSMap dummy;
968 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
969 epoch = epocharg;
970 bufferlist b;
971 int err = get_version(epoch, b);
972 if (err == -ENOENT) {
973 r = -ENOENT;
974 goto out;
975 } else {
976 assert(err == 0);
977 assert(b.length());
978 dummy.decode(b);
979 fsmapp = &dummy;
980 }
981 }
982
983 stringstream ds;
984 const MDSMap *mdsmapp = nullptr;
985 MDSMap blank;
986 blank.epoch = fsmapp->epoch;
987 if (fsmapp->legacy_client_fscid != FS_CLUSTER_ID_NONE) {
988 mdsmapp = &fsmapp->filesystems.at(fsmapp->legacy_client_fscid)->mds_map;
989 } else {
990 mdsmapp = &blank;
991 }
992 if (f != NULL) {
993 f->open_object_section("mdsmap");
994 mdsmapp->dump(f.get());
995 f->close_section();
996 f->flush(ds);
997 r = 0;
998 } else {
999 mdsmapp->print(ds);
1000 r = 0;
1001 }
1002
1003 rdata.append(ds);
1004 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
1005 } else if (prefix == "fs dump") {
1006 int64_t epocharg;
1007 epoch_t epoch;
1008
1009 const FSMap *fsmapp = &fsmap;
1010 FSMap dummy;
1011 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
1012 epoch = epocharg;
1013 bufferlist b;
1014 int err = get_version(epoch, b);
1015 if (err == -ENOENT) {
1016 r = -ENOENT;
1017 goto out;
1018 } else {
1019 assert(err == 0);
1020 assert(b.length());
1021 dummy.decode(b);
1022 fsmapp = &dummy;
1023 }
1024 }
1025
1026 stringstream ds;
1027 if (f != NULL) {
1028 f->open_object_section("fsmap");
1029 fsmapp->dump(f.get());
1030 f->close_section();
1031 f->flush(ds);
1032 r = 0;
1033 } else {
1034 fsmapp->print(ds);
1035 r = 0;
1036 }
1037
1038 rdata.append(ds);
1039 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
1040 } else if (prefix == "mds metadata") {
1041 if (!f)
1042 f.reset(Formatter::create("json-pretty"));
1043
1044 string who;
1045 bool all = !cmd_getval(g_ceph_context, cmdmap, "who", who);
1046 dout(1) << "all = " << all << dendl;
1047 if (all) {
1048 r = 0;
1049 // Dump all MDSs' metadata
1050 const auto all_info = fsmap.get_mds_info();
1051
1052 f->open_array_section("mds_metadata");
1053 for(const auto &i : all_info) {
1054 const auto &info = i.second;
1055
1056 f->open_object_section("mds");
1057 f->dump_string("name", info.name);
1058 std::ostringstream get_err;
1059 r = dump_metadata(fsmap, info.name, f.get(), get_err);
1060 if (r == -EINVAL || r == -ENOENT) {
1061 // Drop error, list what metadata we do have
1062 dout(1) << get_err.str() << dendl;
1063 r = 0;
1064 } else if (r != 0) {
1065 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1066 << dendl;
1067 ss << get_err.str();
1068 f->close_section();
1069 break;
1070 }
1071 f->close_section();
1072 }
1073 f->close_section();
1074 } else {
1075 // Dump a single daemon's metadata
1076 f->open_object_section("mds_metadata");
1077 r = dump_metadata(fsmap, who, f.get(), ss);
1078 f->close_section();
1079 }
1080 f->flush(ds);
1081 } else if (prefix == "mds versions") {
1082 if (!f)
1083 f.reset(Formatter::create("json-pretty"));
1084 count_metadata("ceph_version", f.get());
1085 f->flush(ds);
1086 r = 0;
1087 } else if (prefix == "mds count-metadata") {
1088 if (!f)
1089 f.reset(Formatter::create("json-pretty"));
1090 string field;
1091 cmd_getval(g_ceph_context, cmdmap, "property", field);
1092 count_metadata(field, f.get());
1093 f->flush(ds);
1094 r = 0;
1095 } else if (prefix == "mds getmap") {
1096 epoch_t e;
1097 int64_t epocharg;
1098 bufferlist b;
1099 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
1100 e = epocharg;
1101 int err = get_version(e, b);
1102 if (err == -ENOENT) {
1103 r = -ENOENT;
1104 } else {
1105 assert(err == 0);
1106 assert(b.length());
1107 FSMap mm;
1108 mm.decode(b);
1109 mm.encode(rdata, m->get_connection()->get_features());
1110 ss << "got fsmap epoch " << mm.get_epoch();
1111 r = 0;
1112 }
1113 } else {
1114 fsmap.encode(rdata, m->get_connection()->get_features());
1115 ss << "got fsmap epoch " << fsmap.get_epoch();
1116 r = 0;
1117 }
1118 } else if (prefix == "mds compat show") {
1119 if (f) {
1120 f->open_object_section("mds_compat");
1121 fsmap.compat.dump(f.get());
1122 f->close_section();
1123 f->flush(ds);
1124 } else {
1125 ds << fsmap.compat;
1126 }
1127 r = 0;
1128 } else if (prefix == "fs get") {
1129 string fs_name;
1130 cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
1131 const auto &fs = fsmap.get_filesystem(fs_name);
1132 if (fs == nullptr) {
1133 ss << "filesystem '" << fs_name << "' not found";
1134 r = -ENOENT;
1135 } else {
1136 if (f != nullptr) {
1137 f->open_object_section("filesystem");
1138 fs->dump(f.get());
1139 f->close_section();
1140 f->flush(ds);
1141 r = 0;
1142 } else {
1143 fs->print(ds);
1144 r = 0;
1145 }
1146 }
1147 } else if (prefix == "fs ls") {
1148 if (f) {
1149 f->open_array_section("filesystems");
1150 for (const auto &p : fsmap.filesystems) {
1151 const auto &fs = p.second;
1152 f->open_object_section("filesystem");
1153 {
1154 const MDSMap &mds_map = fs->mds_map;
1155 f->dump_string("name", mds_map.fs_name);
1156 /* Output both the names and IDs of pools, for use by
1157 * humans and machines respectively */
1158 f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
1159 mds_map.metadata_pool));
1160 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1161 f->open_array_section("data_pool_ids");
1162 for (const auto &id : mds_map.data_pools) {
1163 f->dump_int("data_pool_id", id);
1164 }
1165 f->close_section();
1166
1167 f->open_array_section("data_pools");
1168 for (const auto &id : mds_map.data_pools) {
1169 const auto &name = mon->osdmon()->osdmap.get_pool_name(id);
1170 f->dump_string("data_pool", name);
1171 }
1172 f->close_section();
1173 }
1174 f->close_section();
1175 }
1176 f->close_section();
1177 f->flush(ds);
1178 } else {
1179 for (const auto &p : fsmap.filesystems) {
1180 const auto &fs = p.second;
1181 const MDSMap &mds_map = fs->mds_map;
1182 const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
1183 mds_map.metadata_pool);
1184
1185 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1186 << md_pool_name << ", data pools: [";
1187 for (const auto &id : mds_map.data_pools) {
1188 const string &pool_name = mon->osdmon()->osdmap.get_pool_name(id);
1189 ds << pool_name << " ";
1190 }
1191 ds << "]" << std::endl;
1192 }
1193
1194 if (fsmap.filesystems.empty()) {
1195 ds << "No filesystems enabled" << std::endl;
1196 }
1197 }
1198 r = 0;
1199 }
1200
1201 out:
1202 if (r != -1) {
1203 rdata.append(ds);
1204 string rs;
1205 getline(ss, rs);
1206 mon->reply_command(op, r, rs, rdata, get_last_committed());
1207 return true;
1208 } else
1209 return false;
1210 }
1211
1212 bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
1213 {
1214 const MDSMap::mds_info_t &info = fsmap.get_info_gid(gid);
1215 dout(10) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
1216
1217 epoch_t blacklist_epoch = 0;
1218 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1219 utime_t until = ceph_clock_now();
1220 until += g_conf->get_val<double>("mon_mds_blacklist_interval");
1221 blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
1222 }
1223
1224 fsmap.erase(gid, blacklist_epoch);
1225 last_beacon.erase(gid);
1226 if (pending_daemon_health.count(gid)) {
1227 pending_daemon_health.erase(gid);
1228 pending_daemon_health_rm.insert(gid);
1229 }
1230
1231 return blacklist_epoch != 0;
1232 }
1233
1234 mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
1235 {
1236 // Try parsing as a role
1237 mds_role_t role;
1238 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1239 int r = fsmap.parse_role(arg, &role, ignore_err);
1240 if (r == 0) {
1241 // See if a GID is assigned to this role
1242 const auto &fs = fsmap.get_filesystem(role.fscid);
1243 assert(fs != nullptr); // parse_role ensures it exists
1244 if (fs->mds_map.is_up(role.rank)) {
1245 dout(10) << __func__ << ": validated rank/GID " << role
1246 << " as a rank" << dendl;
1247 return fs->mds_map.get_mds_info(role.rank).global_id;
1248 }
1249 }
1250
1251 // Try parsing as a gid
1252 std::string err;
1253 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1254 if (!err.empty()) {
1255 // Not a role or a GID, try as a daemon name
1256 const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
1257 if (!mds_info) {
1258 ss << "MDS named '" << arg
1259 << "' does not exist, or is not up";
1260 return MDS_GID_NONE;
1261 }
1262 dout(10) << __func__ << ": resolved MDS name '" << arg
1263 << "' to GID " << mds_info->global_id << dendl;
1264 return mds_info->global_id;
1265 } else {
1266 // Not a role, but parses as a an integer, might be a GID
1267 dout(10) << __func__ << ": treating MDS reference '" << arg
1268 << "' as an integer " << maybe_gid << dendl;
1269
1270 if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
1271 return mds_gid_t(maybe_gid);
1272 }
1273 }
1274
1275 dout(1) << __func__ << ": rank/GID " << arg
1276 << " not a existent rank or GID" << dendl;
1277 return MDS_GID_NONE;
1278 }
1279
1280 int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
1281 const std::string &arg, MDSMap::mds_info_t *failed_info)
1282 {
1283 assert(failed_info != nullptr);
1284
1285 mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
1286 if (gid == MDS_GID_NONE) {
1287 return 0;
1288 }
1289 if (!mon->osdmon()->is_writeable()) {
1290 return -EAGAIN;
1291 }
1292
1293 // Take a copy of the info before removing the MDS from the map,
1294 // so that the caller knows which mds (if any) they ended up removing.
1295 *failed_info = fsmap.get_info_gid(gid);
1296
1297 fail_mds_gid(fsmap, gid);
1298 ss << "failed mds gid " << gid;
1299 assert(mon->osdmon()->is_writeable());
1300 request_proposal(mon->osdmon());
1301 return 0;
1302 }
1303
1304 bool MDSMonitor::prepare_command(MonOpRequestRef op)
1305 {
1306 op->mark_mdsmon_event(__func__);
1307 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
1308 int r = -EINVAL;
1309 stringstream ss;
1310 bufferlist rdata;
1311
1312 map<string, cmd_vartype> cmdmap;
1313 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1314 string rs = ss.str();
1315 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1316 return true;
1317 }
1318
1319 string prefix;
1320 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
1321
1322 /* Refuse access if message not associated with a valid session */
1323 MonSession *session = m->get_session();
1324 if (!session) {
1325 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1326 return true;
1327 }
1328
1329 auto &pending = get_pending_fsmap_writeable();
1330
1331 bool batched_propose = false;
1332 for (const auto &h : handlers) {
1333 if (h->can_handle(prefix)) {
1334 batched_propose = h->batched_propose();
1335 if (batched_propose) {
1336 paxos->plug();
1337 }
1338 r = h->handle(mon, pending, op, cmdmap, ss);
1339 if (batched_propose) {
1340 paxos->unplug();
1341 }
1342
1343 if (r == -EAGAIN) {
1344 // message has been enqueued for retry; return.
1345 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1346 return false;
1347 } else {
1348 if (r == 0) {
1349 // On successful updates, print the updated map
1350 print_map(pending);
1351 }
1352 // Successful or not, we're done: respond.
1353 goto out;
1354 }
1355 }
1356 }
1357
1358 r = filesystem_command(pending, op, prefix, cmdmap, ss);
1359 if (r >= 0) {
1360 goto out;
1361 } else if (r == -EAGAIN) {
1362 // Do not reply, the message has been enqueued for retry
1363 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1364 return false;
1365 } else if (r != -ENOSYS) {
1366 goto out;
1367 }
1368
1369 // Only handle legacy commands if there is a filesystem configured
1370 if (pending.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1371 if (pending.filesystems.size() == 0) {
1372 ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
1373 } else {
1374 ss << "No filesystem set for use with legacy commands";
1375 }
1376 r = -EINVAL;
1377 goto out;
1378 }
1379
1380 r = legacy_filesystem_command(pending, op, prefix, cmdmap, ss);
1381
1382 if (r == -ENOSYS && ss.str().empty()) {
1383 ss << "unrecognized command";
1384 }
1385
1386 out:
1387 dout(4) << __func__ << " done, r=" << r << dendl;
1388 /* Compose response */
1389 string rs;
1390 getline(ss, rs);
1391
1392 if (r >= 0) {
1393 // success.. delay reply
1394 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1395 get_last_committed() + 1));
1396 if (batched_propose) {
1397 force_immediate_propose();
1398 }
1399 return true;
1400 } else {
1401 // reply immediately
1402 mon->reply_command(op, r, rs, rdata, get_last_committed());
1403 return false;
1404 }
1405 }
1406
1407 int MDSMonitor::filesystem_command(
1408 FSMap &fsmap,
1409 MonOpRequestRef op,
1410 std::string const &prefix,
1411 map<string, cmd_vartype> &cmdmap,
1412 std::stringstream &ss)
1413 {
1414 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1415 op->mark_mdsmon_event(__func__);
1416 int r = 0;
1417 string whostr;
1418 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1419
1420 if (prefix == "mds stop" ||
1421 prefix == "mds deactivate") {
1422 mds_role_t role;
1423 r = fsmap.parse_role(whostr, &role, ss);
1424 if (r < 0 ) {
1425 return r;
1426 }
1427 const auto &fs = fsmap.get_filesystem(role.fscid);
1428
1429 if (!fs->mds_map.is_active(role.rank)) {
1430 r = -EEXIST;
1431 ss << "mds." << role << " not active ("
1432 << ceph_mds_state_name(fs->mds_map.get_state(role.rank)) << ")";
1433 } else if (fs->mds_map.get_root() == role.rank ||
1434 fs->mds_map.get_tableserver() == role.rank) {
1435 r = -EINVAL;
1436 ss << "can't tell the root (" << fs->mds_map.get_root()
1437 << ") or tableserver (" << fs->mds_map.get_tableserver()
1438 << ") to deactivate";
1439 } else if (role.rank != fs->mds_map.get_last_in_mds()) {
1440 r = -EINVAL;
1441 ss << "mds." << role << " doesn't have the max rank ("
1442 << fs->mds_map.get_last_in_mds() << ")";
1443 } else if (fs->mds_map.get_num_in_mds() <= size_t(fs->mds_map.get_max_mds())) {
1444 r = -EBUSY;
1445 ss << "must decrease max_mds or else MDS will immediately reactivate";
1446 } else {
1447 r = 0;
1448 mds_gid_t gid = fs->mds_map.up.at(role.rank);
1449 ss << "telling mds." << role << " "
1450 << fsmap.get_info_gid(gid).addr << " to deactivate";
1451
1452 fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
1453 info->state = MDSMap::STATE_STOPPING;
1454 });
1455 }
1456 } else if (prefix == "mds set_state") {
1457 mds_gid_t gid;
1458 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1459 ss << "error parsing 'gid' value '"
1460 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1461 return -EINVAL;
1462 }
1463 MDSMap::DaemonState state;
1464 if (!cmd_getval(g_ceph_context, cmdmap, "state", state)) {
1465 ss << "error parsing 'state' string value '"
1466 << cmd_vartype_stringify(cmdmap["state"]) << "'";
1467 return -EINVAL;
1468 }
1469 if (fsmap.gid_exists(gid)) {
1470 fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
1471 info->state = state;
1472 });
1473 ss << "set mds gid " << gid << " to state " << state << " "
1474 << ceph_mds_state_name(state);
1475 return 0;
1476 }
1477 } else if (prefix == "mds fail") {
1478 string who;
1479 cmd_getval(g_ceph_context, cmdmap, "who", who);
1480
1481 MDSMap::mds_info_t failed_info;
1482 r = fail_mds(fsmap, ss, who, &failed_info);
1483 if (r < 0 && r == -EAGAIN) {
1484 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1485 return -EAGAIN; // don't propose yet; wait for message to be retried
1486 } else if (r == 0) {
1487 // Only log if we really did something (not when was already gone)
1488 if (failed_info.global_id != MDS_GID_NONE) {
1489 mon->clog->info() << failed_info.human_name() << " marked failed by "
1490 << op->get_session()->entity_name;
1491 }
1492 }
1493 } else if (prefix == "mds rm") {
1494 mds_gid_t gid;
1495 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1496 ss << "error parsing 'gid' value '"
1497 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1498 return -EINVAL;
1499 }
1500 if (!fsmap.gid_exists(gid)) {
1501 ss << "mds gid " << gid << " dne";
1502 r = 0;
1503 } else {
1504 const auto &info = fsmap.get_info_gid(gid);
1505 MDSMap::DaemonState state = info.state;
1506 if (state > 0) {
1507 ss << "cannot remove active mds." << info.name
1508 << " rank " << info.rank;
1509 return -EBUSY;
1510 } else {
1511 fsmap.erase(gid, {});
1512 ss << "removed mds gid " << gid;
1513 return 0;
1514 }
1515 }
1516 } else if (prefix == "mds rmfailed") {
1517 string confirm;
1518 if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
1519 confirm != "--yes-i-really-mean-it") {
1520 ss << "WARNING: this can make your filesystem inaccessible! "
1521 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1522 return -EPERM;
1523 }
1524
1525 std::string role_str;
1526 cmd_getval(g_ceph_context, cmdmap, "who", role_str);
1527 mds_role_t role;
1528 int r = fsmap.parse_role(role_str, &role, ss);
1529 if (r < 0) {
1530 ss << "invalid role '" << role_str << "'";
1531 return -EINVAL;
1532 }
1533
1534 fsmap.modify_filesystem(
1535 role.fscid,
1536 [role](std::shared_ptr<Filesystem> fs)
1537 {
1538 fs->mds_map.failed.erase(role.rank);
1539 });
1540
1541 ss << "removed failed mds." << role;
1542 return 0;
1543 } else if (prefix == "mds compat rm_compat") {
1544 int64_t f;
1545 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1546 ss << "error parsing feature value '"
1547 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1548 return -EINVAL;
1549 }
1550 if (fsmap.compat.compat.contains(f)) {
1551 ss << "removing compat feature " << f;
1552 CompatSet modified = fsmap.compat;
1553 modified.compat.remove(f);
1554 fsmap.update_compat(modified);
1555 } else {
1556 ss << "compat feature " << f << " not present in " << fsmap.compat;
1557 }
1558 r = 0;
1559 } else if (prefix == "mds compat rm_incompat") {
1560 int64_t f;
1561 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1562 ss << "error parsing feature value '"
1563 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1564 return -EINVAL;
1565 }
1566 if (fsmap.compat.incompat.contains(f)) {
1567 ss << "removing incompat feature " << f;
1568 CompatSet modified = fsmap.compat;
1569 modified.incompat.remove(f);
1570 fsmap.update_compat(modified);
1571 } else {
1572 ss << "incompat feature " << f << " not present in " << fsmap.compat;
1573 }
1574 r = 0;
1575 } else if (prefix == "mds repaired") {
1576 std::string role_str;
1577 cmd_getval(g_ceph_context, cmdmap, "rank", role_str);
1578 mds_role_t role;
1579 r = fsmap.parse_role(role_str, &role, ss);
1580 if (r < 0) {
1581 return r;
1582 }
1583
1584 bool modified = fsmap.undamaged(role.fscid, role.rank);
1585 if (modified) {
1586 dout(4) << "repaired: restoring rank " << role << dendl;
1587 } else {
1588 dout(4) << "repaired: no-op on rank " << role << dendl;
1589 }
1590
1591 r = 0;
1592 } else {
1593 return -ENOSYS;
1594 }
1595
1596 return r;
1597 }
1598
1599 /**
1600 * Helper to legacy_filesystem_command
1601 */
1602 void MDSMonitor::modify_legacy_filesystem(FSMap &fsmap,
1603 std::function<void(std::shared_ptr<Filesystem> )> fn)
1604 {
1605 fsmap.modify_filesystem(
1606 fsmap.legacy_client_fscid,
1607 fn
1608 );
1609 }
1610
1611
1612
1613 /**
1614 * Handle a command that affects the filesystem (i.e. a filesystem
1615 * must exist for the command to act upon).
1616 *
1617 * @retval 0 Command was successfully handled and has side effects
1618 * @retval -EAGAIN Messages has been requeued for retry
1619 * @retval -ENOSYS Unknown command
1620 * @retval < 0 An error has occurred; **ss** may have been set.
1621 */
1622 int MDSMonitor::legacy_filesystem_command(
1623 FSMap &fsmap,
1624 MonOpRequestRef op,
1625 std::string const &prefix,
1626 map<string, cmd_vartype> &cmdmap,
1627 std::stringstream &ss)
1628 {
1629 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1630 op->mark_mdsmon_event(__func__);
1631 int r = 0;
1632 string whostr;
1633 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1634
1635 assert (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
1636
1637 if (prefix == "mds set_max_mds") {
1638 // NOTE: deprecated by "fs set max_mds"
1639 int64_t maxmds;
1640 if (!cmd_getval(g_ceph_context, cmdmap, "maxmds", maxmds) || maxmds <= 0) {
1641 return -EINVAL;
1642 }
1643
1644 const MDSMap& mdsmap =
1645 fsmap.filesystems.at(fsmap.legacy_client_fscid)->mds_map;
1646
1647 if (!mdsmap.allows_multimds() &&
1648 maxmds > mdsmap.get_max_mds() &&
1649 maxmds > 1) {
1650 ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
1651 return -EINVAL;
1652 }
1653
1654 if (maxmds > MAX_MDS) {
1655 ss << "may not have more than " << MAX_MDS << " MDS ranks";
1656 return -EINVAL;
1657 }
1658
1659 modify_legacy_filesystem(fsmap,
1660 [maxmds](std::shared_ptr<Filesystem> fs)
1661 {
1662 fs->mds_map.set_max_mds(maxmds);
1663 });
1664
1665 r = 0;
1666 ss << "max_mds = " << maxmds;
1667 } else if (prefix == "mds cluster_down") {
1668 // NOTE: deprecated by "fs set cluster_down"
1669 modify_legacy_filesystem(fsmap,
1670 [](std::shared_ptr<Filesystem> fs)
1671 {
1672 fs->mds_map.set_flag(CEPH_MDSMAP_DOWN);
1673 });
1674 ss << "marked fsmap DOWN";
1675 r = 0;
1676 } else if (prefix == "mds cluster_up") {
1677 // NOTE: deprecated by "fs set cluster_up"
1678 modify_legacy_filesystem(fsmap,
1679 [](std::shared_ptr<Filesystem> fs)
1680 {
1681 fs->mds_map.clear_flag(CEPH_MDSMAP_DOWN);
1682 });
1683 ss << "unmarked fsmap DOWN";
1684 r = 0;
1685 } else {
1686 return -ENOSYS;
1687 }
1688
1689 return r;
1690 }
1691
1692
1693 void MDSMonitor::check_subs()
1694 {
1695 std::list<std::string> types;
1696
1697 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1698 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1699 // filesystems. Build a list of all the types we service
1700 // subscriptions for.
1701 types.push_back("fsmap");
1702 types.push_back("fsmap.user");
1703 types.push_back("mdsmap");
1704 for (const auto &p : get_fsmap().filesystems) {
1705 const auto &fscid = p.first;
1706 std::ostringstream oss;
1707 oss << "mdsmap." << fscid;
1708 types.push_back(oss.str());
1709 }
1710
1711 for (const auto &type : types) {
1712 if (mon->session_map.subs.count(type) == 0)
1713 continue;
1714 xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
1715 while (!p.end()) {
1716 Subscription *sub = *p;
1717 ++p;
1718 check_sub(sub);
1719 }
1720 }
1721 }
1722
1723
1724 void MDSMonitor::check_sub(Subscription *sub)
1725 {
1726 dout(20) << __func__ << ": " << sub->type << dendl;
1727
1728 const auto &fsmap = get_fsmap();
1729
1730 if (sub->type == "fsmap") {
1731 if (sub->next <= fsmap.get_epoch()) {
1732 sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
1733 if (sub->onetime) {
1734 mon->session_map.remove_sub(sub);
1735 } else {
1736 sub->next = fsmap.get_epoch() + 1;
1737 }
1738 }
1739 } else if (sub->type == "fsmap.user") {
1740 if (sub->next <= fsmap.get_epoch()) {
1741 FSMapUser fsmap_u;
1742 fsmap_u.epoch = fsmap.get_epoch();
1743 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1744 for (const auto &p : fsmap.filesystems) {
1745 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
1746 fs_info.cid = p.second->fscid;
1747 fs_info.name = p.second->mds_map.fs_name;
1748 }
1749 sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
1750 if (sub->onetime) {
1751 mon->session_map.remove_sub(sub);
1752 } else {
1753 sub->next = fsmap.get_epoch() + 1;
1754 }
1755 }
1756 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
1757 if (sub->next > fsmap.get_epoch()) {
1758 return;
1759 }
1760
1761 const bool is_mds = sub->session->inst.name.is_mds();
1762 mds_gid_t mds_gid = MDS_GID_NONE;
1763 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1764 if (is_mds) {
1765 // What (if any) namespace are you assigned to?
1766 auto mds_info = fsmap.get_mds_info();
1767 for (const auto &p : mds_info) {
1768 if (p.second.addr == sub->session->inst.addr) {
1769 mds_gid = p.first;
1770 fscid = fsmap.mds_roles.at(mds_gid);
1771 }
1772 }
1773 } else {
1774 // You're a client. Did you request a particular
1775 // namespace?
1776 if (sub->type.find("mdsmap.") == 0) {
1777 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1778 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1779 std::string err;
1780 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1781 if (!err.empty()) {
1782 // Client asked for a non-existent namespace, send them nothing
1783 dout(1) << "Invalid client subscription '" << sub->type
1784 << "'" << dendl;
1785 return;
1786 }
1787 if (fsmap.filesystems.count(fscid) == 0) {
1788 // Client asked for a non-existent namespace, send them nothing
1789 // TODO: something more graceful for when a client has a filesystem
1790 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1791 // flag to MMDSMap?
1792 dout(1) << "Client subscribed to non-existent namespace '" <<
1793 fscid << "'" << dendl;
1794 return;
1795 }
1796 } else {
1797 // Unqualified request for "mdsmap": give it the one marked
1798 // for use by legacy clients.
1799 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1800 fscid = fsmap.legacy_client_fscid;
1801 } else {
1802 dout(1) << "Client subscribed for legacy filesystem but "
1803 "none is configured" << dendl;
1804 return;
1805 }
1806 }
1807 }
1808 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
1809
1810 // Work out the effective latest epoch
1811 const MDSMap *mds_map = nullptr;
1812 MDSMap null_map;
1813 null_map.compat = fsmap.compat;
1814 if (fscid == FS_CLUSTER_ID_NONE) {
1815 // For a client, we should have already dropped out
1816 assert(is_mds);
1817
1818 auto it = fsmap.standby_daemons.find(mds_gid);
1819 if (it != fsmap.standby_daemons.end()) {
1820 // For an MDS, we need to feed it an MDSMap with its own state in
1821 null_map.mds_info[mds_gid] = it->second;
1822 null_map.epoch = fsmap.standby_epochs.at(mds_gid);
1823 } else {
1824 null_map.epoch = fsmap.epoch;
1825 }
1826 mds_map = &null_map;
1827 } else {
1828 // Check the effective epoch
1829 mds_map = &fsmap.get_filesystem(fscid)->mds_map;
1830 }
1831
1832 assert(mds_map != nullptr);
1833 dout(10) << __func__ << " selected MDS map epoch " <<
1834 mds_map->epoch << " for namespace " << fscid << " for subscriber "
1835 << sub->session->inst.name << " who wants epoch " << sub->next << dendl;
1836
1837 if (sub->next > mds_map->epoch) {
1838 return;
1839 }
1840 auto msg = new MMDSMap(mon->monmap->fsid, mds_map);
1841
1842 sub->session->con->send_message(msg);
1843 if (sub->onetime) {
1844 mon->session_map.remove_sub(sub);
1845 } else {
1846 sub->next = mds_map->get_epoch() + 1;
1847 }
1848 }
1849 }
1850
1851
1852 void MDSMonitor::update_metadata(mds_gid_t gid,
1853 const map<string, string>& metadata)
1854 {
1855 if (metadata.empty()) {
1856 return;
1857 }
1858 pending_metadata[gid] = metadata;
1859
1860 MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
1861 bufferlist bl;
1862 ::encode(pending_metadata, bl);
1863 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1864 paxos->trigger_propose();
1865 }
1866
1867 void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
1868 {
1869 bool update = false;
1870 for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) {
1871 if (!fsmap.gid_exists(it->first)) {
1872 it = pending_metadata.erase(it);
1873 update = true;
1874 } else {
1875 ++it;
1876 }
1877 }
1878 if (!update)
1879 return;
1880 bufferlist bl;
1881 ::encode(pending_metadata, bl);
1882 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1883 }
1884
1885 int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1886 {
1887 bufferlist bl;
1888 int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1889 if (r) {
1890 dout(1) << "Unable to load 'last_metadata'" << dendl;
1891 return r;
1892 }
1893
1894 bufferlist::iterator it = bl.begin();
1895 ::decode(m, it);
1896 return 0;
1897 }
1898
1899 void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out)
1900 {
1901 map<mds_gid_t,Metadata> meta;
1902 load_metadata(meta);
1903 for (auto& p : meta) {
1904 auto q = p.second.find(field);
1905 if (q == p.second.end()) {
1906 (*out)["unknown"]++;
1907 } else {
1908 (*out)[q->second]++;
1909 }
1910 }
1911 }
1912
1913 void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
1914 {
1915 map<string,int> by_val;
1916 count_metadata(field, &by_val);
1917 f->open_object_section(field.c_str());
1918 for (auto& p : by_val) {
1919 f->dump_int(p.first.c_str(), p.second);
1920 }
1921 f->close_section();
1922 }
1923
1924 int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
1925 Formatter *f, ostream& err)
1926 {
1927 assert(f);
1928
1929 mds_gid_t gid = gid_from_arg(fsmap, who, err);
1930 if (gid == MDS_GID_NONE) {
1931 return -EINVAL;
1932 }
1933
1934 map<mds_gid_t, Metadata> metadata;
1935 if (int r = load_metadata(metadata)) {
1936 err << "Unable to load 'last_metadata'";
1937 return r;
1938 }
1939
1940 if (!metadata.count(gid)) {
1941 return -ENOENT;
1942 }
1943 const Metadata& m = metadata[gid];
1944 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1945 f->dump_string(p->first.c_str(), p->second);
1946 }
1947 return 0;
1948 }
1949
1950 int MDSMonitor::print_nodes(Formatter *f)
1951 {
1952 assert(f);
1953
1954 const auto &fsmap = get_fsmap();
1955
1956 map<mds_gid_t, Metadata> metadata;
1957 if (int r = load_metadata(metadata)) {
1958 return r;
1959 }
1960
1961 map<string, list<int> > mdses; // hostname => rank
1962 for (const auto &p : metadata) {
1963 const mds_gid_t& gid = p.first;
1964 const Metadata& m = p.second;
1965 Metadata::const_iterator hostname = m.find("hostname");
1966 if (hostname == m.end()) {
1967 // not likely though
1968 continue;
1969 }
1970 if (!fsmap.gid_exists(gid)) {
1971 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1972 continue;
1973 }
1974 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
1975 // FIXME: include filesystem name with rank here
1976 mdses[hostname->second].push_back(mds_info.rank);
1977 }
1978
1979 dump_services(f, mdses, "mds");
1980 return 0;
1981 }
1982
1983 /**
1984 * If a cluster is undersized (with respect to max_mds), then
1985 * attempt to find daemons to grow it.
1986 */
1987 bool MDSMonitor::maybe_expand_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
1988 {
1989 auto fs = fsmap.get_filesystem(fscid);
1990 auto &mds_map = fs->mds_map;
1991
1992 if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
1993 return false;
1994 }
1995
1996 int in = mds_map.get_num_in_mds();
1997 int max = mds_map.get_max_mds();
1998
1999 dout(20) << __func__ << " in " << in << " max " << max << dendl;
2000
2001 if (in < max) {
2002 mds_rank_t mds = mds_rank_t(0);
2003 string name;
2004 while (mds_map.is_in(mds)) {
2005 mds++;
2006 }
2007 mds_gid_t newgid = fsmap.find_replacement_for({fscid, mds},
2008 name, g_conf->mon_force_standby_active);
2009 if (newgid == MDS_GID_NONE) {
2010 return false;
2011 }
2012
2013 const auto &new_info = fsmap.get_info_gid(newgid);
2014 dout(1) << "assigned standby " << new_info.addr
2015 << " as mds." << mds << dendl;
2016
2017 mon->clog->info() << new_info.human_name() << " assigned to "
2018 "filesystem " << mds_map.fs_name << " as rank "
2019 << mds << " (now has " << mds_map.get_num_in_mds() + 1
2020 << " ranks)";
2021 fsmap.promote(newgid, fs, mds);
2022 return true;
2023 }
2024
2025 return false;
2026 }
2027
2028
2029 /**
2030 * If a daemon is laggy, and a suitable replacement
2031 * is available, fail this daemon (remove from map) and pass its
2032 * role to another daemon.
2033 */
2034 void MDSMonitor::maybe_replace_gid(FSMap &fsmap, mds_gid_t gid,
2035 const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose)
2036 {
2037 assert(mds_propose != nullptr);
2038 assert(osd_propose != nullptr);
2039
2040 const auto fscid = fsmap.mds_roles.at(gid);
2041
2042 // We will only take decisive action (replacing/removing a daemon)
2043 // if we have some indicating that some other daemon(s) are successfully
2044 // getting beacons through recently.
2045 mono_time latest_beacon = mono_clock::zero();
2046 for (const auto &p : last_beacon) {
2047 latest_beacon = std::max(p.second.stamp, latest_beacon);
2048 }
2049 mono_time now = mono_clock::now();
2050 chrono::duration<double> since = now-latest_beacon;
2051 const bool may_replace = since.count() <
2052 std::max(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5);
2053
2054 // are we in?
2055 // and is there a non-laggy standby that can take over for us?
2056 mds_gid_t sgid;
2057 if (info.rank >= 0 &&
2058 info.state != MDSMap::STATE_STANDBY &&
2059 info.state != MDSMap::STATE_STANDBY_REPLAY &&
2060 may_replace &&
2061 !fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
2062 (sgid = fsmap.find_replacement_for({fscid, info.rank}, info.name,
2063 g_conf->mon_force_standby_active)) != MDS_GID_NONE)
2064 {
2065
2066 MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
2067 dout(10) << " replacing " << gid << " " << info.addr << " mds."
2068 << info.rank << "." << info.inc
2069 << " " << ceph_mds_state_name(info.state)
2070 << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
2071
2072 mon->clog->warn() << info.human_name()
2073 << " is not responding, replacing it "
2074 << "as rank " << info.rank
2075 << " with standby " << si.human_name();
2076
2077 // Remember what NS the old one was in
2078 const fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
2079
2080 // Remove the old one
2081 *osd_propose |= fail_mds_gid(fsmap, gid);
2082
2083 // Promote the replacement
2084 auto fs = fsmap.filesystems.at(fscid);
2085 fsmap.promote(sgid, fs, info.rank);
2086
2087 *mds_propose = true;
2088 } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
2089 info.state == MDSMap::STATE_STANDBY) && may_replace) {
2090 dout(10) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
2091 << "." << info.inc << " " << ceph_mds_state_name(info.state)
2092 << dendl;
2093 mon->clog->info() << "Standby " << info.human_name() << " is not "
2094 "responding, dropping it";
2095 fail_mds_gid(fsmap, gid);
2096 *mds_propose = true;
2097 } else if (!info.laggy()) {
2098 dout(10) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
2099 << " " << ceph_mds_state_name(info.state)
2100 << " laggy" << dendl;
2101 fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
2102 info->laggy_since = ceph_clock_now();
2103 });
2104 *mds_propose = true;
2105 }
2106 }
2107
2108 bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, std::shared_ptr<Filesystem> &fs)
2109 {
2110 assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
2111
2112 bool do_propose = false;
2113
2114 // have a standby take over?
2115 set<mds_rank_t> failed;
2116 fs->mds_map.get_failed_mds_set(failed);
2117 if (!failed.empty()) {
2118 set<mds_rank_t>::iterator p = failed.begin();
2119 while (p != failed.end()) {
2120 mds_rank_t f = *p++;
2121 mds_gid_t sgid = fsmap.find_replacement_for({fs->fscid, f}, {},
2122 g_conf->mon_force_standby_active);
2123 if (sgid) {
2124 const MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
2125 dout(0) << " taking over failed mds." << f << " with " << sgid
2126 << "/" << si.name << " " << si.addr << dendl;
2127 mon->clog->info() << "Standby " << si.human_name()
2128 << " assigned to filesystem " << fs->mds_map.fs_name
2129 << " as rank " << f;
2130
2131 fsmap.promote(sgid, fs, f);
2132 do_propose = true;
2133 }
2134 }
2135 } else {
2136 // There were no failures to replace, so try using any available standbys
2137 // as standby-replay daemons.
2138
2139 // Take a copy of the standby GIDs so that we can iterate over
2140 // them while perhaps-modifying standby_daemons during the loop
2141 // (if we promote anyone they are removed from standby_daemons)
2142 std::vector<mds_gid_t> standby_gids;
2143 for (const auto &j : fsmap.standby_daemons) {
2144 standby_gids.push_back(j.first);
2145 }
2146
2147 for (const auto &gid : standby_gids) {
2148 const auto &info = fsmap.standby_daemons.at(gid);
2149 assert(info.state == MDSMap::STATE_STANDBY);
2150
2151 if (!info.standby_replay) {
2152 continue;
2153 }
2154
2155 /*
2156 * This mds is standby but has no rank assigned.
2157 * See if we can find it somebody to shadow
2158 */
2159 dout(20) << "gid " << gid << " is standby and following nobody" << dendl;
2160
2161 // standby for someone specific?
2162 if (info.standby_for_rank >= 0) {
2163 // The mds_info_t may or may not tell us exactly which filesystem
2164 // the standby_for_rank refers to: lookup via legacy_client_fscid
2165 mds_role_t target_role = {
2166 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
2167 fsmap.legacy_client_fscid : info.standby_for_fscid,
2168 info.standby_for_rank};
2169
2170 // It is possible that the map contains a standby_for_fscid
2171 // that doesn't correspond to an existing filesystem, especially
2172 // if we loaded from a version with a bug (#17466)
2173 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
2174 && !fsmap.filesystem_exists(info.standby_for_fscid)) {
2175 derr << "gid " << gid << " has invalid standby_for_fscid "
2176 << info.standby_for_fscid << dendl;
2177 continue;
2178 }
2179
2180 // If we managed to resolve a full target role
2181 if (target_role.fscid != FS_CLUSTER_ID_NONE) {
2182 const auto &fs = fsmap.get_filesystem(target_role.fscid);
2183 if (fs->mds_map.is_followable(target_role.rank)) {
2184 do_propose |= try_standby_replay(fsmap, info, *fs,
2185 fs->mds_map.get_info(target_role.rank));
2186 }
2187 }
2188
2189 continue;
2190 }
2191
2192 // check everyone
2193 for (const auto &p : fsmap.filesystems) {
2194 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
2195 info.standby_for_fscid != p.first)
2196 continue;
2197
2198 bool assigned = false;
2199 const auto &fs = p.second;
2200 const MDSMap &mds_map = fs->mds_map;
2201 for (const auto &mds_i : mds_map.mds_info) {
2202 const MDSMap::mds_info_t &cand_info = mds_i.second;
2203 if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
2204 if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
2205 info.standby_for_rank != MDS_RANK_NONE) {
2206 continue; // we're supposed to follow someone else
2207 }
2208
2209 if (try_standby_replay(fsmap, info, *fs, cand_info)) {
2210 assigned = true;
2211 break;
2212 }
2213 }
2214 }
2215 if (assigned) {
2216 do_propose = true;
2217 break;
2218 }
2219 }
2220 }
2221 }
2222
2223 return do_propose;
2224 }
2225
2226 void MDSMonitor::tick()
2227 {
2228 // make sure mds's are still alive
2229 // ...if i am an active leader
2230
2231 if (!is_active() || !is_leader()) return;
2232
2233 auto &pending = get_pending_fsmap_writeable();
2234
2235 bool do_propose = false;
2236
2237 do_propose |= pending.check_health();
2238
2239 // expand mds cluster (add new nodes to @in)?
2240 for (auto &p : pending.filesystems) {
2241 do_propose |= maybe_expand_cluster(pending, p.second->fscid);
2242 }
2243
2244 mono_time now = mono_clock::now();
2245 if (last_tick == decltype(last_tick)::min()) {
2246 last_tick = now;
2247 }
2248 chrono::duration<double> since_last = now-last_tick;
2249
2250 if (since_last.count() >
2251 (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
2252 // This case handles either local slowness (calls being delayed
2253 // for whatever reason) or cluster election slowness (a long gap
2254 // between calls while an election happened)
2255 dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
2256 "(slow election?) of " << now - last_tick << " seconds" << dendl;
2257 for (auto &p : last_beacon) {
2258 p.second.stamp = now;
2259 }
2260 }
2261
2262 last_tick = now;
2263
2264 // make sure last_beacon is fully populated
2265 for (auto &p : pending.mds_roles) {
2266 auto &gid = p.first;
2267 last_beacon.emplace(std::piecewise_construct,
2268 std::forward_as_tuple(gid),
2269 std::forward_as_tuple(mono_clock::now(), 0));
2270 }
2271
2272
2273 // check beacon timestamps
2274 bool propose_osdmap = false;
2275 bool osdmap_writeable = mon->osdmon()->is_writeable();
2276 for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
2277 mds_gid_t gid = it->first;
2278 auto beacon_info = it->second;
2279 chrono::duration<double> since_last = now-beacon_info.stamp;
2280
2281 if (!pending.gid_exists(gid)) {
2282 // clean it out
2283 it = last_beacon.erase(it);
2284 continue;
2285 }
2286
2287
2288 if (since_last.count() >= g_conf->mds_beacon_grace) {
2289 auto &info = pending.get_info_gid(gid);
2290 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2291 << " (gid: " << gid << " addr: " << info.addr
2292 << " state: " << ceph_mds_state_name(info.state) << ")"
2293 << " since " << since_last.count() << "s" << dendl;
2294 // If the OSDMap is writeable, we can blacklist things, so we can
2295 // try failing any laggy MDS daemons. Consider each one for failure.
2296 if (osdmap_writeable) {
2297 maybe_replace_gid(pending, gid, info, &do_propose, &propose_osdmap);
2298 }
2299 }
2300
2301 ++it;
2302 }
2303 if (propose_osdmap) {
2304 request_proposal(mon->osdmon());
2305 }
2306
2307 for (auto &p : pending.filesystems) {
2308 auto &fs = p.second;
2309 if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
2310 do_propose |= maybe_promote_standby(pending, fs);
2311 }
2312 }
2313
2314 if (do_propose) {
2315 propose_pending();
2316 }
2317 }
2318
2319 /**
2320 * finfo: the would-be follower
2321 * leader_fs: the Filesystem containing the would-be leader
2322 * ainfo: the would-be leader
2323 */
2324 bool MDSMonitor::try_standby_replay(
2325 FSMap &fsmap,
2326 const MDSMap::mds_info_t& finfo,
2327 const Filesystem &leader_fs,
2328 const MDSMap::mds_info_t& ainfo)
2329 {
2330 // someone else already following?
2331 if (leader_fs.has_standby_replay(ainfo.global_id)) {
2332 dout(20) << " mds." << ainfo.rank << " already has a follower" << dendl;
2333 return false;
2334 } else {
2335 // Assign the new role to the standby
2336 dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
2337 fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
2338 return true;
2339 }
2340 }
2341
2342 MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
2343 : PaxosService(mn, p, service_name)
2344 {
2345 handlers = FileSystemCommandHandler::load(p);
2346 }
2347
2348 void MDSMonitor::on_restart()
2349 {
2350 // Clear out the leader-specific state.
2351 last_tick = mono_clock::now();
2352 last_beacon.clear();
2353 }
2354