]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MDSMonitor.cc
update sources to 12.2.10
[ceph.git] / ceph / src / mon / MDSMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <sstream>
16 #include <boost/utility.hpp>
17 #include <boost/regex.hpp>
18
19 #include "MDSMonitor.h"
20 #include "FSCommands.h"
21 #include "Monitor.h"
22 #include "MonitorDBStore.h"
23 #include "OSDMonitor.h"
24 #include "PGMonitor.h"
25
26 #include "common/strtol.h"
27 #include "common/perf_counters.h"
28 #include "common/config.h"
29 #include "common/cmdparse.h"
30 #include "messages/MMDSMap.h"
31 #include "messages/MFSMap.h"
32 #include "messages/MFSMapUser.h"
33 #include "messages/MMDSLoadTargets.h"
34 #include "messages/MMonCommand.h"
35 #include "messages/MGenericMessage.h"
36
37 #include "include/assert.h"
38 #include "include/str_list.h"
39 #include "include/stringify.h"
40 #include "mds/mdstypes.h"
41 #include "Session.h"
42
43 #define dout_subsys ceph_subsys_mon
44 #undef dout_prefix
45 #define dout_prefix _prefix(_dout, mon, get_fsmap())
46 static ostream& _prefix(std::ostream *_dout, Monitor *mon, const FSMap& fsmap) {
47 return *_dout << "mon." << mon->name << "@" << mon->rank
48 << "(" << mon->get_state_name()
49 << ").mds e" << fsmap.get_epoch() << " ";
50 }
51
52 static const string MDS_METADATA_PREFIX("mds_metadata");
53 static const string MDS_HEALTH_PREFIX("mds_health");
54
55
56 /*
57 * Specialized implementation of cmd_getval to allow us to parse
58 * out strongly-typedef'd types
59 */
60 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
61 const std::string& k, mds_gid_t &val)
62 {
63 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
64 }
65
66 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
67 const std::string& k, mds_rank_t &val)
68 {
69 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
70 }
71
72 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
73 const std::string& k, MDSMap::DaemonState &val)
74 {
75 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
76 }
77
78 // my methods
79
80 void MDSMonitor::print_map(const FSMap &m, int dbl)
81 {
82 dout(dbl) << "print_map\n";
83 m.print(*_dout);
84 *_dout << dendl;
85 }
86
87 // service methods
88 void MDSMonitor::create_initial()
89 {
90 dout(10) << "create_initial" << dendl;
91 }
92
93 void MDSMonitor::get_store_prefixes(std::set<string>& s)
94 {
95 s.insert(service_name);
96 s.insert(MDS_METADATA_PREFIX);
97 s.insert(MDS_HEALTH_PREFIX);
98 }
99
100 void MDSMonitor::update_from_paxos(bool *need_bootstrap)
101 {
102 version_t version = get_last_committed();
103 if (version == get_fsmap().epoch)
104 return;
105
106 dout(10) << __func__ << " version " << version
107 << ", my e " << get_fsmap().epoch << dendl;
108 assert(version > get_fsmap().epoch);
109
110 load_health();
111
112 // read and decode
113 bufferlist fsmap_bl;
114 fsmap_bl.clear();
115 int err = get_version(version, fsmap_bl);
116 assert(err == 0);
117
118 assert(fsmap_bl.length() > 0);
119 dout(10) << __func__ << " got " << version << dendl;
120 PaxosFSMap::decode(fsmap_bl);
121
122 // new map
123 dout(0) << "new map" << dendl;
124 print_map(get_fsmap(), 0);
125 if (!g_conf->mon_mds_skip_sanity) {
126 get_fsmap().sanity();
127 }
128
129 check_subs();
130 update_logger();
131 }
132
133 void MDSMonitor::init()
134 {
135 (void)load_metadata(pending_metadata);
136 }
137
138 void MDSMonitor::create_pending()
139 {
140 auto &fsmap = PaxosFSMap::create_pending();
141
142 if (mon->osdmon()->is_readable()) {
143 const auto &osdmap = mon->osdmon()->osdmap;
144 fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
145 }
146
147 dout(10) << "create_pending e" << fsmap.epoch << dendl;
148 }
149
150 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
151 {
152 auto &pending = get_pending_fsmap_writeable();
153 auto &epoch = pending.epoch;
154
155 dout(10) << "encode_pending e" << epoch << dendl;
156
157 // print map iff 'debug mon = 30' or higher
158 print_map(get_pending_fsmap(), 30);
159 if (!g_conf->mon_mds_skip_sanity) {
160 pending.sanity();
161 }
162
163 // Set 'modified' on maps modified this epoch
164 for (auto &p : pending.filesystems) {
165 if (p.second->mds_map.epoch == epoch) {
166 p.second->mds_map.modified = ceph_clock_now();
167 }
168 }
169
170 // apply to paxos
171 assert(get_last_committed() + 1 == pending.epoch);
172 bufferlist pending_bl;
173 pending.encode(pending_bl, mon->get_quorum_con_features());
174
175 /* put everything in the transaction */
176 put_version(t, pending.epoch, pending_bl);
177 put_last_committed(t, pending.epoch);
178
179 // Encode MDSHealth data
180 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
181 i != pending_daemon_health.end(); ++i) {
182 bufferlist bl;
183 i->second.encode(bl);
184 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
185 }
186
187 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
188 i != pending_daemon_health_rm.end(); ++i) {
189 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
190 }
191 pending_daemon_health_rm.clear();
192 remove_from_metadata(pending, t);
193
194 // health
195 health_check_map_t new_checks;
196 const auto &info_map = pending.get_mds_info();
197 for (const auto &i : info_map) {
198 const auto &gid = i.first;
199 const auto &info = i.second;
200 if (pending_daemon_health_rm.count(gid)) {
201 continue;
202 }
203 MDSHealth health;
204 auto p = pending_daemon_health.find(gid);
205 if (p != pending_daemon_health.end()) {
206 health = p->second;
207 } else {
208 bufferlist bl;
209 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
210 if (!bl.length()) {
211 derr << "Missing health data for MDS " << gid << dendl;
212 continue;
213 }
214 bufferlist::iterator bl_i = bl.begin();
215 health.decode(bl_i);
216 }
217 for (const auto &metric : health.metrics) {
218 const int rank = info.rank;
219 health_check_t *check = &new_checks.get_or_add(
220 mds_metric_name(metric.type),
221 metric.sev,
222 mds_metric_summary(metric.type));
223 ostringstream ss;
224 ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
225 bool first = true;
226 for (auto &p : metric.metadata) {
227 if (first) {
228 ss << " ";
229 } else {
230 ss << ", ";
231 }
232 ss << p.first << ": " << p.second;
233 first = false;
234 }
235 check->detail.push_back(ss.str());
236 }
237 }
238 pending.get_health_checks(&new_checks);
239 for (auto& p : new_checks.checks) {
240 p.second.summary = boost::regex_replace(
241 p.second.summary,
242 boost::regex("%num%"),
243 stringify(p.second.detail.size()));
244 p.second.summary = boost::regex_replace(
245 p.second.summary,
246 boost::regex("%plurals%"),
247 p.second.detail.size() > 1 ? "s" : "");
248 p.second.summary = boost::regex_replace(
249 p.second.summary,
250 boost::regex("%isorare%"),
251 p.second.detail.size() > 1 ? "are" : "is");
252 p.second.summary = boost::regex_replace(
253 p.second.summary,
254 boost::regex("%hasorhave%"),
255 p.second.detail.size() > 1 ? "have" : "has");
256 }
257 encode_health(new_checks, t);
258 }
259
260 version_t MDSMonitor::get_trim_to()
261 {
262 version_t floor = 0;
263 if (g_conf->mon_mds_force_trim_to > 0 &&
264 g_conf->mon_mds_force_trim_to < (int)get_last_committed()) {
265 floor = g_conf->mon_mds_force_trim_to;
266 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
267 << floor << dendl;
268 }
269
270 unsigned max = g_conf->mon_max_mdsmap_epochs;
271 version_t last = get_last_committed();
272
273 if (last - get_first_committed() > max && floor < last - max)
274 return last - max;
275 return floor;
276 }
277
278 void MDSMonitor::update_logger()
279 {
280 dout(10) << "update_logger" << dendl;
281
282 const auto &fsmap = get_fsmap();
283
284 uint64_t up = 0;
285 uint64_t in = 0;
286 uint64_t failed = 0;
287 for (const auto &i : fsmap.filesystems) {
288 const MDSMap &mds_map = i.second->mds_map;
289
290 up += mds_map.get_num_up_mds();
291 in += mds_map.get_num_in_mds();
292 failed += mds_map.get_num_failed_mds();
293 }
294 mon->cluster_logger->set(l_cluster_num_mds_up, up);
295 mon->cluster_logger->set(l_cluster_num_mds_in, in);
296 mon->cluster_logger->set(l_cluster_num_mds_failed, failed);
297 mon->cluster_logger->set(l_cluster_mds_epoch, fsmap.get_epoch());
298 }
299
300 bool MDSMonitor::preprocess_query(MonOpRequestRef op)
301 {
302 op->mark_mdsmon_event(__func__);
303 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
304 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
305
306 switch (m->get_type()) {
307
308 case MSG_MDS_BEACON:
309 return preprocess_beacon(op);
310
311 case MSG_MON_COMMAND:
312 return preprocess_command(op);
313
314 case MSG_MDS_OFFLOAD_TARGETS:
315 return preprocess_offload_targets(op);
316
317 default:
318 ceph_abort();
319 return true;
320 }
321 }
322
323 void MDSMonitor::_note_beacon(MMDSBeacon *m)
324 {
325 mds_gid_t gid = mds_gid_t(m->get_global_id());
326 version_t seq = m->get_seq();
327
328 dout(5) << "_note_beacon " << *m << " noting time" << dendl;
329 auto &beacon = last_beacon[gid];
330 beacon.stamp = mono_clock::now();
331 beacon.seq = seq;
332 }
333
// First-pass handling of an MMDSBeacon against the committed FSMap.
//
// Returns true when the beacon has been fully dealt with here: either a
// reply was sent (the `reply` label, or a null MDSMap for an unknown
// non-booting daemon) or it was dropped via mon->no_reply() (the `ignore`
// label).  Returns false when the beacon must go on to the update path:
// we are not the leader, the daemon is booting and not yet in the map, it
// was laggy, it requests an accepted state change, or its health changed.
//
// Control flow uses two labels at the bottom of the function; every local
// is declared before the first `goto` so that no jump crosses an
// initialization.
334 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
335 {
336 op->mark_mdsmon_event(__func__);
337 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
338 MDSMap::DaemonState state = m->get_state();
339 mds_gid_t gid = m->get_global_id();
340 version_t seq = m->get_seq();
341 MDSMap::mds_info_t info;
// Stays 0 until the daemon has been found in the map; the `reply` path
// asserts that it was set.
342 epoch_t effective_epoch = 0;
343
344 const auto &fsmap = get_fsmap();
345
346 // check privileges, ignore if fails
347 MonSession *session = m->get_session();
348 assert(session);
349 if (!session->is_capable("mds", MON_CAP_X)) {
350 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
351 << session->caps << dendl;
352 goto ignore;
353 }
354
// Drop beacons whose fsid does not match this monitor's monmap.
355 if (m->get_fsid() != mon->monmap->fsid) {
356 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
357 goto ignore;
358 }
359
360 dout(5) << "preprocess_beacon " << *m
361 << " from " << m->get_orig_source_inst()
362 << " " << m->get_compat()
363 << dendl;
364
365 // make sure the address has a port
366 if (m->get_orig_source_addr().get_port() == 0) {
367 dout(1) << " ignoring boot message without a port" << dendl;
368 goto ignore;
369 }
370
371 // check compat
372 if (!m->get_compat().writeable(fsmap.compat)) {
373 dout(1) << " mds " << m->get_source_inst() << " can't write to fsmap " << fsmap.compat << dendl;
374 goto ignore;
375 }
376
377 // fw to leader?
378 if (!is_leader())
379 return false;
380
381 // booted, but not in map?
382 if (!fsmap.gid_exists(gid)) {
383 if (state != MDSMap::STATE_BOOT) {
384 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
385 << ceph_mds_state_name(state) << ")" << dendl;
386
387 /* We can't send an MDSMap this MDS was a part of because we no longer
388 * know which FS it was part of. Nor does this matter. Sending an empty
389 * MDSMap is sufficient for getting the MDS to respawn.
390 */
391 MDSMap null_map;
392 null_map.epoch = fsmap.epoch;
393 null_map.compat = fsmap.compat;
394 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
395 return true;
396 } else {
397 return false; // not booted yet.
398 }
399 }
400 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
401 info = fsmap.get_info_gid(gid);
402
403 // old seq?
404 if (info.state_seq > seq) {
405 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
406 goto ignore;
407 }
408
409 // Work out the latest epoch that this daemon should have seen
// A daemon with no filesystem uses its entry in standby_epochs; otherwise
// the epoch of its filesystem's MDSMap is used.  Both lookups use .at().
410 {
411 fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
412 if (fscid == FS_CLUSTER_ID_NONE) {
413 effective_epoch = fsmap.standby_epochs.at(gid);
414 } else {
415 effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
416 }
417 if (effective_epoch != m->get_last_epoch_seen()) {
418 dout(10) << "mds_beacon " << *m
419 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
420 goto reply;
421 }
422 }
423
424 if (info.laggy()) {
425 _note_beacon(m);
426 return false; // no longer laggy, need to update map.
427 }
428 if (state == MDSMap::STATE_BOOT) {
429 // ignore, already booted.
430 goto ignore;
431 }
432 // is there a state change here?
433 if (info.state != state) {
434 // legal state change?
// NOTE(review): `state > 0` presumably selects the rank-holding states;
// confirm against the DaemonState enum values.
435 if ((info.state == MDSMap::STATE_STANDBY ||
436 info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
437 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
438 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
439 goto reply;
440 }
441
442 if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
443 && info.rank != MDS_RANK_NONE)
444 {
445 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
446 "held rank " << info.rank << " while requesting state "
447 << ceph_mds_state_name(state) << dendl;
448 goto reply;
449 }
450
// Any other state change is passed on to the update path.
451 _note_beacon(m);
452 return false;
453 }
454
455 // Comparing known daemon health with m->get_health()
456 // and return false (i.e. require proposal) if they
457 // do not match, to update our stored
// Note: pending_daemon_health is a std::map indexed with operator[], so a
// default-constructed entry is created here when none exists for this gid.
458 if (!(pending_daemon_health[gid] == m->get_health())) {
459 dout(10) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
460 _note_beacon(m);
461 return false;
462 }
463
464 reply:
465 // note time and reply
466 assert(effective_epoch > 0);
467 _note_beacon(m);
468 mon->send_reply(op,
469 new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
470 effective_epoch, state, seq,
471 CEPH_FEATURES_SUPPORTED_DEFAULT));
472 return true;
473
474 ignore:
475 // I won't reply this beacon, drop it.
476 mon->no_reply(op);
477 return true;
478 }
479
480 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
481 {
482 op->mark_mdsmon_event(__func__);
483 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
484 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
485
486 const auto &fsmap = get_fsmap();
487
488 // check privileges, ignore message if fails
489 MonSession *session = m->get_session();
490 if (!session)
491 goto ignore;
492 if (!session->is_capable("mds", MON_CAP_X)) {
493 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
494 << session->caps << dendl;
495 goto ignore;
496 }
497
498 if (fsmap.gid_exists(m->global_id) &&
499 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
500 goto ignore;
501
502 return false;
503
504 ignore:
505 mon->no_reply(op);
506 return true;
507 }
508
509
510 bool MDSMonitor::prepare_update(MonOpRequestRef op)
511 {
512 op->mark_mdsmon_event(__func__);
513 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
514 dout(7) << "prepare_update " << *m << dendl;
515
516 switch (m->get_type()) {
517
518 case MSG_MDS_BEACON:
519 return prepare_beacon(op);
520
521 case MSG_MON_COMMAND:
522 return prepare_command(op);
523
524 case MSG_MDS_OFFLOAD_TARGETS:
525 return prepare_offload_targets(op);
526
527 default:
528 ceph_abort();
529 }
530
531 return true;
532 }
533
// Update-path handling of an MMDSBeacon: applies the beacon to the pending
// (writeable) FSMap.
//
// Overview:
//   * logs health metrics that appeared or disappeared, by metric type, and
//     stores the beacon's health in pending_daemon_health;
//   * STATE_BOOT: optionally fails existing daemons with the same name, adds
//     the daemon to the pending map as a standby, resolves standby_for_name,
//     starts its beacon timer, updates compat and metadata;
//   * any other state: validates and applies the requested state change, with
//     dedicated handling for STOPPED, DAMAGED and DNE.
//
// Returns false only on the paths that wait for the OSD monitor to become
// writeable (the op is retried through C_RetryMessage).  Every other path
// returns true; several of them return early without queuing the final
// proposal callback at the bottom of the function.
534 bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
535 {
536 op->mark_mdsmon_event(__func__);
537 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
538 // -- this is an update --
539 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source_inst() << dendl;
540 entity_addr_t addr = m->get_orig_source_inst().addr;
541 mds_gid_t gid = m->get_global_id();
542 MDSMap::DaemonState state = m->get_state();
543 version_t seq = m->get_seq();
544
545 auto &pending = get_pending_fsmap_writeable();
546
547 dout(15) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
548
549 // Calculate deltas of health metrics created and removed
550 // Do this by type rather than MDSHealthMetric equality, because messages can
551 // change a lot when they include e.g. a number of items.
// pending_daemon_health is indexed with operator[], so an empty entry is
// created for this gid if none exists yet.
552 const auto &old_health = pending_daemon_health[gid].metrics;
553 const auto &new_health = m->get_health().metrics;
554
555 std::set<mds_metric_t> old_types;
556 for (const auto &i : old_health) {
557 old_types.insert(i.type);
558 }
559
560 std::set<mds_metric_t> new_types;
561 for (const auto &i : new_health) {
562 new_types.insert(i.type);
563 }
564
// Newly appeared metric types are only written to the debug log (level 10).
565 for (const auto &new_metric: new_health) {
566 if (old_types.count(new_metric.type) == 0) {
567 dout(10) << "MDS health message (" << m->get_orig_source_inst().name
568 << "): " << new_metric.sev << " " << new_metric.message << dendl;
569 }
570 }
571
572 // Log the disappearance of health messages at INFO
573 for (const auto &old_metric : old_health) {
574 if (new_types.count(old_metric.type) == 0) {
575 mon->clog->info() << "MDS health message cleared ("
576 << m->get_orig_source_inst().name << "): " << old_metric.message;
577 }
578 }
579
580 // Store health
// This assignment replaces the entry that old_health refers to; old_health
// is not used past this point.
581 pending_daemon_health[gid] = m->get_health();
582
583 // boot?
584 if (state == MDSMap::STATE_BOOT) {
585 // zap previous instance of this name?
586 if (g_conf->mds_enforce_unique_name) {
587 bool failed_mds = false;
// Keep failing daemons until no gid with this name remains in the pending
// map.  Failing a daemon requires a writeable OSD monitor; otherwise the
// whole message is retried later.
588 while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
589 if (!mon->osdmon()->is_writeable()) {
590 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
591 return false;
592 }
593 const MDSMap::mds_info_t &existing_info =
594 pending.get_info_gid(existing);
595 mon->clog->info() << existing_info.human_name() << " restarted";
596 fail_mds_gid(pending, existing);
597 failed_mds = true;
598 }
599 if (failed_mds) {
600 assert(mon->osdmon()->is_writeable());
601 request_proposal(mon->osdmon());
602 }
603 }
604
605 // Add this daemon to the map
606 if (pending.mds_roles.count(gid) == 0) {
607 MDSMap::mds_info_t new_info;
608 new_info.global_id = gid;
609 new_info.name = m->get_name();
610 new_info.addr = addr;
611 new_info.mds_features = m->get_mds_features();
612 new_info.state = MDSMap::STATE_STANDBY;
613 new_info.state_seq = seq;
614 new_info.standby_for_rank = m->get_standby_for_rank();
615 new_info.standby_for_name = m->get_standby_for_name();
616 new_info.standby_for_fscid = m->get_standby_for_fscid();
617 new_info.standby_replay = m->get_standby_replay();
618 pending.insert(new_info);
619 }
620
621 // Resolve standby_for_name to a rank
// Only done when the named daemon is found in the pending map and holds a
// rank (rank >= 0); the follower then takes over that rank and fscid.
622 const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
623 if (!info.standby_for_name.empty()) {
624 const MDSMap::mds_info_t *leaderinfo = pending.find_by_name(
625 info.standby_for_name);
626 if (leaderinfo && (leaderinfo->rank >= 0)) {
627 const auto &fscid = pending.mds_roles.at(leaderinfo->global_id);
628
629 pending.modify_daemon(gid, [fscid, leaderinfo](
630 MDSMap::mds_info_t *info) {
631 info->standby_for_rank = leaderinfo->rank;
632 info->standby_for_fscid = fscid;
633 });
634 }
635 }
636
637 // initialize the beacon timer
638 auto &beacon = last_beacon[gid];
639 beacon.stamp = mono_clock::now();
640 beacon.seq = seq;
641
642 // new incompat?
643 if (!pending.compat.writeable(m->get_compat())) {
644 dout(10) << " fsmap " << pending.compat
645 << " can't write to new mds' " << m->get_compat()
646 << ", updating fsmap and killing old mds's"
647 << dendl;
648 pending.update_compat(m->get_compat());
649 }
650
651 update_metadata(m->get_global_id(), m->get_sys_info());
652 } else {
653 // state update
654
655 if (!pending.gid_exists(gid)) {
656 /* gid has been removed from pending, send null map */
657 dout(5) << "mds_beacon " << *m << " is not in fsmap (state "
658 << ceph_mds_state_name(state) << ")" << dendl;
659
660 /* We can't send an MDSMap this MDS was a part of because we no longer
661 * know which FS it was part of. Nor does this matter. Sending an empty
662 * MDSMap is sufficient for getting the MDS to respawn.
663 */
// The null map is built inside the callback, from the FSMap as it is once
// the proposal has finished.
664 wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
665 if (r >= 0) {
666 const auto& fsmap = get_fsmap();
667 MDSMap null_map;
668 null_map.epoch = fsmap.epoch;
669 null_map.compat = fsmap.compat;
670 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
671 } else {
672 dispatch(op); // try again
673 }
674 }));
675 return true;
676 }
677
// `info` is a reference into the pending map and is read throughout the
// rest of this branch.
678 const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
679 // Old MDS daemons don't mention that they're standby replay until
680 // after they've sent their boot beacon, so update this field.
681 if (info.standby_replay != m->get_standby_replay()) {
682 pending.modify_daemon(info.global_id, [&m](
683 MDSMap::mds_info_t *i)
684 {
685 i->standby_replay = m->get_standby_replay();
686 });
687 }
688
// This path returns without sending a reply or calling no_reply().
689 if (info.state == MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPED ) {
690 // we can't transition to any other states from STOPPING
691 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
692 << dendl;
693 _note_beacon(m);
694 return true;
695 }
696
697 if (info.laggy()) {
698 dout(1) << "prepare_beacon clearing laggy flag on " << addr << dendl;
699 pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
700 {
701 info->clear_laggy();
702 }
703 );
704 }
705
706 dout(5) << "prepare_beacon mds." << info.rank
707 << " " << ceph_mds_state_name(info.state)
708 << " -> " << ceph_mds_state_name(state)
709 << " standby_for_rank=" << m->get_standby_for_rank()
710 << dendl;
711 if (state == MDSMap::STATE_STOPPED) {
712 const auto fscid = pending.mds_roles.at(gid);
713 const auto &fs = pending.get_filesystem(fscid);
714
715 mon->clog->info() << info.human_name() << " finished "
716 << "deactivating rank " << info.rank << " in filesystem "
717 << fs->mds_map.fs_name << " (now has "
718 << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
719
// NOTE(review): `info` refers to this gid's entry in the pending map and
// is not read again after stop(); presumably the entry is gone afterwards.
// stop() returns further gids that were erased; this gid is appended so
// that all of them get the same beacon/health cleanup.
720 auto erased = pending.stop(gid);
721 erased.push_back(gid);
722
723 for (const auto &erased_gid : erased) {
724 last_beacon.erase(erased_gid);
725 if (pending_daemon_health.count(erased_gid)) {
726 pending_daemon_health.erase(erased_gid);
727 pending_daemon_health_rm.insert(erased_gid);
728 }
729 }
730
731
732 } else if (state == MDSMap::STATE_DAMAGED) {
733 if (!mon->osdmon()->is_writeable()) {
734 dout(1) << __func__ << ": DAMAGED from rank " << info.rank
735 << " waiting for osdmon writeable to blacklist it" << dendl;
736 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
737 return false;
738 }
739
740 // Record this MDS rank as damaged, so that other daemons
741 // won't try to run it.
742 dout(0) << __func__ << ": marking rank "
743 << info.rank << " damaged" << dendl;
744
// Blacklist the daemon's address until now + mon_mds_blacklist_interval.
745 utime_t until = ceph_clock_now();
746 until += g_conf->get_val<double>("mon_mds_blacklist_interval");
747 const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
748 request_proposal(mon->osdmon());
749 pending.damaged(gid, blacklist_epoch);
750 last_beacon.erase(gid);
751
752 // Respond to MDS, so that it knows it can continue to shut down
753 mon->send_reply(op,
754 new MMDSBeacon(
755 mon->monmap->fsid, m->get_global_id(),
756 m->get_name(), pending.get_epoch(), state, seq,
757 CEPH_FEATURES_SUPPORTED_DEFAULT));
758 } else if (state == MDSMap::STATE_DNE) {
759 if (!mon->osdmon()->is_writeable()) {
760 dout(1) << __func__ << ": DNE from rank " << info.rank
761 << " waiting for osdmon writeable to blacklist it" << dendl;
762 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
763 return false;
764 }
765
766 fail_mds_gid(pending, gid);
767 assert(mon->osdmon()->is_writeable());
768 request_proposal(mon->osdmon());
769
770 // Respond to MDS, so that it knows it can continue to shut down
771 mon->send_reply(op,
772 new MMDSBeacon(
773 mon->monmap->fsid, m->get_global_id(),
774 m->get_name(), pending.get_epoch(), state, seq,
775 CEPH_FEATURES_SUPPORTED_DEFAULT));
776 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
777 // Standby daemons should never modify their own
778 // state. Reject any attempts to do so.
779 derr << "standby " << gid << " attempted to change state to "
780 << ceph_mds_state_name(state) << ", rejecting" << dendl;
781 return true;
782 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
783 !MDSMap::state_transition_valid(info.state, state)) {
784 // Validate state transitions for daemons that hold a rank
785 derr << "daemon " << gid << " (rank " << info.rank << ") "
786 << "reported invalid state transition "
787 << ceph_mds_state_name(info.state) << " -> "
788 << ceph_mds_state_name(state) << dendl;
789 return true;
790 } else {
791 if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
792 const auto &fscid = pending.mds_roles.at(gid);
793 const auto &fs = pending.get_filesystem(fscid);
794 mon->clog->info() << info.human_name() << " is now active in "
795 << "filesystem " << fs->mds_map.fs_name << " as rank "
796 << info.rank;
797 }
798
799 // Made it through special cases and validations, record the
800 // daemon's reported state to the FSMap.
801 pending.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
802 info->state = state;
803 info->state_seq = seq;
804 });
805 }
806 }
807
808 dout(5) << "prepare_beacon pending map now:" << dendl;
809 print_map(pending);
810
// Once the proposal finishes: acknowledge on success, drop the op when the
// proposal was cancelled, and re-dispatch it on any other error.
811 wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
812 if (r >= 0)
813 _updated(op); // success
814 else if (r == -ECANCELED) {
815 mon->no_reply(op);
816 } else {
817 dispatch(op); // try again
818 }
819 }));
820
821 return true;
822 }
823
824 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
825 {
826 auto &pending = get_pending_fsmap_writeable();
827
828 op->mark_mdsmon_event(__func__);
829 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
830 mds_gid_t gid = m->global_id;
831 if (pending.gid_has_rank(gid)) {
832 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
833 pending.update_export_targets(gid, m->targets);
834 } else {
835 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
836 }
837 mon->no_reply(op);
838 return true;
839 }
840
841 bool MDSMonitor::should_propose(double& delay)
842 {
843 // delegate to PaxosService to assess whether we should propose
844 return PaxosService::should_propose(delay);
845 }
846
847 void MDSMonitor::_updated(MonOpRequestRef op)
848 {
849 const auto &fsmap = get_fsmap();
850 op->mark_mdsmon_event(__func__);
851 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
852 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
853 mon->clog->debug() << m->get_orig_source_inst() << " "
854 << ceph_mds_state_name(m->get_state());
855
856 if (m->get_state() == MDSMap::STATE_STOPPED) {
857 // send the map manually (they're out of the map, so they won't get it automatic)
858 MDSMap null_map;
859 null_map.epoch = fsmap.epoch;
860 null_map.compat = fsmap.compat;
861 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
862 } else {
863 mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid,
864 m->get_global_id(),
865 m->get_name(),
866 fsmap.get_epoch(),
867 m->get_state(),
868 m->get_seq(),
869 CEPH_FEATURES_SUPPORTED_DEFAULT));
870 }
871 }
872
873 void MDSMonitor::on_active()
874 {
875 tick();
876 update_logger();
877
878 if (is_leader()) {
879 mon->clog->debug() << "fsmap " << get_fsmap();
880 }
881 }
882
883 void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
884 list<pair<health_status_t, string> > *detail,
885 CephContext* cct) const
886 {
887 const auto &fsmap = get_fsmap();
888
889 fsmap.get_health(summary, detail);
890
891 // For each MDS GID...
892 const auto &info_map = fsmap.get_mds_info();
893 for (const auto &i : info_map) {
894 const auto &gid = i.first;
895 const auto &info = i.second;
896
897 // Decode MDSHealth
898 bufferlist bl;
899 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
900 if (!bl.length()) {
901 derr << "Missing health data for MDS " << gid << dendl;
902 continue;
903 }
904 MDSHealth health;
905 bufferlist::iterator bl_i = bl.begin();
906 health.decode(bl_i);
907
908 for (const auto &metric : health.metrics) {
909 const int rank = info.rank;
910 std::ostringstream message;
911 message << "mds" << rank << ": " << metric.message;
912 summary.push_back(std::make_pair(metric.sev, message.str()));
913
914 if (detail) {
915 // There is no way for us to clealy associate detail entries with summary entries (#7192), so
916 // we duplicate the summary message in the detail string and tag the metadata on.
917 std::ostringstream detail_message;
918 detail_message << message.str();
919 if (metric.metadata.size()) {
920 detail_message << "(";
921 auto k = metric.metadata.begin();
922 while (k != metric.metadata.end()) {
923 detail_message << k->first << ": " << k->second;
924 if (boost::next(k) != metric.metadata.end()) {
925 detail_message << ", ";
926 }
927 ++k;
928 }
929 detail_message << ")";
930 }
931 detail->push_back(std::make_pair(metric.sev, detail_message.str()));
932 }
933 }
934 }
935 }
936
937 void MDSMonitor::dump_info(Formatter *f)
938 {
939 f->open_object_section("fsmap");
940 get_fsmap().dump(f);
941 f->close_section();
942
943 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
944 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
945 }
946
947 bool MDSMonitor::preprocess_command(MonOpRequestRef op)
948 {
949 op->mark_mdsmon_event(__func__);
950 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
951 int r = -1;
952 bufferlist rdata;
953 stringstream ss, ds;
954
955 map<string, cmd_vartype> cmdmap;
956 const auto &fsmap = get_fsmap();
957
958 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
959 // ss has reason for failure
960 string rs = ss.str();
961 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
962 return true;
963 }
964
965 string prefix;
966 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
967 string format;
968 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
969 std::unique_ptr<Formatter> f(Formatter::create(format));
970
971 MonSession *session = m->get_session();
972 if (!session) {
973 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
974 return true;
975 }
976
977 if (prefix == "mds stat") {
978 if (f) {
979 f->open_object_section("mds_stat");
980 dump_info(f.get());
981 f->close_section();
982 f->flush(ds);
983 } else {
984 ds << fsmap;
985 }
986 r = 0;
987 } else if (prefix == "mds dump") {
988 int64_t epocharg;
989 epoch_t epoch;
990
991 const FSMap *fsmapp = &get_fsmap();
992 FSMap dummy;
993 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
994 epoch = epocharg;
995 bufferlist b;
996 int err = get_version(epoch, b);
997 if (err == -ENOENT) {
998 r = -ENOENT;
999 goto out;
1000 } else {
1001 assert(err == 0);
1002 assert(b.length());
1003 dummy.decode(b);
1004 fsmapp = &dummy;
1005 }
1006 }
1007
1008 stringstream ds;
1009 const MDSMap *mdsmapp = nullptr;
1010 MDSMap blank;
1011 blank.epoch = fsmapp->epoch;
1012 if (fsmapp->legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1013 mdsmapp = &fsmapp->filesystems.at(fsmapp->legacy_client_fscid)->mds_map;
1014 } else {
1015 mdsmapp = &blank;
1016 }
1017 if (f != NULL) {
1018 f->open_object_section("mdsmap");
1019 mdsmapp->dump(f.get());
1020 f->close_section();
1021 f->flush(ds);
1022 r = 0;
1023 } else {
1024 mdsmapp->print(ds);
1025 r = 0;
1026 }
1027
1028 rdata.append(ds);
1029 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
1030 } else if (prefix == "fs dump") {
1031 int64_t epocharg;
1032 epoch_t epoch;
1033
1034 const FSMap *fsmapp = &fsmap;
1035 FSMap dummy;
1036 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
1037 epoch = epocharg;
1038 bufferlist b;
1039 int err = get_version(epoch, b);
1040 if (err == -ENOENT) {
1041 r = -ENOENT;
1042 goto out;
1043 } else {
1044 assert(err == 0);
1045 assert(b.length());
1046 dummy.decode(b);
1047 fsmapp = &dummy;
1048 }
1049 }
1050
1051 stringstream ds;
1052 if (f != NULL) {
1053 f->open_object_section("fsmap");
1054 fsmapp->dump(f.get());
1055 f->close_section();
1056 f->flush(ds);
1057 r = 0;
1058 } else {
1059 fsmapp->print(ds);
1060 r = 0;
1061 }
1062
1063 rdata.append(ds);
1064 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
1065 } else if (prefix == "mds metadata") {
1066 if (!f)
1067 f.reset(Formatter::create("json-pretty"));
1068
1069 string who;
1070 bool all = !cmd_getval(g_ceph_context, cmdmap, "who", who);
1071 dout(1) << "all = " << all << dendl;
1072 if (all) {
1073 r = 0;
1074 // Dump all MDSs' metadata
1075 const auto all_info = fsmap.get_mds_info();
1076
1077 f->open_array_section("mds_metadata");
1078 for(const auto &i : all_info) {
1079 const auto &info = i.second;
1080
1081 f->open_object_section("mds");
1082 f->dump_string("name", info.name);
1083 std::ostringstream get_err;
1084 r = dump_metadata(fsmap, info.name, f.get(), get_err);
1085 if (r == -EINVAL || r == -ENOENT) {
1086 // Drop error, list what metadata we do have
1087 dout(1) << get_err.str() << dendl;
1088 r = 0;
1089 } else if (r != 0) {
1090 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1091 << dendl;
1092 ss << get_err.str();
1093 f->close_section();
1094 break;
1095 }
1096 f->close_section();
1097 }
1098 f->close_section();
1099 } else {
1100 // Dump a single daemon's metadata
1101 f->open_object_section("mds_metadata");
1102 r = dump_metadata(fsmap, who, f.get(), ss);
1103 f->close_section();
1104 }
1105 f->flush(ds);
1106 } else if (prefix == "mds versions") {
1107 if (!f)
1108 f.reset(Formatter::create("json-pretty"));
1109 count_metadata("ceph_version", f.get());
1110 f->flush(ds);
1111 r = 0;
1112 } else if (prefix == "mds count-metadata") {
1113 if (!f)
1114 f.reset(Formatter::create("json-pretty"));
1115 string field;
1116 cmd_getval(g_ceph_context, cmdmap, "property", field);
1117 count_metadata(field, f.get());
1118 f->flush(ds);
1119 r = 0;
1120 } else if (prefix == "mds getmap") {
1121 epoch_t e;
1122 int64_t epocharg;
1123 bufferlist b;
1124 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
1125 e = epocharg;
1126 int err = get_version(e, b);
1127 if (err == -ENOENT) {
1128 r = -ENOENT;
1129 } else {
1130 assert(err == 0);
1131 assert(b.length());
1132 FSMap mm;
1133 mm.decode(b);
1134 mm.encode(rdata, m->get_connection()->get_features());
1135 ss << "got fsmap epoch " << mm.get_epoch();
1136 r = 0;
1137 }
1138 } else {
1139 fsmap.encode(rdata, m->get_connection()->get_features());
1140 ss << "got fsmap epoch " << fsmap.get_epoch();
1141 r = 0;
1142 }
1143 } else if (prefix == "mds compat show") {
1144 if (f) {
1145 f->open_object_section("mds_compat");
1146 fsmap.compat.dump(f.get());
1147 f->close_section();
1148 f->flush(ds);
1149 } else {
1150 ds << fsmap.compat;
1151 }
1152 r = 0;
1153 } else if (prefix == "fs get") {
1154 string fs_name;
1155 cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
1156 const auto &fs = fsmap.get_filesystem(fs_name);
1157 if (fs == nullptr) {
1158 ss << "filesystem '" << fs_name << "' not found";
1159 r = -ENOENT;
1160 } else {
1161 if (f != nullptr) {
1162 f->open_object_section("filesystem");
1163 fs->dump(f.get());
1164 f->close_section();
1165 f->flush(ds);
1166 r = 0;
1167 } else {
1168 fs->print(ds);
1169 r = 0;
1170 }
1171 }
1172 } else if (prefix == "fs ls") {
1173 if (f) {
1174 f->open_array_section("filesystems");
1175 for (const auto &p : fsmap.filesystems) {
1176 const auto &fs = p.second;
1177 f->open_object_section("filesystem");
1178 {
1179 const MDSMap &mds_map = fs->mds_map;
1180 f->dump_string("name", mds_map.fs_name);
1181 /* Output both the names and IDs of pools, for use by
1182 * humans and machines respectively */
1183 f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
1184 mds_map.metadata_pool));
1185 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1186 f->open_array_section("data_pool_ids");
1187 for (const auto &id : mds_map.data_pools) {
1188 f->dump_int("data_pool_id", id);
1189 }
1190 f->close_section();
1191
1192 f->open_array_section("data_pools");
1193 for (const auto &id : mds_map.data_pools) {
1194 const auto &name = mon->osdmon()->osdmap.get_pool_name(id);
1195 f->dump_string("data_pool", name);
1196 }
1197 f->close_section();
1198 }
1199 f->close_section();
1200 }
1201 f->close_section();
1202 f->flush(ds);
1203 } else {
1204 for (const auto &p : fsmap.filesystems) {
1205 const auto &fs = p.second;
1206 const MDSMap &mds_map = fs->mds_map;
1207 const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
1208 mds_map.metadata_pool);
1209
1210 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1211 << md_pool_name << ", data pools: [";
1212 for (const auto &id : mds_map.data_pools) {
1213 const string &pool_name = mon->osdmon()->osdmap.get_pool_name(id);
1214 ds << pool_name << " ";
1215 }
1216 ds << "]" << std::endl;
1217 }
1218
1219 if (fsmap.filesystems.empty()) {
1220 ds << "No filesystems enabled" << std::endl;
1221 }
1222 }
1223 r = 0;
1224 }
1225
1226 out:
1227 if (r != -1) {
1228 rdata.append(ds);
1229 string rs;
1230 getline(ss, rs);
1231 mon->reply_command(op, r, rs, rdata, get_last_committed());
1232 return true;
1233 } else
1234 return false;
1235 }
1236
1237 bool MDSMonitor::fail_mds_gid(FSMap &fsmap, mds_gid_t gid)
1238 {
1239 const MDSMap::mds_info_t &info = fsmap.get_info_gid(gid);
1240 dout(1) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
1241
1242 epoch_t blacklist_epoch = 0;
1243 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1244 utime_t until = ceph_clock_now();
1245 until += g_conf->get_val<double>("mon_mds_blacklist_interval");
1246 blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
1247 }
1248
1249 fsmap.erase(gid, blacklist_epoch);
1250 last_beacon.erase(gid);
1251 if (pending_daemon_health.count(gid)) {
1252 pending_daemon_health.erase(gid);
1253 pending_daemon_health_rm.insert(gid);
1254 }
1255
1256 return blacklist_epoch != 0;
1257 }
1258
1259 mds_gid_t MDSMonitor::gid_from_arg(const FSMap &fsmap, const std::string &arg, std::ostream &ss)
1260 {
1261 // Try parsing as a role
1262 mds_role_t role;
1263 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1264 int r = fsmap.parse_role(arg, &role, ignore_err);
1265 if (r == 0) {
1266 // See if a GID is assigned to this role
1267 const auto &fs = fsmap.get_filesystem(role.fscid);
1268 assert(fs != nullptr); // parse_role ensures it exists
1269 if (fs->mds_map.is_up(role.rank)) {
1270 dout(10) << __func__ << ": validated rank/GID " << role
1271 << " as a rank" << dendl;
1272 return fs->mds_map.get_mds_info(role.rank).global_id;
1273 }
1274 }
1275
1276 // Try parsing as a gid
1277 std::string err;
1278 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1279 if (!err.empty()) {
1280 // Not a role or a GID, try as a daemon name
1281 const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
1282 if (!mds_info) {
1283 ss << "MDS named '" << arg
1284 << "' does not exist, or is not up";
1285 return MDS_GID_NONE;
1286 }
1287 dout(10) << __func__ << ": resolved MDS name '" << arg
1288 << "' to GID " << mds_info->global_id << dendl;
1289 return mds_info->global_id;
1290 } else {
1291 // Not a role, but parses as a an integer, might be a GID
1292 dout(10) << __func__ << ": treating MDS reference '" << arg
1293 << "' as an integer " << maybe_gid << dendl;
1294
1295 if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
1296 return mds_gid_t(maybe_gid);
1297 }
1298 }
1299
1300 dout(1) << __func__ << ": rank/GID " << arg
1301 << " not a existent rank or GID" << dendl;
1302 return MDS_GID_NONE;
1303 }
1304
1305 int MDSMonitor::fail_mds(FSMap &fsmap, std::ostream &ss,
1306 const std::string &arg, MDSMap::mds_info_t *failed_info)
1307 {
1308 assert(failed_info != nullptr);
1309
1310 mds_gid_t gid = gid_from_arg(fsmap, arg, ss);
1311 if (gid == MDS_GID_NONE) {
1312 return 0;
1313 }
1314 if (!mon->osdmon()->is_writeable()) {
1315 return -EAGAIN;
1316 }
1317
1318 // Take a copy of the info before removing the MDS from the map,
1319 // so that the caller knows which mds (if any) they ended up removing.
1320 *failed_info = fsmap.get_info_gid(gid);
1321
1322 fail_mds_gid(fsmap, gid);
1323 ss << "failed mds gid " << gid;
1324 assert(mon->osdmon()->is_writeable());
1325 request_proposal(mon->osdmon());
1326 return 0;
1327 }
1328
1329 bool MDSMonitor::prepare_command(MonOpRequestRef op)
1330 {
1331 op->mark_mdsmon_event(__func__);
1332 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
1333 int r = -EINVAL;
1334 stringstream ss;
1335 bufferlist rdata;
1336
1337 map<string, cmd_vartype> cmdmap;
1338 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1339 string rs = ss.str();
1340 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1341 return true;
1342 }
1343
1344 string prefix;
1345 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
1346
1347 /* Refuse access if message not associated with a valid session */
1348 MonSession *session = m->get_session();
1349 if (!session) {
1350 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1351 return true;
1352 }
1353
1354 auto &pending = get_pending_fsmap_writeable();
1355
1356 bool batched_propose = false;
1357 for (const auto &h : handlers) {
1358 if (h->can_handle(prefix)) {
1359 batched_propose = h->batched_propose();
1360 if (batched_propose) {
1361 paxos->plug();
1362 }
1363 r = h->handle(mon, pending, op, cmdmap, ss);
1364 if (batched_propose) {
1365 paxos->unplug();
1366 }
1367
1368 if (r == -EAGAIN) {
1369 // message has been enqueued for retry; return.
1370 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1371 return false;
1372 } else {
1373 if (r == 0) {
1374 // On successful updates, print the updated map
1375 print_map(pending);
1376 }
1377 // Successful or not, we're done: respond.
1378 goto out;
1379 }
1380 }
1381 }
1382
1383 r = filesystem_command(pending, op, prefix, cmdmap, ss);
1384 if (r >= 0) {
1385 goto out;
1386 } else if (r == -EAGAIN) {
1387 // Do not reply, the message has been enqueued for retry
1388 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1389 return false;
1390 } else if (r != -ENOSYS) {
1391 goto out;
1392 }
1393
1394 // Only handle legacy commands if there is a filesystem configured
1395 if (pending.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1396 if (pending.filesystems.size() == 0) {
1397 ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
1398 } else {
1399 ss << "No filesystem set for use with legacy commands";
1400 }
1401 r = -EINVAL;
1402 goto out;
1403 }
1404
1405 r = legacy_filesystem_command(pending, op, prefix, cmdmap, ss);
1406
1407 if (r == -ENOSYS && ss.str().empty()) {
1408 ss << "unrecognized command";
1409 }
1410
1411 out:
1412 dout(4) << __func__ << " done, r=" << r << dendl;
1413 /* Compose response */
1414 string rs;
1415 getline(ss, rs);
1416
1417 if (r >= 0) {
1418 // success.. delay reply
1419 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1420 get_last_committed() + 1));
1421 if (batched_propose) {
1422 force_immediate_propose();
1423 }
1424 return true;
1425 } else {
1426 // reply immediately
1427 mon->reply_command(op, r, rs, rdata, get_last_committed());
1428 return false;
1429 }
1430 }
1431
1432 int MDSMonitor::filesystem_command(
1433 FSMap &fsmap,
1434 MonOpRequestRef op,
1435 std::string const &prefix,
1436 map<string, cmd_vartype> &cmdmap,
1437 std::stringstream &ss)
1438 {
1439 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1440 op->mark_mdsmon_event(__func__);
1441 int r = 0;
1442 string whostr;
1443 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1444
1445 if (prefix == "mds stop" ||
1446 prefix == "mds deactivate") {
1447 mds_role_t role;
1448 r = fsmap.parse_role(whostr, &role, ss);
1449 if (r < 0 ) {
1450 return r;
1451 }
1452 const auto &fs = fsmap.get_filesystem(role.fscid);
1453
1454 if (!fs->mds_map.is_active(role.rank)) {
1455 r = -EEXIST;
1456 ss << "mds." << role << " not active ("
1457 << ceph_mds_state_name(fs->mds_map.get_state(role.rank)) << ")";
1458 } else if (fs->mds_map.get_root() == role.rank ||
1459 fs->mds_map.get_tableserver() == role.rank) {
1460 r = -EINVAL;
1461 ss << "can't tell the root (" << fs->mds_map.get_root()
1462 << ") or tableserver (" << fs->mds_map.get_tableserver()
1463 << ") to deactivate";
1464 } else if (role.rank != fs->mds_map.get_last_in_mds()) {
1465 r = -EINVAL;
1466 ss << "mds." << role << " doesn't have the max rank ("
1467 << fs->mds_map.get_last_in_mds() << ")";
1468 } else if (fs->mds_map.get_num_in_mds() <= size_t(fs->mds_map.get_max_mds())) {
1469 r = -EBUSY;
1470 ss << "must decrease max_mds or else MDS will immediately reactivate";
1471 } else {
1472 r = 0;
1473 mds_gid_t gid = fs->mds_map.up.at(role.rank);
1474 ss << "telling mds." << role << " "
1475 << fsmap.get_info_gid(gid).addr << " to deactivate";
1476
1477 fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
1478 info->state = MDSMap::STATE_STOPPING;
1479 });
1480 }
1481 } else if (prefix == "mds set_state") {
1482 mds_gid_t gid;
1483 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1484 ss << "error parsing 'gid' value '"
1485 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1486 return -EINVAL;
1487 }
1488 MDSMap::DaemonState state;
1489 if (!cmd_getval(g_ceph_context, cmdmap, "state", state)) {
1490 ss << "error parsing 'state' string value '"
1491 << cmd_vartype_stringify(cmdmap["state"]) << "'";
1492 return -EINVAL;
1493 }
1494 if (fsmap.gid_exists(gid)) {
1495 fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
1496 info->state = state;
1497 });
1498 ss << "set mds gid " << gid << " to state " << state << " "
1499 << ceph_mds_state_name(state);
1500 return 0;
1501 }
1502 } else if (prefix == "mds fail") {
1503 string who;
1504 cmd_getval(g_ceph_context, cmdmap, "who", who);
1505
1506 MDSMap::mds_info_t failed_info;
1507 r = fail_mds(fsmap, ss, who, &failed_info);
1508 if (r < 0 && r == -EAGAIN) {
1509 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1510 return -EAGAIN; // don't propose yet; wait for message to be retried
1511 } else if (r == 0) {
1512 // Only log if we really did something (not when was already gone)
1513 if (failed_info.global_id != MDS_GID_NONE) {
1514 mon->clog->info() << failed_info.human_name() << " marked failed by "
1515 << op->get_session()->entity_name;
1516 }
1517 }
1518 } else if (prefix == "mds rm") {
1519 mds_gid_t gid;
1520 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1521 ss << "error parsing 'gid' value '"
1522 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1523 return -EINVAL;
1524 }
1525 if (!fsmap.gid_exists(gid)) {
1526 ss << "mds gid " << gid << " dne";
1527 r = 0;
1528 } else {
1529 const auto &info = fsmap.get_info_gid(gid);
1530 MDSMap::DaemonState state = info.state;
1531 if (state > 0) {
1532 ss << "cannot remove active mds." << info.name
1533 << " rank " << info.rank;
1534 return -EBUSY;
1535 } else {
1536 fsmap.erase(gid, {});
1537 ss << "removed mds gid " << gid;
1538 return 0;
1539 }
1540 }
1541 } else if (prefix == "mds rmfailed") {
1542 string confirm;
1543 if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
1544 confirm != "--yes-i-really-mean-it") {
1545 ss << "WARNING: this can make your filesystem inaccessible! "
1546 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1547 return -EPERM;
1548 }
1549
1550 std::string role_str;
1551 cmd_getval(g_ceph_context, cmdmap, "who", role_str);
1552 mds_role_t role;
1553 int r = fsmap.parse_role(role_str, &role, ss);
1554 if (r < 0) {
1555 ss << "invalid role '" << role_str << "'";
1556 return -EINVAL;
1557 }
1558
1559 fsmap.modify_filesystem(
1560 role.fscid,
1561 [role](std::shared_ptr<Filesystem> fs)
1562 {
1563 fs->mds_map.failed.erase(role.rank);
1564 });
1565
1566 ss << "removed failed mds." << role;
1567 return 0;
1568 } else if (prefix == "mds compat rm_compat") {
1569 int64_t f;
1570 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1571 ss << "error parsing feature value '"
1572 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1573 return -EINVAL;
1574 }
1575 if (fsmap.compat.compat.contains(f)) {
1576 ss << "removing compat feature " << f;
1577 CompatSet modified = fsmap.compat;
1578 modified.compat.remove(f);
1579 fsmap.update_compat(modified);
1580 } else {
1581 ss << "compat feature " << f << " not present in " << fsmap.compat;
1582 }
1583 r = 0;
1584 } else if (prefix == "mds compat rm_incompat") {
1585 int64_t f;
1586 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1587 ss << "error parsing feature value '"
1588 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1589 return -EINVAL;
1590 }
1591 if (fsmap.compat.incompat.contains(f)) {
1592 ss << "removing incompat feature " << f;
1593 CompatSet modified = fsmap.compat;
1594 modified.incompat.remove(f);
1595 fsmap.update_compat(modified);
1596 } else {
1597 ss << "incompat feature " << f << " not present in " << fsmap.compat;
1598 }
1599 r = 0;
1600 } else if (prefix == "mds repaired") {
1601 std::string role_str;
1602 cmd_getval(g_ceph_context, cmdmap, "rank", role_str);
1603 mds_role_t role;
1604 r = fsmap.parse_role(role_str, &role, ss);
1605 if (r < 0) {
1606 return r;
1607 }
1608
1609 bool modified = fsmap.undamaged(role.fscid, role.rank);
1610 if (modified) {
1611 dout(1) << "repaired: restoring rank " << role << dendl;
1612 } else {
1613 dout(1) << "repaired: no-op on rank " << role << dendl;
1614 }
1615
1616 r = 0;
1617 } else {
1618 return -ENOSYS;
1619 }
1620
1621 return r;
1622 }
1623
1624 /**
1625 * Helper to legacy_filesystem_command
1626 */
1627 void MDSMonitor::modify_legacy_filesystem(FSMap &fsmap,
1628 std::function<void(std::shared_ptr<Filesystem> )> fn)
1629 {
1630 fsmap.modify_filesystem(
1631 fsmap.legacy_client_fscid,
1632 fn
1633 );
1634 }
1635
1636
1637
1638 /**
1639 * Handle a command that affects the filesystem (i.e. a filesystem
1640 * must exist for the command to act upon).
1641 *
1642 * @retval 0 Command was successfully handled and has side effects
1643 * @retval -EAGAIN Messages has been requeued for retry
1644 * @retval -ENOSYS Unknown command
1645 * @retval < 0 An error has occurred; **ss** may have been set.
1646 */
1647 int MDSMonitor::legacy_filesystem_command(
1648 FSMap &fsmap,
1649 MonOpRequestRef op,
1650 std::string const &prefix,
1651 map<string, cmd_vartype> &cmdmap,
1652 std::stringstream &ss)
1653 {
1654 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1655 op->mark_mdsmon_event(__func__);
1656 int r = 0;
1657 string whostr;
1658 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1659
1660 assert (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
1661
1662 if (prefix == "mds set_max_mds") {
1663 // NOTE: deprecated by "fs set max_mds"
1664 int64_t maxmds;
1665 if (!cmd_getval(g_ceph_context, cmdmap, "maxmds", maxmds) || maxmds <= 0) {
1666 return -EINVAL;
1667 }
1668
1669 const MDSMap& mdsmap =
1670 fsmap.filesystems.at(fsmap.legacy_client_fscid)->mds_map;
1671
1672 if (!mdsmap.allows_multimds() &&
1673 maxmds > mdsmap.get_max_mds() &&
1674 maxmds > 1) {
1675 ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
1676 return -EINVAL;
1677 }
1678
1679 if (maxmds > MAX_MDS) {
1680 ss << "may not have more than " << MAX_MDS << " MDS ranks";
1681 return -EINVAL;
1682 }
1683
1684 modify_legacy_filesystem(fsmap,
1685 [maxmds](std::shared_ptr<Filesystem> fs)
1686 {
1687 fs->mds_map.set_max_mds(maxmds);
1688 });
1689
1690 r = 0;
1691 ss << "max_mds = " << maxmds;
1692 } else if (prefix == "mds cluster_down") {
1693 // NOTE: deprecated by "fs set cluster_down"
1694 modify_legacy_filesystem(fsmap,
1695 [](std::shared_ptr<Filesystem> fs)
1696 {
1697 fs->mds_map.set_flag(CEPH_MDSMAP_DOWN);
1698 });
1699 ss << "marked fsmap DOWN";
1700 r = 0;
1701 } else if (prefix == "mds cluster_up") {
1702 // NOTE: deprecated by "fs set cluster_up"
1703 modify_legacy_filesystem(fsmap,
1704 [](std::shared_ptr<Filesystem> fs)
1705 {
1706 fs->mds_map.clear_flag(CEPH_MDSMAP_DOWN);
1707 });
1708 ss << "unmarked fsmap DOWN";
1709 r = 0;
1710 } else {
1711 return -ENOSYS;
1712 }
1713
1714 return r;
1715 }
1716
1717
1718 void MDSMonitor::check_subs()
1719 {
1720 std::list<std::string> types;
1721
1722 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1723 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1724 // filesystems. Build a list of all the types we service
1725 // subscriptions for.
1726 types.push_back("fsmap");
1727 types.push_back("fsmap.user");
1728 types.push_back("mdsmap");
1729 for (const auto &p : get_fsmap().filesystems) {
1730 const auto &fscid = p.first;
1731 std::ostringstream oss;
1732 oss << "mdsmap." << fscid;
1733 types.push_back(oss.str());
1734 }
1735
1736 for (const auto &type : types) {
1737 if (mon->session_map.subs.count(type) == 0)
1738 continue;
1739 xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
1740 while (!p.end()) {
1741 Subscription *sub = *p;
1742 ++p;
1743 check_sub(sub);
1744 }
1745 }
1746 }
1747
1748
1749 void MDSMonitor::check_sub(Subscription *sub)
1750 {
1751 dout(20) << __func__ << ": " << sub->type << dendl;
1752
1753 const auto &fsmap = get_fsmap();
1754
1755 if (sub->type == "fsmap") {
1756 if (sub->next <= fsmap.get_epoch()) {
1757 sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
1758 if (sub->onetime) {
1759 mon->session_map.remove_sub(sub);
1760 } else {
1761 sub->next = fsmap.get_epoch() + 1;
1762 }
1763 }
1764 } else if (sub->type == "fsmap.user") {
1765 if (sub->next <= fsmap.get_epoch()) {
1766 FSMapUser fsmap_u;
1767 fsmap_u.epoch = fsmap.get_epoch();
1768 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1769 for (const auto &p : fsmap.filesystems) {
1770 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
1771 fs_info.cid = p.second->fscid;
1772 fs_info.name = p.second->mds_map.fs_name;
1773 }
1774 sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
1775 if (sub->onetime) {
1776 mon->session_map.remove_sub(sub);
1777 } else {
1778 sub->next = fsmap.get_epoch() + 1;
1779 }
1780 }
1781 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
1782 if (sub->next > fsmap.get_epoch()) {
1783 return;
1784 }
1785
1786 const bool is_mds = sub->session->inst.name.is_mds();
1787 mds_gid_t mds_gid = MDS_GID_NONE;
1788 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1789 if (is_mds) {
1790 // What (if any) namespace are you assigned to?
1791 auto mds_info = fsmap.get_mds_info();
1792 for (const auto &p : mds_info) {
1793 if (p.second.addr == sub->session->inst.addr) {
1794 mds_gid = p.first;
1795 fscid = fsmap.mds_roles.at(mds_gid);
1796 }
1797 }
1798 } else {
1799 // You're a client. Did you request a particular
1800 // namespace?
1801 if (sub->type.find("mdsmap.") == 0) {
1802 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1803 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1804 std::string err;
1805 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1806 if (!err.empty()) {
1807 // Client asked for a non-existent namespace, send them nothing
1808 dout(1) << "Invalid client subscription '" << sub->type
1809 << "'" << dendl;
1810 return;
1811 }
1812 if (fsmap.filesystems.count(fscid) == 0) {
1813 // Client asked for a non-existent namespace, send them nothing
1814 // TODO: something more graceful for when a client has a filesystem
1815 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1816 // flag to MMDSMap?
1817 dout(1) << "Client subscribed to non-existent namespace '" <<
1818 fscid << "'" << dendl;
1819 return;
1820 }
1821 } else {
1822 // Unqualified request for "mdsmap": give it the one marked
1823 // for use by legacy clients.
1824 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1825 fscid = fsmap.legacy_client_fscid;
1826 } else {
1827 dout(1) << "Client subscribed for legacy filesystem but "
1828 "none is configured" << dendl;
1829 return;
1830 }
1831 }
1832 }
1833 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
1834
1835 // Work out the effective latest epoch
1836 const MDSMap *mds_map = nullptr;
1837 MDSMap null_map;
1838 null_map.compat = fsmap.compat;
1839 if (fscid == FS_CLUSTER_ID_NONE) {
1840 // For a client, we should have already dropped out
1841 assert(is_mds);
1842
1843 auto it = fsmap.standby_daemons.find(mds_gid);
1844 if (it != fsmap.standby_daemons.end()) {
1845 // For an MDS, we need to feed it an MDSMap with its own state in
1846 null_map.mds_info[mds_gid] = it->second;
1847 null_map.epoch = fsmap.standby_epochs.at(mds_gid);
1848 } else {
1849 null_map.epoch = fsmap.epoch;
1850 }
1851 mds_map = &null_map;
1852 } else {
1853 // Check the effective epoch
1854 mds_map = &fsmap.get_filesystem(fscid)->mds_map;
1855 }
1856
1857 assert(mds_map != nullptr);
1858 dout(10) << __func__ << " selected MDS map epoch " <<
1859 mds_map->epoch << " for namespace " << fscid << " for subscriber "
1860 << sub->session->inst.name << " who wants epoch " << sub->next << dendl;
1861
1862 if (sub->next > mds_map->epoch) {
1863 return;
1864 }
1865 auto msg = new MMDSMap(mon->monmap->fsid, mds_map);
1866
1867 sub->session->con->send_message(msg);
1868 if (sub->onetime) {
1869 mon->session_map.remove_sub(sub);
1870 } else {
1871 sub->next = mds_map->get_epoch() + 1;
1872 }
1873 }
1874 }
1875
1876
1877 void MDSMonitor::update_metadata(mds_gid_t gid,
1878 const map<string, string>& metadata)
1879 {
1880 if (metadata.empty()) {
1881 return;
1882 }
1883 pending_metadata[gid] = metadata;
1884
1885 MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
1886 bufferlist bl;
1887 ::encode(pending_metadata, bl);
1888 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1889 paxos->trigger_propose();
1890 }
1891
1892 void MDSMonitor::remove_from_metadata(const FSMap &fsmap, MonitorDBStore::TransactionRef t)
1893 {
1894 bool update = false;
1895 for (auto it = pending_metadata.begin(); it != pending_metadata.end(); ) {
1896 if (!fsmap.gid_exists(it->first)) {
1897 it = pending_metadata.erase(it);
1898 update = true;
1899 } else {
1900 ++it;
1901 }
1902 }
1903 if (!update)
1904 return;
1905 bufferlist bl;
1906 ::encode(pending_metadata, bl);
1907 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1908 }
1909
1910 int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1911 {
1912 bufferlist bl;
1913 int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1914 if (r) {
1915 dout(1) << "Unable to load 'last_metadata'" << dendl;
1916 return r;
1917 }
1918
1919 bufferlist::iterator it = bl.begin();
1920 ::decode(m, it);
1921 return 0;
1922 }
1923
1924 void MDSMonitor::count_metadata(const std::string &field, map<string,int> *out)
1925 {
1926 map<mds_gid_t,Metadata> meta;
1927 load_metadata(meta);
1928 for (auto& p : meta) {
1929 auto q = p.second.find(field);
1930 if (q == p.second.end()) {
1931 (*out)["unknown"]++;
1932 } else {
1933 (*out)[q->second]++;
1934 }
1935 }
1936 }
1937
1938 void MDSMonitor::count_metadata(const std::string &field, Formatter *f)
1939 {
1940 map<string,int> by_val;
1941 count_metadata(field, &by_val);
1942 f->open_object_section(field.c_str());
1943 for (auto& p : by_val) {
1944 f->dump_int(p.first.c_str(), p.second);
1945 }
1946 f->close_section();
1947 }
1948
1949 int MDSMonitor::dump_metadata(const FSMap& fsmap, const std::string &who,
1950 Formatter *f, ostream& err)
1951 {
1952 assert(f);
1953
1954 mds_gid_t gid = gid_from_arg(fsmap, who, err);
1955 if (gid == MDS_GID_NONE) {
1956 return -EINVAL;
1957 }
1958
1959 map<mds_gid_t, Metadata> metadata;
1960 if (int r = load_metadata(metadata)) {
1961 err << "Unable to load 'last_metadata'";
1962 return r;
1963 }
1964
1965 if (!metadata.count(gid)) {
1966 return -ENOENT;
1967 }
1968 const Metadata& m = metadata[gid];
1969 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1970 f->dump_string(p->first.c_str(), p->second);
1971 }
1972 return 0;
1973 }
1974
1975 int MDSMonitor::print_nodes(Formatter *f)
1976 {
1977 assert(f);
1978
1979 const auto &fsmap = get_fsmap();
1980
1981 map<mds_gid_t, Metadata> metadata;
1982 if (int r = load_metadata(metadata)) {
1983 return r;
1984 }
1985
1986 map<string, list<int> > mdses; // hostname => rank
1987 for (const auto &p : metadata) {
1988 const mds_gid_t& gid = p.first;
1989 const Metadata& m = p.second;
1990 Metadata::const_iterator hostname = m.find("hostname");
1991 if (hostname == m.end()) {
1992 // not likely though
1993 continue;
1994 }
1995 if (!fsmap.gid_exists(gid)) {
1996 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1997 continue;
1998 }
1999 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
2000 // FIXME: include filesystem name with rank here
2001 mdses[hostname->second].push_back(mds_info.rank);
2002 }
2003
2004 dump_services(f, mdses, "mds");
2005 return 0;
2006 }
2007
2008 /**
2009 * If a cluster is undersized (with respect to max_mds), then
2010 * attempt to find daemons to grow it.
2011 */
2012 bool MDSMonitor::maybe_expand_cluster(FSMap &fsmap, fs_cluster_id_t fscid)
2013 {
2014 auto fs = fsmap.get_filesystem(fscid);
2015 auto &mds_map = fs->mds_map;
2016
2017 if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
2018 return false;
2019 }
2020
2021 int in = mds_map.get_num_in_mds();
2022 int max = mds_map.get_max_mds();
2023
2024 dout(20) << __func__ << " in " << in << " max " << max << dendl;
2025
2026 if (in < max) {
2027 mds_rank_t mds = mds_rank_t(0);
2028 string name;
2029 while (mds_map.is_in(mds)) {
2030 mds++;
2031 }
2032 mds_gid_t newgid = fsmap.find_replacement_for({fscid, mds},
2033 name, g_conf->mon_force_standby_active);
2034 if (newgid == MDS_GID_NONE) {
2035 return false;
2036 }
2037
2038 const auto &new_info = fsmap.get_info_gid(newgid);
2039 dout(1) << "assigned standby " << new_info.addr
2040 << " as mds." << mds << dendl;
2041
2042 mon->clog->info() << new_info.human_name() << " assigned to "
2043 "filesystem " << mds_map.fs_name << " as rank "
2044 << mds << " (now has " << mds_map.get_num_in_mds() + 1
2045 << " ranks)";
2046 fsmap.promote(newgid, fs, mds);
2047 return true;
2048 }
2049
2050 return false;
2051 }
2052
2053
2054 /**
2055 * If a daemon is laggy, and a suitable replacement
2056 * is available, fail this daemon (remove from map) and pass its
2057 * role to another daemon.
2058 */
2059 void MDSMonitor::maybe_replace_gid(FSMap &fsmap, mds_gid_t gid,
2060 const MDSMap::mds_info_t& info, bool *mds_propose, bool *osd_propose)
2061 {
2062 assert(mds_propose != nullptr);
2063 assert(osd_propose != nullptr);
2064
2065 const auto fscid = fsmap.mds_roles.at(gid);
2066
2067 // We will only take decisive action (replacing/removing a daemon)
2068 // if we have some indicating that some other daemon(s) are successfully
2069 // getting beacons through recently.
2070 mono_time latest_beacon = mono_clock::zero();
2071 for (const auto &p : last_beacon) {
2072 latest_beacon = std::max(p.second.stamp, latest_beacon);
2073 }
2074 mono_time now = mono_clock::now();
2075 chrono::duration<double> since = now-latest_beacon;
2076 const bool may_replace = since.count() <
2077 std::max(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5);
2078
2079 // are we in?
2080 // and is there a non-laggy standby that can take over for us?
2081 mds_gid_t sgid;
2082 if (info.rank >= 0 &&
2083 info.state != MDSMap::STATE_STANDBY &&
2084 info.state != MDSMap::STATE_STANDBY_REPLAY &&
2085 may_replace &&
2086 !fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
2087 (sgid = fsmap.find_replacement_for({fscid, info.rank}, info.name,
2088 g_conf->mon_force_standby_active)) != MDS_GID_NONE)
2089 {
2090
2091 MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
2092 dout(1) << " replacing " << gid << " " << info.addr << " mds."
2093 << info.rank << "." << info.inc
2094 << " " << ceph_mds_state_name(info.state)
2095 << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
2096
2097 mon->clog->warn() << info.human_name()
2098 << " is not responding, replacing it "
2099 << "as rank " << info.rank
2100 << " with standby " << si.human_name();
2101
2102 // Remember what NS the old one was in
2103 const fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
2104
2105 // Remove the old one
2106 *osd_propose |= fail_mds_gid(fsmap, gid);
2107
2108 // Promote the replacement
2109 auto fs = fsmap.filesystems.at(fscid);
2110 fsmap.promote(sgid, fs, info.rank);
2111
2112 *mds_propose = true;
2113 } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
2114 info.state == MDSMap::STATE_STANDBY) && may_replace) {
2115 dout(1) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
2116 << "." << info.inc << " " << ceph_mds_state_name(info.state)
2117 << dendl;
2118 mon->clog->info() << "Standby " << info.human_name() << " is not "
2119 "responding, dropping it";
2120 fail_mds_gid(fsmap, gid);
2121 *mds_propose = true;
2122 } else if (!info.laggy()) {
2123 dout(1) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
2124 << " " << ceph_mds_state_name(info.state)
2125 << " laggy" << dendl;
2126 fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
2127 info->laggy_since = ceph_clock_now();
2128 });
2129 *mds_propose = true;
2130 }
2131 }
2132
2133 bool MDSMonitor::maybe_promote_standby(FSMap &fsmap, std::shared_ptr<Filesystem> &fs)
2134 {
2135 assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
2136
2137 bool do_propose = false;
2138
2139 // have a standby take over?
2140 set<mds_rank_t> failed;
2141 fs->mds_map.get_failed_mds_set(failed);
2142 if (!failed.empty()) {
2143 set<mds_rank_t>::iterator p = failed.begin();
2144 while (p != failed.end()) {
2145 mds_rank_t f = *p++;
2146 mds_gid_t sgid = fsmap.find_replacement_for({fs->fscid, f}, {},
2147 g_conf->mon_force_standby_active);
2148 if (sgid) {
2149 const MDSMap::mds_info_t si = fsmap.get_info_gid(sgid);
2150 dout(1) << " taking over failed mds." << f << " with " << sgid
2151 << "/" << si.name << " " << si.addr << dendl;
2152 mon->clog->info() << "Standby " << si.human_name()
2153 << " assigned to filesystem " << fs->mds_map.fs_name
2154 << " as rank " << f;
2155
2156 fsmap.promote(sgid, fs, f);
2157 do_propose = true;
2158 }
2159 }
2160 } else {
2161 // There were no failures to replace, so try using any available standbys
2162 // as standby-replay daemons.
2163
2164 // Take a copy of the standby GIDs so that we can iterate over
2165 // them while perhaps-modifying standby_daemons during the loop
2166 // (if we promote anyone they are removed from standby_daemons)
2167 std::vector<mds_gid_t> standby_gids;
2168 for (const auto &j : fsmap.standby_daemons) {
2169 standby_gids.push_back(j.first);
2170 }
2171
2172 for (const auto &gid : standby_gids) {
2173 const auto &info = fsmap.standby_daemons.at(gid);
2174 assert(info.state == MDSMap::STATE_STANDBY);
2175
2176 if (!info.standby_replay) {
2177 continue;
2178 }
2179
2180 /*
2181 * This mds is standby but has no rank assigned.
2182 * See if we can find it somebody to shadow
2183 */
2184 dout(20) << "gid " << gid << " is standby and following nobody" << dendl;
2185
2186 // standby for someone specific?
2187 if (info.standby_for_rank >= 0) {
2188 // The mds_info_t may or may not tell us exactly which filesystem
2189 // the standby_for_rank refers to: lookup via legacy_client_fscid
2190 mds_role_t target_role = {
2191 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
2192 fsmap.legacy_client_fscid : info.standby_for_fscid,
2193 info.standby_for_rank};
2194
2195 // It is possible that the map contains a standby_for_fscid
2196 // that doesn't correspond to an existing filesystem, especially
2197 // if we loaded from a version with a bug (#17466)
2198 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
2199 && !fsmap.filesystem_exists(info.standby_for_fscid)) {
2200 derr << "gid " << gid << " has invalid standby_for_fscid "
2201 << info.standby_for_fscid << dendl;
2202 continue;
2203 }
2204
2205 // If we managed to resolve a full target role
2206 if (target_role.fscid != FS_CLUSTER_ID_NONE) {
2207 const auto &fs = fsmap.get_filesystem(target_role.fscid);
2208 if (fs->mds_map.is_followable(target_role.rank)) {
2209 do_propose |= try_standby_replay(fsmap, info, *fs,
2210 fs->mds_map.get_info(target_role.rank));
2211 }
2212 }
2213
2214 continue;
2215 }
2216
2217 // check everyone
2218 for (const auto &p : fsmap.filesystems) {
2219 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
2220 info.standby_for_fscid != p.first)
2221 continue;
2222
2223 bool assigned = false;
2224 const auto &fs = p.second;
2225 const MDSMap &mds_map = fs->mds_map;
2226 for (const auto &mds_i : mds_map.mds_info) {
2227 const MDSMap::mds_info_t &cand_info = mds_i.second;
2228 if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
2229 if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
2230 info.standby_for_rank != MDS_RANK_NONE) {
2231 continue; // we're supposed to follow someone else
2232 }
2233
2234 if (try_standby_replay(fsmap, info, *fs, cand_info)) {
2235 assigned = true;
2236 break;
2237 }
2238 }
2239 }
2240 if (assigned) {
2241 do_propose = true;
2242 break;
2243 }
2244 }
2245 }
2246 }
2247
2248 return do_propose;
2249 }
2250
2251 void MDSMonitor::tick()
2252 {
2253 // make sure mds's are still alive
2254 // ...if i am an active leader
2255
2256 if (!is_active() || !is_leader()) return;
2257
2258 auto &pending = get_pending_fsmap_writeable();
2259
2260 bool do_propose = false;
2261
2262 do_propose |= pending.check_health();
2263
2264 // expand mds cluster (add new nodes to @in)?
2265 for (auto &p : pending.filesystems) {
2266 do_propose |= maybe_expand_cluster(pending, p.second->fscid);
2267 }
2268
2269 mono_time now = mono_clock::now();
2270 if (last_tick == decltype(last_tick)::min()) {
2271 last_tick = now;
2272 }
2273 chrono::duration<double> since_last = now-last_tick;
2274
2275 if (since_last.count() >
2276 (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
2277 // This case handles either local slowness (calls being delayed
2278 // for whatever reason) or cluster election slowness (a long gap
2279 // between calls while an election happened)
2280 dout(1) << __func__ << ": resetting beacon timeouts due to mon delay "
2281 "(slow election?) of " << now - last_tick << " seconds" << dendl;
2282 for (auto &p : last_beacon) {
2283 p.second.stamp = now;
2284 }
2285 }
2286
2287 last_tick = now;
2288
2289 // make sure last_beacon is fully populated
2290 for (auto &p : pending.mds_roles) {
2291 auto &gid = p.first;
2292 last_beacon.emplace(std::piecewise_construct,
2293 std::forward_as_tuple(gid),
2294 std::forward_as_tuple(mono_clock::now(), 0));
2295 }
2296
2297
2298 // check beacon timestamps
2299 bool propose_osdmap = false;
2300 bool osdmap_writeable = mon->osdmon()->is_writeable();
2301 for (auto it = last_beacon.begin(); it != last_beacon.end(); ) {
2302 mds_gid_t gid = it->first;
2303 auto beacon_info = it->second;
2304 chrono::duration<double> since_last = now-beacon_info.stamp;
2305
2306 if (!pending.gid_exists(gid)) {
2307 // clean it out
2308 it = last_beacon.erase(it);
2309 continue;
2310 }
2311
2312
2313 if (since_last.count() >= g_conf->mds_beacon_grace) {
2314 auto &info = pending.get_info_gid(gid);
2315 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2316 << " (gid: " << gid << " addr: " << info.addr
2317 << " state: " << ceph_mds_state_name(info.state) << ")"
2318 << " since " << since_last.count() << "s" << dendl;
2319 // If the OSDMap is writeable, we can blacklist things, so we can
2320 // try failing any laggy MDS daemons. Consider each one for failure.
2321 if (osdmap_writeable) {
2322 maybe_replace_gid(pending, gid, info, &do_propose, &propose_osdmap);
2323 }
2324 }
2325
2326 ++it;
2327 }
2328 if (propose_osdmap) {
2329 request_proposal(mon->osdmon());
2330 }
2331
2332 for (auto &p : pending.filesystems) {
2333 auto &fs = p.second;
2334 if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
2335 do_propose |= maybe_promote_standby(pending, fs);
2336 }
2337 }
2338
2339 if (do_propose) {
2340 propose_pending();
2341 }
2342 }
2343
2344 /**
2345 * finfo: the would-be follower
2346 * leader_fs: the Filesystem containing the would-be leader
2347 * ainfo: the would-be leader
2348 */
2349 bool MDSMonitor::try_standby_replay(
2350 FSMap &fsmap,
2351 const MDSMap::mds_info_t& finfo,
2352 const Filesystem &leader_fs,
2353 const MDSMap::mds_info_t& ainfo)
2354 {
2355 // someone else already following?
2356 if (leader_fs.has_standby_replay(ainfo.global_id)) {
2357 dout(20) << " mds." << ainfo.rank << " already has a follower" << dendl;
2358 return false;
2359 } else {
2360 // Assign the new role to the standby
2361 dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
2362 fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
2363 return true;
2364 }
2365 }
2366
2367 MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
2368 : PaxosService(mn, p, service_name)
2369 {
2370 handlers = FileSystemCommandHandler::load(p);
2371 }
2372
2373 void MDSMonitor::on_restart()
2374 {
2375 // Clear out the leader-specific state.
2376 last_tick = mono_clock::now();
2377 last_beacon.clear();
2378 }
2379