]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MDSMonitor.cc
update sources to 12.2.7
[ceph.git] / ceph / src / mon / MDSMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <sstream>
16 #include <boost/utility.hpp>
17 #include <boost/regex.hpp>
18
19 #include "MDSMonitor.h"
20 #include "FSCommands.h"
21 #include "Monitor.h"
22 #include "MonitorDBStore.h"
23 #include "OSDMonitor.h"
24 #include "PGMonitor.h"
25
26 #include "common/strtol.h"
27 #include "common/perf_counters.h"
28 #include "common/config.h"
29 #include "common/cmdparse.h"
30 #include "messages/MMDSMap.h"
31 #include "messages/MFSMap.h"
32 #include "messages/MFSMapUser.h"
33 #include "messages/MMDSLoadTargets.h"
34 #include "messages/MMonCommand.h"
35 #include "messages/MGenericMessage.h"
36
37 #include "include/assert.h"
38 #include "include/str_list.h"
39 #include "include/stringify.h"
40 #include "mds/mdstypes.h"
41 #include "Session.h"
42
43 #define dout_subsys ceph_subsys_mon
44 #undef dout_prefix
45 #define dout_prefix _prefix(_dout, mon, get_fsmap())
46 static ostream& _prefix(std::ostream *_dout, Monitor *mon, const FSMap& fsmap) {
47 return *_dout << "mon." << mon->name << "@" << mon->rank
48 << "(" << mon->get_state_name()
49 << ").mds e" << fsmap.get_epoch() << " ";
50 }
51
52 static const string MDS_METADATA_PREFIX("mds_metadata");
53 static const string MDS_HEALTH_PREFIX("mds_health");
54
55
56 /*
57 * Specialized implementation of cmd_getval to allow us to parse
58 * out strongly-typedef'd types
59 */
60 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
61 const std::string& k, mds_gid_t &val)
62 {
63 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
64 }
65
66 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
67 const std::string& k, mds_rank_t &val)
68 {
69 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
70 }
71
72 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
73 const std::string& k, MDSMap::DaemonState &val)
74 {
75 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
76 }
77
78 // my methods
79
80 void MDSMonitor::print_map(const FSMap &m, int dbl)
81 {
82 dout(dbl) << "print_map\n";
83 m.print(*_dout);
84 *_dout << dendl;
85 }
86
87 // service methods
88 void MDSMonitor::create_initial()
89 {
90 dout(10) << "create_initial" << dendl;
91 }
92
93 void MDSMonitor::get_store_prefixes(std::set<string>& s)
94 {
95 s.insert(service_name);
96 s.insert(MDS_METADATA_PREFIX);
97 s.insert(MDS_HEALTH_PREFIX);
98 }
99
100 void MDSMonitor::update_from_paxos(bool *need_bootstrap)
101 {
102 version_t version = get_last_committed();
103 if (version == get_fsmap().epoch)
104 return;
105
106 dout(10) << __func__ << " version " << version
107 << ", my e " << get_fsmap().epoch << dendl;
108 assert(version > get_fsmap().epoch);
109
110 load_health();
111
112 // read and decode
113 bufferlist fsmap_bl;
114 fsmap_bl.clear();
115 int err = get_version(version, fsmap_bl);
116 assert(err == 0);
117
118 assert(fsmap_bl.length() > 0);
119 dout(10) << __func__ << " got " << version << dendl;
120 PaxosFSMap::decode(fsmap_bl);
121
122 // new map
123 dout(4) << "new map" << dendl;
124 print_map(get_fsmap(), 0);
125 if (!g_conf->mon_mds_skip_sanity) {
126 get_fsmap().sanity();
127 }
128
129 check_subs();
130 update_logger();
131 }
132
133 void MDSMonitor::init()
134 {
135 (void)load_metadata(pending_metadata);
136 }
137
138 void MDSMonitor::create_pending()
139 {
140 auto &fsmap = PaxosFSMap::create_pending();
141
142 if (mon->osdmon()->is_readable()) {
143 const auto &osdmap = mon->osdmon()->osdmap;
144 fsmap.sanitize([&osdmap](int64_t pool){return osdmap.have_pg_pool(pool);});
145 }
146
147 dout(10) << "create_pending e" << fsmap.epoch << dendl;
148 }
149
150 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
151 {
152 auto &pending = get_pending_fsmap_writeable();
153 auto &epoch = pending.epoch;
154
155 dout(10) << "encode_pending e" << epoch << dendl;
156
157 // print map iff 'debug mon = 30' or higher
158 print_map(get_pending_fsmap(), 30);
159 if (!g_conf->mon_mds_skip_sanity) {
160 pending.sanity();
161 }
162
163 // Set 'modified' on maps modified this epoch
164 for (auto &p : pending.filesystems) {
165 if (p.second->mds_map.epoch == epoch) {
166 p.second->mds_map.modified = ceph_clock_now();
167 }
168 }
169
170 // apply to paxos
171 assert(get_last_committed() + 1 == pending.epoch);
172 bufferlist pending_bl;
173 pending.encode(pending_bl, mon->get_quorum_con_features());
174
175 /* put everything in the transaction */
176 put_version(t, pending.epoch, pending_bl);
177 put_last_committed(t, pending.epoch);
178
179 // Encode MDSHealth data
180 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
181 i != pending_daemon_health.end(); ++i) {
182 bufferlist bl;
183 i->second.encode(bl);
184 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
185 }
186
187 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
188 i != pending_daemon_health_rm.end(); ++i) {
189 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
190 }
191 pending_daemon_health_rm.clear();
192 remove_from_metadata(t);
193
194 // health
195 health_check_map_t new_checks;
196 const auto &info_map = pending.get_mds_info();
197 for (const auto &i : info_map) {
198 const auto &gid = i.first;
199 const auto &info = i.second;
200 if (pending_daemon_health_rm.count(gid)) {
201 continue;
202 }
203 MDSHealth health;
204 auto p = pending_daemon_health.find(gid);
205 if (p != pending_daemon_health.end()) {
206 health = p->second;
207 } else {
208 bufferlist bl;
209 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
210 if (!bl.length()) {
211 derr << "Missing health data for MDS " << gid << dendl;
212 continue;
213 }
214 bufferlist::iterator bl_i = bl.begin();
215 health.decode(bl_i);
216 }
217 for (const auto &metric : health.metrics) {
218 const int rank = info.rank;
219 health_check_t *check = &new_checks.get_or_add(
220 mds_metric_name(metric.type),
221 metric.sev,
222 mds_metric_summary(metric.type));
223 ostringstream ss;
224 ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
225 bool first = true;
226 for (auto &p : metric.metadata) {
227 if (first) {
228 ss << " ";
229 } else {
230 ss << ", ";
231 }
232 ss << p.first << ": " << p.second;
233 first = false;
234 }
235 check->detail.push_back(ss.str());
236 }
237 }
238 pending.get_health_checks(&new_checks);
239 for (auto& p : new_checks.checks) {
240 p.second.summary = boost::regex_replace(
241 p.second.summary,
242 boost::regex("%num%"),
243 stringify(p.second.detail.size()));
244 p.second.summary = boost::regex_replace(
245 p.second.summary,
246 boost::regex("%plurals%"),
247 p.second.detail.size() > 1 ? "s" : "");
248 p.second.summary = boost::regex_replace(
249 p.second.summary,
250 boost::regex("%isorare%"),
251 p.second.detail.size() > 1 ? "are" : "is");
252 p.second.summary = boost::regex_replace(
253 p.second.summary,
254 boost::regex("%hasorhave%"),
255 p.second.detail.size() > 1 ? "have" : "has");
256 }
257 encode_health(new_checks, t);
258 }
259
260 version_t MDSMonitor::get_trim_to()
261 {
262 version_t floor = 0;
263 if (g_conf->mon_mds_force_trim_to > 0 &&
264 g_conf->mon_mds_force_trim_to < (int)get_last_committed()) {
265 floor = g_conf->mon_mds_force_trim_to;
266 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
267 << floor << dendl;
268 }
269
270 unsigned max = g_conf->mon_max_mdsmap_epochs;
271 version_t last = get_last_committed();
272
273 if (last - get_first_committed() > max && floor < last - max)
274 return last - max;
275 return floor;
276 }
277
278 void MDSMonitor::update_logger()
279 {
280 dout(10) << "update_logger" << dendl;
281
282 const auto &fsmap = get_fsmap();
283
284 uint64_t up = 0;
285 uint64_t in = 0;
286 uint64_t failed = 0;
287 for (const auto &i : fsmap.filesystems) {
288 const MDSMap &mds_map = i.second->mds_map;
289
290 up += mds_map.get_num_up_mds();
291 in += mds_map.get_num_in_mds();
292 failed += mds_map.get_num_failed_mds();
293 }
294 mon->cluster_logger->set(l_cluster_num_mds_up, up);
295 mon->cluster_logger->set(l_cluster_num_mds_in, in);
296 mon->cluster_logger->set(l_cluster_num_mds_failed, failed);
297 mon->cluster_logger->set(l_cluster_mds_epoch, fsmap.get_epoch());
298 }
299
300 bool MDSMonitor::preprocess_query(MonOpRequestRef op)
301 {
302 op->mark_mdsmon_event(__func__);
303 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
304 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
305
306 switch (m->get_type()) {
307
308 case MSG_MDS_BEACON:
309 return preprocess_beacon(op);
310
311 case MSG_MON_COMMAND:
312 return preprocess_command(op);
313
314 case MSG_MDS_OFFLOAD_TARGETS:
315 return preprocess_offload_targets(op);
316
317 default:
318 ceph_abort();
319 return true;
320 }
321 }
322
323 void MDSMonitor::_note_beacon(MMDSBeacon *m)
324 {
325 mds_gid_t gid = mds_gid_t(m->get_global_id());
326 version_t seq = m->get_seq();
327
328 dout(15) << "_note_beacon " << *m << " noting time" << dendl;
329 last_beacon[gid].stamp = ceph_clock_now();
330 last_beacon[gid].seq = seq;
331 }
332
333 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
334 {
335 op->mark_mdsmon_event(__func__);
336 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
337 MDSMap::DaemonState state = m->get_state();
338 mds_gid_t gid = m->get_global_id();
339 version_t seq = m->get_seq();
340 MDSMap::mds_info_t info;
341 epoch_t effective_epoch = 0;
342
343 const auto &fsmap = get_working_fsmap();
344
345 // check privileges, ignore if fails
346 MonSession *session = m->get_session();
347 assert(session);
348 if (!session->is_capable("mds", MON_CAP_X)) {
349 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
350 << session->caps << dendl;
351 goto ignore;
352 }
353
354 if (m->get_fsid() != mon->monmap->fsid) {
355 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
356 goto ignore;
357 }
358
359 dout(12) << "preprocess_beacon " << *m
360 << " from " << m->get_orig_source_inst()
361 << " " << m->get_compat()
362 << dendl;
363
364 // make sure the address has a port
365 if (m->get_orig_source_addr().get_port() == 0) {
366 dout(1) << " ignoring boot message without a port" << dendl;
367 goto ignore;
368 }
369
370 // check compat
371 if (!m->get_compat().writeable(fsmap.compat)) {
372 dout(1) << " mds " << m->get_source_inst() << " can't write to fsmap " << fsmap.compat << dendl;
373 goto ignore;
374 }
375
376 // fw to leader?
377 if (!is_leader())
378 return false;
379
380 // booted, but not in map?
381 if (!fsmap.gid_exists(gid)) {
382 if (state != MDSMap::STATE_BOOT) {
383 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
384 << ceph_mds_state_name(state) << ")" << dendl;
385
386 MDSMap null_map;
387 null_map.epoch = fsmap.epoch;
388 null_map.compat = fsmap.compat;
389 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
390 return true;
391 } else {
392 return false; // not booted yet.
393 }
394 }
395 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
396 info = fsmap.get_info_gid(gid);
397
398 // old seq?
399 if (info.state_seq > seq) {
400 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
401 goto ignore;
402 }
403
404 // Work out the latest epoch that this daemon should have seen
405 {
406 fs_cluster_id_t fscid = fsmap.mds_roles.at(gid);
407 if (fscid == FS_CLUSTER_ID_NONE) {
408 effective_epoch = fsmap.standby_epochs.at(gid);
409 } else {
410 effective_epoch = fsmap.get_filesystem(fscid)->mds_map.epoch;
411 }
412 if (effective_epoch != m->get_last_epoch_seen()) {
413 dout(10) << "mds_beacon " << *m
414 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
415 goto reply;
416 }
417 }
418
419 if (info.laggy()) {
420 _note_beacon(m);
421 return false; // no longer laggy, need to update map.
422 }
423 if (state == MDSMap::STATE_BOOT) {
424 // ignore, already booted.
425 goto ignore;
426 }
427 // is there a state change here?
428 if (info.state != state) {
429 // legal state change?
430 if ((info.state == MDSMap::STATE_STANDBY ||
431 info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
432 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
433 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
434 goto reply;
435 }
436
437 if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
438 && info.rank != MDS_RANK_NONE)
439 {
440 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
441 "held rank " << info.rank << " while requesting state "
442 << ceph_mds_state_name(state) << dendl;
443 goto reply;
444 }
445
446 _note_beacon(m);
447 return false;
448 }
449
450 // Comparing known daemon health with m->get_health()
451 // and return false (i.e. require proposal) if they
452 // do not match, to update our stored
453 if (!(pending_daemon_health[gid] == m->get_health())) {
454 dout(20) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
455 _note_beacon(m);
456 return false;
457 }
458
459 reply:
460 // note time and reply
461 assert(effective_epoch > 0);
462 _note_beacon(m);
463 mon->send_reply(op,
464 new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
465 effective_epoch, state, seq,
466 CEPH_FEATURES_SUPPORTED_DEFAULT));
467 return true;
468
469 ignore:
470 // I won't reply this beacon, drop it.
471 mon->no_reply(op);
472 return true;
473 }
474
475 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
476 {
477 op->mark_mdsmon_event(__func__);
478 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
479 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
480
481 auto &fsmap = get_working_fsmap();
482
483 // check privileges, ignore message if fails
484 MonSession *session = m->get_session();
485 if (!session)
486 goto done;
487 if (!session->is_capable("mds", MON_CAP_X)) {
488 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
489 << session->caps << dendl;
490 goto done;
491 }
492
493 if (fsmap.gid_exists(m->global_id) &&
494 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
495 goto done;
496
497 return false;
498
499 done:
500 return true;
501 }
502
503
504 bool MDSMonitor::prepare_update(MonOpRequestRef op)
505 {
506 op->mark_mdsmon_event(__func__);
507 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
508 dout(7) << "prepare_update " << *m << dendl;
509
510 switch (m->get_type()) {
511
512 case MSG_MDS_BEACON:
513 return prepare_beacon(op);
514
515 case MSG_MON_COMMAND:
516 return prepare_command(op);
517
518 case MSG_MDS_OFFLOAD_TARGETS:
519 return prepare_offload_targets(op);
520
521 default:
522 ceph_abort();
523 }
524
525 return true;
526 }
527
528 bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
529 {
530 op->mark_mdsmon_event(__func__);
531 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
532 // -- this is an update --
533 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source_inst() << dendl;
534 entity_addr_t addr = m->get_orig_source_inst().addr;
535 mds_gid_t gid = m->get_global_id();
536 MDSMap::DaemonState state = m->get_state();
537 version_t seq = m->get_seq();
538
539 auto &pending = get_pending_fsmap_writeable();
540
541 dout(20) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
542
543 // Calculate deltas of health metrics created and removed
544 // Do this by type rather than MDSHealthMetric equality, because messages can
545 // change a lot when they include e.g. a number of items.
546 const auto &old_health = pending_daemon_health[gid].metrics;
547 const auto &new_health = m->get_health().metrics;
548
549 std::set<mds_metric_t> old_types;
550 for (const auto &i : old_health) {
551 old_types.insert(i.type);
552 }
553
554 std::set<mds_metric_t> new_types;
555 for (const auto &i : new_health) {
556 new_types.insert(i.type);
557 }
558
559 for (const auto &new_metric: new_health) {
560 if (old_types.count(new_metric.type) == 0) {
561 dout(10) << "MDS health message (" << m->get_orig_source_inst().name
562 << "): " << new_metric.sev << " " << new_metric.message << dendl;
563 }
564 }
565
566 // Log the disappearance of health messages at INFO
567 for (const auto &old_metric : old_health) {
568 if (new_types.count(old_metric.type) == 0) {
569 mon->clog->info() << "MDS health message cleared ("
570 << m->get_orig_source_inst().name << "): " << old_metric.message;
571 }
572 }
573
574 // Store health
575 pending_daemon_health[gid] = m->get_health();
576
577 // boot?
578 if (state == MDSMap::STATE_BOOT) {
579 // zap previous instance of this name?
580 if (g_conf->mds_enforce_unique_name) {
581 bool failed_mds = false;
582 while (mds_gid_t existing = pending.find_mds_gid_by_name(m->get_name())) {
583 if (!mon->osdmon()->is_writeable()) {
584 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
585 return false;
586 }
587 const MDSMap::mds_info_t &existing_info =
588 pending.get_info_gid(existing);
589 mon->clog->info() << existing_info.human_name() << " restarted";
590 fail_mds_gid(existing);
591 failed_mds = true;
592 }
593 if (failed_mds) {
594 assert(mon->osdmon()->is_writeable());
595 request_proposal(mon->osdmon());
596 }
597 }
598
599 // Add this daemon to the map
600 if (pending.mds_roles.count(gid) == 0) {
601 MDSMap::mds_info_t new_info;
602 new_info.global_id = gid;
603 new_info.name = m->get_name();
604 new_info.addr = addr;
605 new_info.mds_features = m->get_mds_features();
606 new_info.state = MDSMap::STATE_STANDBY;
607 new_info.state_seq = seq;
608 new_info.standby_for_rank = m->get_standby_for_rank();
609 new_info.standby_for_name = m->get_standby_for_name();
610 new_info.standby_for_fscid = m->get_standby_for_fscid();
611 new_info.standby_replay = m->get_standby_replay();
612 pending.insert(new_info);
613 }
614
615 // Resolve standby_for_name to a rank
616 const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
617 if (!info.standby_for_name.empty()) {
618 const MDSMap::mds_info_t *leaderinfo = pending.find_by_name(
619 info.standby_for_name);
620 if (leaderinfo && (leaderinfo->rank >= 0)) {
621 const auto &fscid = pending.mds_roles.at(leaderinfo->global_id);
622 const auto &fs = pending.get_filesystem(fscid);
623
624 pending.modify_daemon(gid, [fscid, leaderinfo](
625 MDSMap::mds_info_t *info) {
626 info->standby_for_rank = leaderinfo->rank;
627 info->standby_for_fscid = fscid;
628 });
629 }
630 }
631
632 // initialize the beacon timer
633 last_beacon[gid].stamp = ceph_clock_now();
634 last_beacon[gid].seq = seq;
635
636 // new incompat?
637 if (!pending.compat.writeable(m->get_compat())) {
638 dout(10) << " fsmap " << pending.compat
639 << " can't write to new mds' " << m->get_compat()
640 << ", updating fsmap and killing old mds's"
641 << dendl;
642 pending.update_compat(m->get_compat());
643 }
644
645 update_metadata(m->get_global_id(), m->get_sys_info());
646 } else {
647 // state update
648 const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
649 // Old MDS daemons don't mention that they're standby replay until
650 // after they've sent their boot beacon, so update this field.
651 if (info.standby_replay != m->get_standby_replay()) {
652 pending.modify_daemon(info.global_id, [&m](
653 MDSMap::mds_info_t *i)
654 {
655 i->standby_replay = m->get_standby_replay();
656 });
657 }
658
659 if (info.state == MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPED ) {
660 // we can't transition to any other states from STOPPING
661 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
662 << dendl;
663 _note_beacon(m);
664 return true;
665 }
666
667 if (info.laggy()) {
668 dout(10) << "prepare_beacon clearing laggy flag on " << addr << dendl;
669 pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
670 {
671 info->clear_laggy();
672 }
673 );
674 }
675
676 dout(10) << "prepare_beacon mds." << info.rank
677 << " " << ceph_mds_state_name(info.state)
678 << " -> " << ceph_mds_state_name(state)
679 << " standby_for_rank=" << m->get_standby_for_rank()
680 << dendl;
681 if (state == MDSMap::STATE_STOPPED) {
682 const auto fscid = pending.mds_roles.at(gid);
683 const auto &fs = pending.get_filesystem(fscid);
684
685 mon->clog->info() << info.human_name() << " finished "
686 << "deactivating rank " << info.rank << " in filesystem "
687 << fs->mds_map.fs_name << " (now has "
688 << fs->mds_map.get_num_in_mds() - 1 << " ranks)";
689
690 auto erased = pending.stop(gid);
691 erased.push_back(gid);
692
693 for (const auto &erased_gid : erased) {
694 last_beacon.erase(erased_gid);
695 if (pending_daemon_health.count(erased_gid)) {
696 pending_daemon_health.erase(erased_gid);
697 pending_daemon_health_rm.insert(erased_gid);
698 }
699 }
700
701
702 } else if (state == MDSMap::STATE_DAMAGED) {
703 if (!mon->osdmon()->is_writeable()) {
704 dout(4) << __func__ << ": DAMAGED from rank " << info.rank
705 << " waiting for osdmon writeable to blacklist it" << dendl;
706 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
707 return false;
708 }
709
710 // Record this MDS rank as damaged, so that other daemons
711 // won't try to run it.
712 dout(4) << __func__ << ": marking rank "
713 << info.rank << " damaged" << dendl;
714
715 utime_t until = ceph_clock_now();
716 until += g_conf->get_val<double>("mon_mds_blacklist_interval");
717 const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
718 request_proposal(mon->osdmon());
719 pending.damaged(gid, blacklist_epoch);
720 last_beacon.erase(gid);
721
722 // Respond to MDS, so that it knows it can continue to shut down
723 mon->send_reply(op,
724 new MMDSBeacon(
725 mon->monmap->fsid, m->get_global_id(),
726 m->get_name(), pending.get_epoch(), state, seq,
727 CEPH_FEATURES_SUPPORTED_DEFAULT));
728 } else if (state == MDSMap::STATE_DNE) {
729 if (!mon->osdmon()->is_writeable()) {
730 dout(4) << __func__ << ": DNE from rank " << info.rank
731 << " waiting for osdmon writeable to blacklist it" << dendl;
732 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
733 return false;
734 }
735
736 fail_mds_gid(gid);
737 assert(mon->osdmon()->is_writeable());
738 request_proposal(mon->osdmon());
739
740 // Respond to MDS, so that it knows it can continue to shut down
741 mon->send_reply(op,
742 new MMDSBeacon(
743 mon->monmap->fsid, m->get_global_id(),
744 m->get_name(), pending.get_epoch(), state, seq,
745 CEPH_FEATURES_SUPPORTED_DEFAULT));
746 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
747 // Standby daemons should never modify their own
748 // state. Reject any attempts to do so.
749 derr << "standby " << gid << " attempted to change state to "
750 << ceph_mds_state_name(state) << ", rejecting" << dendl;
751 return true;
752 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
753 !MDSMap::state_transition_valid(info.state, state)) {
754 // Validate state transitions for daemons that hold a rank
755 derr << "daemon " << gid << " (rank " << info.rank << ") "
756 << "reported invalid state transition "
757 << ceph_mds_state_name(info.state) << " -> "
758 << ceph_mds_state_name(state) << dendl;
759 return true;
760 } else {
761 if (info.state != MDSMap::STATE_ACTIVE && state == MDSMap::STATE_ACTIVE) {
762 const auto &fscid = pending.mds_roles.at(gid);
763 const auto &fs = pending.get_filesystem(fscid);
764 mon->clog->info() << info.human_name() << " is now active in "
765 << "filesystem " << fs->mds_map.fs_name << " as rank "
766 << info.rank;
767 }
768
769 // Made it through special cases and validations, record the
770 // daemon's reported state to the FSMap.
771 pending.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
772 info->state = state;
773 info->state_seq = seq;
774 });
775 }
776 }
777
778 dout(7) << "prepare_beacon pending map now:" << dendl;
779 print_map(pending);
780
781 wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
782 if (r >= 0)
783 _updated(op); // success
784 else if (r == -ECANCELED) {
785 mon->no_reply(op);
786 } else {
787 dispatch(op); // try again
788 }
789 }));
790
791 return true;
792 }
793
794 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
795 {
796 auto &pending = get_pending_fsmap_writeable();
797
798 op->mark_mdsmon_event(__func__);
799 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
800 mds_gid_t gid = m->global_id;
801 if (pending.gid_has_rank(gid)) {
802 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
803 pending.update_export_targets(gid, m->targets);
804 } else {
805 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
806 }
807 return true;
808 }
809
810 bool MDSMonitor::should_propose(double& delay)
811 {
812 // delegate to PaxosService to assess whether we should propose
813 return PaxosService::should_propose(delay);
814 }
815
816 void MDSMonitor::_updated(MonOpRequestRef op)
817 {
818 const auto &fsmap = get_fsmap();
819 op->mark_mdsmon_event(__func__);
820 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
821 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
822 mon->clog->debug() << m->get_orig_source_inst() << " "
823 << ceph_mds_state_name(m->get_state());
824
825 if (m->get_state() == MDSMap::STATE_STOPPED) {
826 // send the map manually (they're out of the map, so they won't get it automatic)
827 MDSMap null_map;
828 null_map.epoch = fsmap.epoch;
829 null_map.compat = fsmap.compat;
830 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
831 } else {
832 mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid,
833 m->get_global_id(),
834 m->get_name(),
835 fsmap.get_epoch(),
836 m->get_state(),
837 m->get_seq(),
838 CEPH_FEATURES_SUPPORTED_DEFAULT));
839 }
840 }
841
842 void MDSMonitor::on_active()
843 {
844 tick();
845 update_logger();
846
847 if (is_leader()) {
848 mon->clog->debug() << "fsmap " << get_fsmap();
849 }
850 }
851
852 void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
853 list<pair<health_status_t, string> > *detail,
854 CephContext* cct) const
855 {
856 const auto &fsmap = get_fsmap();
857
858 fsmap.get_health(summary, detail);
859
860 // For each MDS GID...
861 const auto &info_map = fsmap.get_mds_info();
862 for (const auto &i : info_map) {
863 const auto &gid = i.first;
864 const auto &info = i.second;
865
866 // Decode MDSHealth
867 bufferlist bl;
868 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
869 if (!bl.length()) {
870 derr << "Missing health data for MDS " << gid << dendl;
871 continue;
872 }
873 MDSHealth health;
874 bufferlist::iterator bl_i = bl.begin();
875 health.decode(bl_i);
876
877 for (const auto &metric : health.metrics) {
878 const int rank = info.rank;
879 std::ostringstream message;
880 message << "mds" << rank << ": " << metric.message;
881 summary.push_back(std::make_pair(metric.sev, message.str()));
882
883 if (detail) {
884 // There is no way for us to clealy associate detail entries with summary entries (#7192), so
885 // we duplicate the summary message in the detail string and tag the metadata on.
886 std::ostringstream detail_message;
887 detail_message << message.str();
888 if (metric.metadata.size()) {
889 detail_message << "(";
890 auto k = metric.metadata.begin();
891 while (k != metric.metadata.end()) {
892 detail_message << k->first << ": " << k->second;
893 if (boost::next(k) != metric.metadata.end()) {
894 detail_message << ", ";
895 }
896 ++k;
897 }
898 detail_message << ")";
899 }
900 detail->push_back(std::make_pair(metric.sev, detail_message.str()));
901 }
902 }
903 }
904 }
905
906 void MDSMonitor::dump_info(Formatter *f)
907 {
908 f->open_object_section("fsmap");
909 get_fsmap().dump(f);
910 f->close_section();
911
912 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
913 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
914 }
915
916 bool MDSMonitor::preprocess_command(MonOpRequestRef op)
917 {
918 op->mark_mdsmon_event(__func__);
919 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
920 int r = -1;
921 bufferlist rdata;
922 stringstream ss, ds;
923
924 map<string, cmd_vartype> cmdmap;
925 const auto &fsmap = get_working_fsmap();
926
927 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
928 // ss has reason for failure
929 string rs = ss.str();
930 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
931 return true;
932 }
933
934 string prefix;
935 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
936 string format;
937 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
938 boost::scoped_ptr<Formatter> f(Formatter::create(format));
939
940 MonSession *session = m->get_session();
941 if (!session) {
942 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
943 return true;
944 }
945
946 if (prefix == "mds stat") {
947 if (f) {
948 f->open_object_section("mds_stat");
949 dump_info(f.get());
950 f->close_section();
951 f->flush(ds);
952 } else {
953 ds << fsmap;
954 }
955 r = 0;
956 } else if (prefix == "mds dump") {
957 int64_t epocharg;
958 epoch_t epoch;
959
960 const FSMap *fsmapp = &get_fsmap();
961 FSMap dummy;
962 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
963 epoch = epocharg;
964 bufferlist b;
965 int err = get_version(epoch, b);
966 if (err == -ENOENT) {
967 r = -ENOENT;
968 goto out;
969 } else {
970 assert(err == 0);
971 assert(b.length());
972 dummy.decode(b);
973 fsmapp = &dummy;
974 }
975 }
976
977 stringstream ds;
978 const MDSMap *mdsmapp = nullptr;
979 MDSMap blank;
980 blank.epoch = fsmapp->epoch;
981 if (fsmapp->legacy_client_fscid != FS_CLUSTER_ID_NONE) {
982 mdsmapp = &fsmapp->filesystems.at(fsmapp->legacy_client_fscid)->mds_map;
983 } else {
984 mdsmapp = &blank;
985 }
986 if (f != NULL) {
987 f->open_object_section("mdsmap");
988 mdsmapp->dump(f.get());
989 f->close_section();
990 f->flush(ds);
991 r = 0;
992 } else {
993 mdsmapp->print(ds);
994 r = 0;
995 }
996
997 rdata.append(ds);
998 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
999 } else if (prefix == "fs dump") {
1000 int64_t epocharg;
1001 epoch_t epoch;
1002
1003 const FSMap *fsmapp = &get_fsmap();
1004 FSMap dummy;
1005 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
1006 epoch = epocharg;
1007 bufferlist b;
1008 int err = get_version(epoch, b);
1009 if (err == -ENOENT) {
1010 r = -ENOENT;
1011 goto out;
1012 } else {
1013 assert(err == 0);
1014 assert(b.length());
1015 dummy.decode(b);
1016 fsmapp = &dummy;
1017 }
1018 }
1019
1020 stringstream ds;
1021 if (f != NULL) {
1022 f->open_object_section("fsmap");
1023 fsmapp->dump(f.get());
1024 f->close_section();
1025 f->flush(ds);
1026 r = 0;
1027 } else {
1028 fsmapp->print(ds);
1029 r = 0;
1030 }
1031
1032 rdata.append(ds);
1033 ss << "dumped fsmap epoch " << fsmapp->get_epoch();
1034 } else if (prefix == "mds metadata") {
1035 if (!f)
1036 f.reset(Formatter::create("json-pretty"));
1037
1038 string who;
1039 bool all = !cmd_getval(g_ceph_context, cmdmap, "who", who);
1040 dout(1) << "all = " << all << dendl;
1041 if (all) {
1042 r = 0;
1043 // Dump all MDSs' metadata
1044 const auto all_info = fsmap.get_mds_info();
1045
1046 f->open_array_section("mds_metadata");
1047 for(const auto &i : all_info) {
1048 const auto &info = i.second;
1049
1050 f->open_object_section("mds");
1051 f->dump_string("name", info.name);
1052 std::ostringstream get_err;
1053 r = dump_metadata(info.name, f.get(), get_err);
1054 if (r == -EINVAL || r == -ENOENT) {
1055 // Drop error, list what metadata we do have
1056 dout(1) << get_err.str() << dendl;
1057 r = 0;
1058 } else if (r != 0) {
1059 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1060 << dendl;
1061 ss << get_err.str();
1062 f->close_section();
1063 break;
1064 }
1065 f->close_section();
1066 }
1067 f->close_section();
1068 } else {
1069 // Dump a single daemon's metadata
1070 f->open_object_section("mds_metadata");
1071 r = dump_metadata(who, f.get(), ss);
1072 f->close_section();
1073 }
1074 f->flush(ds);
1075 } else if (prefix == "mds versions") {
1076 if (!f)
1077 f.reset(Formatter::create("json-pretty"));
1078 count_metadata("ceph_version", f.get());
1079 f->flush(ds);
1080 r = 0;
1081 } else if (prefix == "mds count-metadata") {
1082 if (!f)
1083 f.reset(Formatter::create("json-pretty"));
1084 string field;
1085 cmd_getval(g_ceph_context, cmdmap, "property", field);
1086 count_metadata(field, f.get());
1087 f->flush(ds);
1088 r = 0;
1089 } else if (prefix == "mds getmap") {
1090 epoch_t e;
1091 int64_t epocharg;
1092 bufferlist b;
1093 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
1094 e = epocharg;
1095 int err = get_version(e, b);
1096 if (err == -ENOENT) {
1097 r = -ENOENT;
1098 } else {
1099 assert(err == 0);
1100 assert(b.length());
1101 FSMap mm;
1102 mm.decode(b);
1103 mm.encode(rdata, m->get_connection()->get_features());
1104 ss << "got fsmap epoch " << mm.get_epoch();
1105 r = 0;
1106 }
1107 } else {
1108 fsmap.encode(rdata, m->get_connection()->get_features());
1109 ss << "got fsmap epoch " << fsmap.get_epoch();
1110 r = 0;
1111 }
1112 } else if (prefix == "mds compat show") {
1113 if (f) {
1114 f->open_object_section("mds_compat");
1115 fsmap.compat.dump(f.get());
1116 f->close_section();
1117 f->flush(ds);
1118 } else {
1119 ds << fsmap.compat;
1120 }
1121 r = 0;
1122 } else if (prefix == "fs get") {
1123 string fs_name;
1124 cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
1125 const auto &fs = fsmap.get_filesystem(fs_name);
1126 if (fs == nullptr) {
1127 ss << "filesystem '" << fs_name << "' not found";
1128 r = -ENOENT;
1129 } else {
1130 if (f != nullptr) {
1131 f->open_object_section("filesystem");
1132 fs->dump(f.get());
1133 f->close_section();
1134 f->flush(ds);
1135 r = 0;
1136 } else {
1137 fs->print(ds);
1138 r = 0;
1139 }
1140 }
1141 } else if (prefix == "fs ls") {
1142 if (f) {
1143 f->open_array_section("filesystems");
1144 {
1145 for (const auto &p : fsmap.filesystems) {
1146 const auto &fs = p.second;
1147 f->open_object_section("filesystem");
1148 {
1149 const MDSMap &mds_map = fs->mds_map;
1150 f->dump_string("name", mds_map.fs_name);
1151 /* Output both the names and IDs of pools, for use by
1152 * humans and machines respectively */
1153 f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
1154 mds_map.metadata_pool));
1155 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1156 f->open_array_section("data_pool_ids");
1157 {
1158 for (auto dpi = mds_map.data_pools.begin();
1159 dpi != mds_map.data_pools.end(); ++dpi) {
1160 f->dump_int("data_pool_id", *dpi);
1161 }
1162 }
1163 f->close_section();
1164
1165 f->open_array_section("data_pools");
1166 {
1167 for (auto dpi = mds_map.data_pools.begin();
1168 dpi != mds_map.data_pools.end(); ++dpi) {
1169 const auto &name = mon->osdmon()->osdmap.get_pool_name(
1170 *dpi);
1171 f->dump_string("data_pool", name);
1172 }
1173 }
1174
1175 f->close_section();
1176 }
1177 f->close_section();
1178 }
1179 }
1180 f->close_section();
1181 f->flush(ds);
1182 } else {
1183 for (const auto &p : fsmap.filesystems) {
1184 const auto &fs = p.second;
1185 const MDSMap &mds_map = fs->mds_map;
1186 const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
1187 mds_map.metadata_pool);
1188
1189 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1190 << md_pool_name << ", data pools: [";
1191 for (auto dpi : mds_map.data_pools) {
1192 const string &pool_name = mon->osdmon()->osdmap.get_pool_name(dpi);
1193 ds << pool_name << " ";
1194 }
1195 ds << "]" << std::endl;
1196 }
1197
1198 if (fsmap.filesystems.empty()) {
1199 ds << "No filesystems enabled" << std::endl;
1200 }
1201 }
1202 r = 0;
1203 }
1204
1205 out:
1206 if (r != -1) {
1207 rdata.append(ds);
1208 string rs;
1209 getline(ss, rs);
1210 mon->reply_command(op, r, rs, rdata, get_last_committed());
1211 return true;
1212 } else
1213 return false;
1214 }
1215
1216 bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
1217 {
1218 auto &pending = get_pending_fsmap_writeable();
1219
1220 const MDSMap::mds_info_t &info = pending.get_info_gid(gid);
1221 dout(10) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
1222
1223 epoch_t blacklist_epoch = 0;
1224 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1225 utime_t until = ceph_clock_now();
1226 until += g_conf->get_val<double>("mon_mds_blacklist_interval");
1227 blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
1228 }
1229
1230 pending.erase(gid, blacklist_epoch);
1231 last_beacon.erase(gid);
1232 if (pending_daemon_health.count(gid)) {
1233 pending_daemon_health.erase(gid);
1234 pending_daemon_health_rm.insert(gid);
1235 }
1236
1237 return blacklist_epoch != 0;
1238 }
1239
1240 mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
1241 {
1242 const auto &fsmap = get_working_fsmap();
1243
1244 // Try parsing as a role
1245 mds_role_t role;
1246 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1247 int r = parse_role(arg, &role, ignore_err);
1248 if (r == 0) {
1249 // See if a GID is assigned to this role
1250 const auto &fs = fsmap.get_filesystem(role.fscid);
1251 assert(fs != nullptr); // parse_role ensures it exists
1252 if (fs->mds_map.is_up(role.rank)) {
1253 dout(10) << __func__ << ": validated rank/GID " << role
1254 << " as a rank" << dendl;
1255 return fs->mds_map.get_mds_info(role.rank).global_id;
1256 }
1257 }
1258
1259 // Try parsing as a gid
1260 std::string err;
1261 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1262 if (!err.empty()) {
1263 // Not a role or a GID, try as a daemon name
1264 const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
1265 if (!mds_info) {
1266 ss << "MDS named '" << arg
1267 << "' does not exist, or is not up";
1268 return MDS_GID_NONE;
1269 }
1270 dout(10) << __func__ << ": resolved MDS name '" << arg
1271 << "' to GID " << mds_info->global_id << dendl;
1272 return mds_info->global_id;
1273 } else {
1274 // Not a role, but parses as a an integer, might be a GID
1275 dout(10) << __func__ << ": treating MDS reference '" << arg
1276 << "' as an integer " << maybe_gid << dendl;
1277
1278 if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
1279 return mds_gid_t(maybe_gid);
1280 }
1281 }
1282
1283 dout(1) << __func__ << ": rank/GID " << arg
1284 << " not a existent rank or GID" << dendl;
1285 return MDS_GID_NONE;
1286 }
1287
1288 int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg,
1289 MDSMap::mds_info_t *failed_info)
1290 {
1291 assert(failed_info != nullptr);
1292
1293 mds_gid_t gid = gid_from_arg(arg, ss);
1294 if (gid == MDS_GID_NONE) {
1295 return 0;
1296 }
1297 if (!mon->osdmon()->is_writeable()) {
1298 return -EAGAIN;
1299 }
1300
1301 // Take a copy of the info before removing the MDS from the map,
1302 // so that the caller knows which mds (if any) they ended up removing.
1303 *failed_info = get_pending_fsmap().get_info_gid(gid);
1304
1305 fail_mds_gid(gid);
1306 ss << "failed mds gid " << gid;
1307 assert(mon->osdmon()->is_writeable());
1308 request_proposal(mon->osdmon());
1309 return 0;
1310 }
1311
1312 bool MDSMonitor::prepare_command(MonOpRequestRef op)
1313 {
1314 op->mark_mdsmon_event(__func__);
1315 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
1316 int r = -EINVAL;
1317 stringstream ss;
1318 bufferlist rdata;
1319
1320 map<string, cmd_vartype> cmdmap;
1321 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1322 string rs = ss.str();
1323 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1324 return true;
1325 }
1326
1327 string prefix;
1328 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
1329
1330 /* Refuse access if message not associated with a valid session */
1331 MonSession *session = m->get_session();
1332 if (!session) {
1333 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1334 return true;
1335 }
1336
1337 auto &pending = get_pending_fsmap_writeable();
1338
1339 bool batched_propose = false;
1340 for (const auto &h : handlers) {
1341 if (h->can_handle(prefix)) {
1342 batched_propose = h->batched_propose();
1343 if (batched_propose) {
1344 paxos->plug();
1345 }
1346 r = h->handle(mon, pending, op, cmdmap, ss);
1347 if (batched_propose) {
1348 paxos->unplug();
1349 }
1350
1351 if (r == -EAGAIN) {
1352 // message has been enqueued for retry; return.
1353 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1354 return false;
1355 } else {
1356 if (r == 0) {
1357 // On successful updates, print the updated map
1358 print_map(pending);
1359 }
1360 // Successful or not, we're done: respond.
1361 goto out;
1362 }
1363 }
1364 }
1365
1366 r = filesystem_command(op, prefix, cmdmap, ss);
1367 if (r >= 0) {
1368 goto out;
1369 } else if (r == -EAGAIN) {
1370 // Do not reply, the message has been enqueued for retry
1371 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1372 return false;
1373 } else if (r != -ENOSYS) {
1374 goto out;
1375 }
1376
1377 // Only handle legacy commands if there is a filesystem configured
1378 if (pending.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1379 if (pending.filesystems.size() == 0) {
1380 ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
1381 } else {
1382 ss << "No filesystem set for use with legacy commands";
1383 }
1384 r = -EINVAL;
1385 goto out;
1386 }
1387
1388 r = legacy_filesystem_command(op, prefix, cmdmap, ss);
1389
1390 if (r == -ENOSYS && ss.str().empty()) {
1391 ss << "unrecognized command";
1392 }
1393
1394 out:
1395 dout(4) << __func__ << " done, r=" << r << dendl;
1396 /* Compose response */
1397 string rs;
1398 getline(ss, rs);
1399
1400 if (r >= 0) {
1401 // success.. delay reply
1402 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1403 get_last_committed() + 1));
1404 if (batched_propose) {
1405 force_immediate_propose();
1406 }
1407 return true;
1408 } else {
1409 // reply immediately
1410 mon->reply_command(op, r, rs, rdata, get_last_committed());
1411 return false;
1412 }
1413 }
1414
1415
1416 /**
1417 * Given one of the following forms:
1418 * <fs name>:<rank>
1419 * <fs id>:<rank>
1420 * <rank>
1421 *
1422 * Parse into a mds_role_t. The rank-only form is only valid
1423 * if legacy_client_ns is set.
1424 */
1425 int MDSMonitor::parse_role(
1426 const std::string &role_str,
1427 mds_role_t *role,
1428 std::ostream &ss)
1429 {
1430 return get_working_fsmap().parse_role(role_str, role, ss);
1431 }
1432
1433 int MDSMonitor::filesystem_command(
1434 MonOpRequestRef op,
1435 std::string const &prefix,
1436 map<string, cmd_vartype> &cmdmap,
1437 std::stringstream &ss)
1438 {
1439 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1440 op->mark_mdsmon_event(__func__);
1441 int r = 0;
1442 string whostr;
1443 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1444
1445 auto &pending = get_pending_fsmap_writeable();
1446 if (prefix == "mds stop" ||
1447 prefix == "mds deactivate") {
1448 mds_role_t role;
1449 r = parse_role(whostr, &role, ss);
1450 if (r < 0 ) {
1451 return r;
1452 }
1453 const auto &fs = pending.get_filesystem(role.fscid);
1454
1455 if (!fs->mds_map.is_active(role.rank)) {
1456 r = -EEXIST;
1457 ss << "mds." << role << " not active ("
1458 << ceph_mds_state_name(fs->mds_map.get_state(role.rank)) << ")";
1459 } else if (fs->mds_map.get_root() == role.rank ||
1460 fs->mds_map.get_tableserver() == role.rank) {
1461 r = -EINVAL;
1462 ss << "can't tell the root (" << fs->mds_map.get_root()
1463 << ") or tableserver (" << fs->mds_map.get_tableserver()
1464 << ") to deactivate";
1465 } else if (role.rank != fs->mds_map.get_last_in_mds()) {
1466 r = -EINVAL;
1467 ss << "mds." << role << " doesn't have the max rank ("
1468 << fs->mds_map.get_last_in_mds() << ")";
1469 } else if (fs->mds_map.get_num_in_mds() <= size_t(fs->mds_map.get_max_mds())) {
1470 r = -EBUSY;
1471 ss << "must decrease max_mds or else MDS will immediately reactivate";
1472 } else {
1473 r = 0;
1474 mds_gid_t gid = fs->mds_map.up.at(role.rank);
1475 ss << "telling mds." << role << " "
1476 << pending.get_info_gid(gid).addr << " to deactivate";
1477
1478 pending.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
1479 info->state = MDSMap::STATE_STOPPING;
1480 });
1481 }
1482 } else if (prefix == "mds set_state") {
1483 mds_gid_t gid;
1484 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1485 ss << "error parsing 'gid' value '"
1486 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1487 return -EINVAL;
1488 }
1489 MDSMap::DaemonState state;
1490 if (!cmd_getval(g_ceph_context, cmdmap, "state", state)) {
1491 ss << "error parsing 'state' string value '"
1492 << cmd_vartype_stringify(cmdmap["state"]) << "'";
1493 return -EINVAL;
1494 }
1495 if (pending.gid_exists(gid)) {
1496 pending.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
1497 info->state = state;
1498 });
1499 ss << "set mds gid " << gid << " to state " << state << " "
1500 << ceph_mds_state_name(state);
1501 return 0;
1502 }
1503 } else if (prefix == "mds fail") {
1504 string who;
1505 cmd_getval(g_ceph_context, cmdmap, "who", who);
1506
1507 MDSMap::mds_info_t failed_info;
1508 r = fail_mds(ss, who, &failed_info);
1509 if (r < 0 && r == -EAGAIN) {
1510 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1511 return -EAGAIN; // don't propose yet; wait for message to be retried
1512 } else if (r == 0) {
1513 // Only log if we really did something (not when was already gone)
1514 if (failed_info.global_id != MDS_GID_NONE) {
1515 mon->clog->info() << failed_info.human_name() << " marked failed by "
1516 << op->get_session()->entity_name;
1517 }
1518 }
1519 } else if (prefix == "mds rm") {
1520 mds_gid_t gid;
1521 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1522 ss << "error parsing 'gid' value '"
1523 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1524 return -EINVAL;
1525 }
1526 if (!pending.gid_exists(gid)) {
1527 ss << "mds gid " << gid << " dne";
1528 r = 0;
1529 } else {
1530 const auto &info = pending.get_info_gid(gid);
1531 MDSMap::DaemonState state = info.state;
1532 if (state > 0) {
1533 ss << "cannot remove active mds." << info.name
1534 << " rank " << info.rank;
1535 return -EBUSY;
1536 } else {
1537 pending.erase(gid, {});
1538 ss << "removed mds gid " << gid;
1539 return 0;
1540 }
1541 }
1542 } else if (prefix == "mds rmfailed") {
1543 string confirm;
1544 if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
1545 confirm != "--yes-i-really-mean-it") {
1546 ss << "WARNING: this can make your filesystem inaccessible! "
1547 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1548 return -EPERM;
1549 }
1550
1551 std::string role_str;
1552 cmd_getval(g_ceph_context, cmdmap, "who", role_str);
1553 mds_role_t role;
1554 int r = parse_role(role_str, &role, ss);
1555 if (r < 0) {
1556 ss << "invalid role '" << role_str << "'";
1557 return -EINVAL;
1558 }
1559
1560 pending.modify_filesystem(
1561 role.fscid,
1562 [role](std::shared_ptr<Filesystem> fs)
1563 {
1564 fs->mds_map.failed.erase(role.rank);
1565 });
1566
1567 ss << "removed failed mds." << role;
1568 return 0;
1569 } else if (prefix == "mds compat rm_compat") {
1570 int64_t f;
1571 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1572 ss << "error parsing feature value '"
1573 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1574 return -EINVAL;
1575 }
1576 if (pending.compat.compat.contains(f)) {
1577 ss << "removing compat feature " << f;
1578 CompatSet modified = pending.compat;
1579 modified.compat.remove(f);
1580 pending.update_compat(modified);
1581 } else {
1582 ss << "compat feature " << f << " not present in " << pending.compat;
1583 }
1584 r = 0;
1585 } else if (prefix == "mds compat rm_incompat") {
1586 int64_t f;
1587 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1588 ss << "error parsing feature value '"
1589 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1590 return -EINVAL;
1591 }
1592 if (pending.compat.incompat.contains(f)) {
1593 ss << "removing incompat feature " << f;
1594 CompatSet modified = pending.compat;
1595 modified.incompat.remove(f);
1596 pending.update_compat(modified);
1597 } else {
1598 ss << "incompat feature " << f << " not present in " << pending.compat;
1599 }
1600 r = 0;
1601 } else if (prefix == "mds repaired") {
1602 std::string role_str;
1603 cmd_getval(g_ceph_context, cmdmap, "rank", role_str);
1604 mds_role_t role;
1605 r = parse_role(role_str, &role, ss);
1606 if (r < 0) {
1607 return r;
1608 }
1609
1610 bool modified = pending.undamaged(role.fscid, role.rank);
1611 if (modified) {
1612 dout(4) << "repaired: restoring rank " << role << dendl;
1613 } else {
1614 dout(4) << "repaired: no-op on rank " << role << dendl;
1615 }
1616
1617 r = 0;
1618 } else {
1619 return -ENOSYS;
1620 }
1621
1622 return r;
1623 }
1624
1625 /**
1626 * Helper to legacy_filesystem_command
1627 */
1628 void MDSMonitor::modify_legacy_filesystem(
1629 std::function<void(std::shared_ptr<Filesystem> )> fn)
1630 {
1631 auto &pending_fsmap = get_pending_fsmap_writeable();
1632 pending_fsmap.modify_filesystem(
1633 pending_fsmap.legacy_client_fscid,
1634 fn
1635 );
1636 }
1637
1638
1639
1640 /**
1641 * Handle a command that affects the filesystem (i.e. a filesystem
1642 * must exist for the command to act upon).
1643 *
1644 * @retval 0 Command was successfully handled and has side effects
1645 * @retval -EAGAIN Messages has been requeued for retry
1646 * @retval -ENOSYS Unknown command
1647 * @retval < 0 An error has occurred; **ss** may have been set.
1648 */
1649 int MDSMonitor::legacy_filesystem_command(
1650 MonOpRequestRef op,
1651 std::string const &prefix,
1652 map<string, cmd_vartype> &cmdmap,
1653 std::stringstream &ss)
1654 {
1655 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1656 op->mark_mdsmon_event(__func__);
1657 int r = 0;
1658 string whostr;
1659 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1660
1661 auto &pending_fsmap = get_pending_fsmap_writeable();
1662
1663 assert (pending_fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
1664
1665 if (prefix == "mds set_max_mds") {
1666 // NOTE: deprecated by "fs set max_mds"
1667 int64_t maxmds;
1668 if (!cmd_getval(g_ceph_context, cmdmap, "maxmds", maxmds) || maxmds <= 0) {
1669 return -EINVAL;
1670 }
1671
1672 const MDSMap& mdsmap =
1673 pending_fsmap.filesystems.at(pending_fsmap.legacy_client_fscid)->mds_map;
1674
1675 if (!mdsmap.allows_multimds() &&
1676 maxmds > mdsmap.get_max_mds() &&
1677 maxmds > 1) {
1678 ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
1679 return -EINVAL;
1680 }
1681
1682 if (maxmds > MAX_MDS) {
1683 ss << "may not have more than " << MAX_MDS << " MDS ranks";
1684 return -EINVAL;
1685 }
1686
1687 modify_legacy_filesystem(
1688 [maxmds](std::shared_ptr<Filesystem> fs)
1689 {
1690 fs->mds_map.set_max_mds(maxmds);
1691 });
1692
1693 r = 0;
1694 ss << "max_mds = " << maxmds;
1695 } else if (prefix == "mds cluster_down") {
1696 // NOTE: deprecated by "fs set cluster_down"
1697 modify_legacy_filesystem(
1698 [](std::shared_ptr<Filesystem> fs)
1699 {
1700 fs->mds_map.set_flag(CEPH_MDSMAP_DOWN);
1701 });
1702 ss << "marked fsmap DOWN";
1703 r = 0;
1704 } else if (prefix == "mds cluster_up") {
1705 // NOTE: deprecated by "fs set cluster_up"
1706 modify_legacy_filesystem(
1707 [](std::shared_ptr<Filesystem> fs)
1708 {
1709 fs->mds_map.clear_flag(CEPH_MDSMAP_DOWN);
1710 });
1711 ss << "unmarked fsmap DOWN";
1712 r = 0;
1713 } else {
1714 return -ENOSYS;
1715 }
1716
1717 return r;
1718 }
1719
1720
1721 void MDSMonitor::check_subs()
1722 {
1723 std::list<std::string> types;
1724
1725 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1726 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1727 // filesystems. Build a list of all the types we service
1728 // subscriptions for.
1729 types.push_back("fsmap");
1730 types.push_back("fsmap.user");
1731 types.push_back("mdsmap");
1732 for (const auto &p : get_fsmap().filesystems) {
1733 const auto &fscid = p.first;
1734 std::ostringstream oss;
1735 oss << "mdsmap." << fscid;
1736 types.push_back(oss.str());
1737 }
1738
1739 for (const auto &type : types) {
1740 if (mon->session_map.subs.count(type) == 0)
1741 continue;
1742 xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
1743 while (!p.end()) {
1744 Subscription *sub = *p;
1745 ++p;
1746 check_sub(sub);
1747 }
1748 }
1749 }
1750
1751
1752 void MDSMonitor::check_sub(Subscription *sub)
1753 {
1754 dout(20) << __func__ << ": " << sub->type << dendl;
1755
1756 const auto &fsmap = get_fsmap();
1757
1758 if (sub->type == "fsmap") {
1759 if (sub->next <= fsmap.get_epoch()) {
1760 sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
1761 if (sub->onetime) {
1762 mon->session_map.remove_sub(sub);
1763 } else {
1764 sub->next = fsmap.get_epoch() + 1;
1765 }
1766 }
1767 } else if (sub->type == "fsmap.user") {
1768 if (sub->next <= fsmap.get_epoch()) {
1769 FSMapUser fsmap_u;
1770 fsmap_u.epoch = fsmap.get_epoch();
1771 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1772 for (const auto &p : fsmap.filesystems) {
1773 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p.second->fscid];
1774 fs_info.cid = p.second->fscid;
1775 fs_info.name = p.second->mds_map.fs_name;
1776 }
1777 sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
1778 if (sub->onetime) {
1779 mon->session_map.remove_sub(sub);
1780 } else {
1781 sub->next = fsmap.get_epoch() + 1;
1782 }
1783 }
1784 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
1785 if (sub->next > fsmap.get_epoch()) {
1786 return;
1787 }
1788
1789 const bool is_mds = sub->session->inst.name.is_mds();
1790 mds_gid_t mds_gid = MDS_GID_NONE;
1791 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1792 if (is_mds) {
1793 // What (if any) namespace are you assigned to?
1794 auto mds_info = fsmap.get_mds_info();
1795 for (const auto &i : mds_info) {
1796 if (i.second.addr == sub->session->inst.addr) {
1797 mds_gid = i.first;
1798 fscid = fsmap.mds_roles.at(mds_gid);
1799 }
1800 }
1801 } else {
1802 // You're a client. Did you request a particular
1803 // namespace?
1804 if (sub->type.find("mdsmap.") == 0) {
1805 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1806 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1807 std::string err;
1808 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1809 if (!err.empty()) {
1810 // Client asked for a non-existent namespace, send them nothing
1811 dout(1) << "Invalid client subscription '" << sub->type
1812 << "'" << dendl;
1813 return;
1814 }
1815 if (fsmap.filesystems.count(fscid) == 0) {
1816 // Client asked for a non-existent namespace, send them nothing
1817 // TODO: something more graceful for when a client has a filesystem
1818 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1819 // flag to MMDSMap?
1820 dout(1) << "Client subscribed to non-existent namespace '" <<
1821 fscid << "'" << dendl;
1822 return;
1823 }
1824 } else {
1825 // Unqualified request for "mdsmap": give it the one marked
1826 // for use by legacy clients.
1827 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1828 fscid = fsmap.legacy_client_fscid;
1829 } else {
1830 dout(1) << "Client subscribed for legacy filesystem but "
1831 "none is configured" << dendl;
1832 return;
1833 }
1834 }
1835 }
1836 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
1837
1838 // Work out the effective latest epoch
1839 const MDSMap *mds_map = nullptr;
1840 MDSMap null_map;
1841 null_map.compat = fsmap.compat;
1842 if (fscid == FS_CLUSTER_ID_NONE) {
1843 // For a client, we should have already dropped out
1844 assert(is_mds);
1845
1846 auto it = fsmap.standby_daemons.find(mds_gid);
1847 if (it != fsmap.standby_daemons.end()) {
1848 // For an MDS, we need to feed it an MDSMap with its own state in
1849 null_map.mds_info[mds_gid] = it->second;
1850 null_map.epoch = fsmap.standby_epochs.at(mds_gid);
1851 } else {
1852 null_map.epoch = fsmap.epoch;
1853 }
1854 mds_map = &null_map;
1855 } else {
1856 // Check the effective epoch
1857 mds_map = &fsmap.get_filesystem(fscid)->mds_map;
1858 }
1859
1860 assert(mds_map != nullptr);
1861 dout(10) << __func__ << " selected MDS map epoch " <<
1862 mds_map->epoch << " for namespace " << fscid << " for subscriber "
1863 << sub->session->inst.name << " who wants epoch " << sub->next << dendl;
1864
1865 if (sub->next > mds_map->epoch) {
1866 return;
1867 }
1868 auto msg = new MMDSMap(mon->monmap->fsid, mds_map);
1869
1870 sub->session->con->send_message(msg);
1871 if (sub->onetime) {
1872 mon->session_map.remove_sub(sub);
1873 } else {
1874 sub->next = mds_map->get_epoch() + 1;
1875 }
1876 }
1877 }
1878
1879
1880 void MDSMonitor::update_metadata(mds_gid_t gid,
1881 const map<string, string>& metadata)
1882 {
1883 if (metadata.empty()) {
1884 return;
1885 }
1886 pending_metadata[gid] = metadata;
1887
1888 MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
1889 bufferlist bl;
1890 ::encode(pending_metadata, bl);
1891 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1892 paxos->trigger_propose();
1893 }
1894
1895 void MDSMonitor::remove_from_metadata(MonitorDBStore::TransactionRef t)
1896 {
1897 bool update = false;
1898 for (map<mds_gid_t, Metadata>::iterator i = pending_metadata.begin();
1899 i != pending_metadata.end(); ) {
1900 if (!get_pending_fsmap().gid_exists(i->first)) {
1901 pending_metadata.erase(i++);
1902 update = true;
1903 } else {
1904 ++i;
1905 }
1906 }
1907 if (!update)
1908 return;
1909 bufferlist bl;
1910 ::encode(pending_metadata, bl);
1911 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1912 }
1913
1914 int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1915 {
1916 bufferlist bl;
1917 int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1918 if (r) {
1919 dout(1) << "Unable to load 'last_metadata'" << dendl;
1920 return r;
1921 }
1922
1923 bufferlist::iterator it = bl.begin();
1924 ::decode(m, it);
1925 return 0;
1926 }
1927
1928 void MDSMonitor::count_metadata(const string& field, map<string,int> *out)
1929 {
1930 map<mds_gid_t,Metadata> meta;
1931 load_metadata(meta);
1932 for (auto& p : meta) {
1933 auto q = p.second.find(field);
1934 if (q == p.second.end()) {
1935 (*out)["unknown"]++;
1936 } else {
1937 (*out)[q->second]++;
1938 }
1939 }
1940 }
1941
1942 void MDSMonitor::count_metadata(const string& field, Formatter *f)
1943 {
1944 map<string,int> by_val;
1945 count_metadata(field, &by_val);
1946 f->open_object_section(field.c_str());
1947 for (auto& p : by_val) {
1948 f->dump_int(p.first.c_str(), p.second);
1949 }
1950 f->close_section();
1951 }
1952
1953 int MDSMonitor::dump_metadata(const std::string &who, Formatter *f, ostream& err)
1954 {
1955 assert(f);
1956
1957 mds_gid_t gid = gid_from_arg(who, err);
1958 if (gid == MDS_GID_NONE) {
1959 return -EINVAL;
1960 }
1961
1962 map<mds_gid_t, Metadata> metadata;
1963 if (int r = load_metadata(metadata)) {
1964 err << "Unable to load 'last_metadata'";
1965 return r;
1966 }
1967
1968 if (!metadata.count(gid)) {
1969 return -ENOENT;
1970 }
1971 const Metadata& m = metadata[gid];
1972 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1973 f->dump_string(p->first.c_str(), p->second);
1974 }
1975 return 0;
1976 }
1977
1978 int MDSMonitor::print_nodes(Formatter *f)
1979 {
1980 assert(f);
1981
1982 map<mds_gid_t, Metadata> metadata;
1983 if (int r = load_metadata(metadata)) {
1984 return r;
1985 }
1986
1987 map<string, list<int> > mdses; // hostname => rank
1988 for (map<mds_gid_t, Metadata>::iterator it = metadata.begin();
1989 it != metadata.end(); ++it) {
1990 const Metadata& m = it->second;
1991 Metadata::const_iterator hostname = m.find("hostname");
1992 if (hostname == m.end()) {
1993 // not likely though
1994 continue;
1995 }
1996 const mds_gid_t gid = it->first;
1997 if (!get_fsmap().gid_exists(gid)) {
1998 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1999 continue;
2000 }
2001 const MDSMap::mds_info_t& mds_info = get_fsmap().get_info_gid(gid);
2002 // FIXME: include filesystem name with rank here
2003 mdses[hostname->second].push_back(mds_info.rank);
2004 }
2005
2006 dump_services(f, mdses, "mds");
2007 return 0;
2008 }
2009
2010 /**
2011 * If a cluster is undersized (with respect to max_mds), then
2012 * attempt to find daemons to grow it.
2013 */
2014 bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> &fs)
2015 {
2016 bool do_propose = false;
2017 auto &pending = get_pending_fsmap_writeable();
2018
2019 if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
2020 return do_propose;
2021 }
2022
2023 while (fs->mds_map.get_num_in_mds() < size_t(fs->mds_map.get_max_mds()) &&
2024 !fs->mds_map.is_degraded()) {
2025 mds_rank_t mds = mds_rank_t(0);
2026 string name;
2027 while (fs->mds_map.is_in(mds)) {
2028 mds++;
2029 }
2030 mds_gid_t newgid = pending.find_replacement_for({fs->fscid, mds},
2031 name, g_conf->mon_force_standby_active);
2032 if (newgid == MDS_GID_NONE) {
2033 break;
2034 }
2035
2036 const auto &new_info = pending.get_info_gid(newgid);
2037 dout(1) << "assigned standby " << new_info.addr
2038 << " as mds." << mds << dendl;
2039
2040 mon->clog->info() << new_info.human_name() << " assigned to "
2041 "filesystem " << fs->mds_map.fs_name << " as rank "
2042 << mds << " (now has " << fs->mds_map.get_num_in_mds() + 1
2043 << " ranks)";
2044 pending.promote(newgid, fs, mds);
2045 do_propose = true;
2046 }
2047
2048 return do_propose;
2049 }
2050
2051
2052 /**
2053 * If a daemon is laggy, and a suitable replacement
2054 * is available, fail this daemon (remove from map) and pass its
2055 * role to another daemon.
2056 */
2057 void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
2058 bool *mds_propose, bool *osd_propose)
2059 {
2060 assert(mds_propose != nullptr);
2061 assert(osd_propose != nullptr);
2062
2063 auto &pending = get_pending_fsmap_writeable();
2064 const auto fscid = pending.mds_roles.at(gid);
2065
2066 // We will only take decisive action (replacing/removing a daemon)
2067 // if we have some indicating that some other daemon(s) are successfully
2068 // getting beacons through recently.
2069 utime_t latest_beacon;
2070 for (const auto & i : last_beacon) {
2071 latest_beacon = MAX(i.second.stamp, latest_beacon);
2072 }
2073 const bool may_replace = latest_beacon >
2074 (ceph_clock_now() -
2075 MAX(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5));
2076
2077 // are we in?
2078 // and is there a non-laggy standby that can take over for us?
2079 mds_gid_t sgid;
2080 if (info.rank >= 0 &&
2081 info.state != MDSMap::STATE_STANDBY &&
2082 info.state != MDSMap::STATE_STANDBY_REPLAY &&
2083 may_replace &&
2084 !pending.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
2085 (sgid = pending.find_replacement_for({fscid, info.rank}, info.name,
2086 g_conf->mon_force_standby_active)) != MDS_GID_NONE)
2087 {
2088
2089 MDSMap::mds_info_t si = pending.get_info_gid(sgid);
2090 dout(10) << " replacing " << gid << " " << info.addr << " mds."
2091 << info.rank << "." << info.inc
2092 << " " << ceph_mds_state_name(info.state)
2093 << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
2094
2095 mon->clog->warn() << info.human_name()
2096 << " is not responding, replacing it "
2097 << "as rank " << info.rank
2098 << " with standby " << si.human_name();
2099
2100 // Remember what NS the old one was in
2101 const fs_cluster_id_t fscid = pending.mds_roles.at(gid);
2102
2103 // Remove the old one
2104 *osd_propose |= fail_mds_gid(gid);
2105
2106 // Promote the replacement
2107 auto fs = pending.filesystems.at(fscid);
2108 pending.promote(sgid, fs, info.rank);
2109
2110 *mds_propose = true;
2111 } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
2112 info.state == MDSMap::STATE_STANDBY) && may_replace) {
2113 dout(10) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
2114 << "." << info.inc << " " << ceph_mds_state_name(info.state)
2115 << dendl;
2116 mon->clog->info() << "Standby " << info.human_name() << " is not "
2117 "responding, dropping it";
2118 fail_mds_gid(gid);
2119 *mds_propose = true;
2120 } else if (!info.laggy()) {
2121 dout(10) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
2122 << " " << ceph_mds_state_name(info.state)
2123 << " laggy" << dendl;
2124 pending.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
2125 info->laggy_since = ceph_clock_now();
2126 });
2127 *mds_propose = true;
2128 }
2129 }
2130
2131 bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> &fs)
2132 {
2133 assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
2134
2135 auto &pending = get_pending_fsmap_writeable();
2136
2137 bool do_propose = false;
2138
2139 // have a standby take over?
2140 set<mds_rank_t> failed;
2141 fs->mds_map.get_failed_mds_set(failed);
2142 if (!failed.empty()) {
2143 set<mds_rank_t>::iterator p = failed.begin();
2144 while (p != failed.end()) {
2145 mds_rank_t f = *p++;
2146 mds_gid_t sgid = pending.find_replacement_for({fs->fscid, f}, {},
2147 g_conf->mon_force_standby_active);
2148 if (sgid) {
2149 const MDSMap::mds_info_t si = pending.get_info_gid(sgid);
2150 dout(0) << " taking over failed mds." << f << " with " << sgid
2151 << "/" << si.name << " " << si.addr << dendl;
2152 mon->clog->info() << "Standby " << si.human_name()
2153 << " assigned to filesystem " << fs->mds_map.fs_name
2154 << " as rank " << f;
2155
2156 pending.promote(sgid, fs, f);
2157 do_propose = true;
2158 }
2159 }
2160 } else {
2161 // There were no failures to replace, so try using any available standbys
2162 // as standby-replay daemons.
2163
2164 // Take a copy of the standby GIDs so that we can iterate over
2165 // them while perhaps-modifying standby_daemons during the loop
2166 // (if we promote anyone they are removed from standby_daemons)
2167 std::vector<mds_gid_t> standby_gids;
2168 for (const auto &j : pending.standby_daemons) {
2169 standby_gids.push_back(j.first);
2170 }
2171
2172 for (const auto &gid : standby_gids) {
2173 const auto &info = pending.standby_daemons.at(gid);
2174 assert(info.state == MDSMap::STATE_STANDBY);
2175
2176 if (!info.standby_replay) {
2177 continue;
2178 }
2179
2180 /*
2181 * This mds is standby but has no rank assigned.
2182 * See if we can find it somebody to shadow
2183 */
2184 dout(20) << "gid " << gid << " is standby and following nobody" << dendl;
2185
2186 // standby for someone specific?
2187 if (info.standby_for_rank >= 0) {
2188 // The mds_info_t may or may not tell us exactly which filesystem
2189 // the standby_for_rank refers to: lookup via legacy_client_fscid
2190 mds_role_t target_role = {
2191 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
2192 pending.legacy_client_fscid : info.standby_for_fscid,
2193 info.standby_for_rank};
2194
2195 // It is possible that the map contains a standby_for_fscid
2196 // that doesn't correspond to an existing filesystem, especially
2197 // if we loaded from a version with a bug (#17466)
2198 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
2199 && !pending.filesystem_exists(info.standby_for_fscid)) {
2200 derr << "gid " << gid << " has invalid standby_for_fscid "
2201 << info.standby_for_fscid << dendl;
2202 continue;
2203 }
2204
2205 // If we managed to resolve a full target role
2206 if (target_role.fscid != FS_CLUSTER_ID_NONE) {
2207 const auto &fs = pending.get_filesystem(target_role.fscid);
2208 if (fs->mds_map.is_followable(target_role.rank)) {
2209 do_propose |= try_standby_replay(
2210 info,
2211 *fs,
2212 fs->mds_map.get_info(target_role.rank));
2213 }
2214 }
2215
2216 continue;
2217 }
2218
2219 // check everyone
2220 for (const auto &p : pending.filesystems) {
2221 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE &&
2222 info.standby_for_fscid != p.first)
2223 continue;
2224
2225 bool assigned = false;
2226 const auto &fs = p.second;
2227 const MDSMap &mds_map = fs->mds_map;
2228 for (const auto &mds_i : mds_map.mds_info) {
2229 const MDSMap::mds_info_t &cand_info = mds_i.second;
2230 if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
2231 if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
2232 info.standby_for_rank != MDS_RANK_NONE) {
2233 continue; // we're supposed to follow someone else
2234 }
2235
2236 if (try_standby_replay(info, *fs, cand_info)) {
2237 assigned = true;
2238 break;
2239 }
2240 }
2241 }
2242 if (assigned) {
2243 do_propose = true;
2244 break;
2245 }
2246 }
2247 }
2248 }
2249
2250 return do_propose;
2251 }
2252
2253 void MDSMonitor::tick()
2254 {
2255 // make sure mds's are still alive
2256 // ...if i am an active leader
2257
2258 if (!is_active()) return;
2259
2260 dout(10) << get_working_fsmap() << dendl;
2261
2262 if (!is_leader()) return;
2263
2264 auto &pending = get_pending_fsmap_writeable();
2265
2266 bool do_propose = false;
2267
2268 do_propose |= pending.check_health();
2269
2270 // expand mds cluster (add new nodes to @in)?
2271 for (auto &p : pending.filesystems) {
2272 do_propose |= maybe_expand_cluster(p.second);
2273 }
2274
2275 const auto now = ceph_clock_now();
2276 if (last_tick.is_zero()) {
2277 last_tick = now;
2278 }
2279
2280 if (now - last_tick > (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
2281 // This case handles either local slowness (calls being delayed
2282 // for whatever reason) or cluster election slowness (a long gap
2283 // between calls while an election happened)
2284 dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
2285 "(slow election?) of " << now - last_tick << " seconds" << dendl;
2286 for (auto &i : last_beacon) {
2287 i.second.stamp = now;
2288 }
2289 }
2290
2291 last_tick = now;
2292
2293 // check beacon timestamps
2294 utime_t cutoff = now;
2295 cutoff -= g_conf->mds_beacon_grace;
2296
2297 // make sure last_beacon is fully populated
2298 for (auto &p : pending.mds_roles) {
2299 auto &gid = p.first;
2300 if (last_beacon.count(gid) == 0) {
2301 last_beacon[gid].stamp = now;
2302 last_beacon[gid].seq = 0;
2303 }
2304 }
2305
2306 bool propose_osdmap = false;
2307 bool osdmap_writeable = mon->osdmon()->is_writeable();
2308 auto p = last_beacon.begin();
2309 while (p != last_beacon.end()) {
2310 mds_gid_t gid = p->first;
2311 auto beacon_info = p->second;
2312 ++p;
2313
2314 if (!pending.gid_exists(gid)) {
2315 // clean it out
2316 last_beacon.erase(gid);
2317 continue;
2318 }
2319
2320 if (beacon_info.stamp < cutoff) {
2321 auto &info = pending.get_info_gid(gid);
2322 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2323 << " (gid: " << gid << " addr: " << info.addr
2324 << " state: " << ceph_mds_state_name(info.state) << ")"
2325 << " since " << beacon_info.stamp << dendl;
2326 // If the OSDMap is writeable, we can blacklist things, so we can
2327 // try failing any laggy MDS daemons. Consider each one for failure.
2328 if (osdmap_writeable) {
2329 maybe_replace_gid(gid, info, &do_propose, &propose_osdmap);
2330 }
2331 }
2332 }
2333 if (propose_osdmap) {
2334 request_proposal(mon->osdmon());
2335 }
2336
2337 for (auto &p : pending.filesystems) {
2338 auto &fs = p.second;
2339 if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
2340 do_propose |= maybe_promote_standby(fs);
2341 }
2342 }
2343
2344 if (do_propose) {
2345 propose_pending();
2346 }
2347 }
2348
2349 /**
2350 * finfo: the would-be follower
2351 * leader_fs: the Filesystem containing the would-be leader
2352 * ainfo: the would-be leader
2353 */
2354 bool MDSMonitor::try_standby_replay(
2355 const MDSMap::mds_info_t& finfo,
2356 const Filesystem &leader_fs,
2357 const MDSMap::mds_info_t& ainfo)
2358 {
2359 // someone else already following?
2360 if (leader_fs.has_standby_replay(ainfo.global_id)) {
2361 dout(20) << " mds." << ainfo.rank << " already has a follower" << dendl;
2362 return false;
2363 } else {
2364 // Assign the new role to the standby
2365 dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
2366 get_pending_fsmap_writeable().assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
2367 return true;
2368 }
2369 }
2370
2371 MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
2372 : PaxosService(mn, p, service_name)
2373 {
2374 handlers = FileSystemCommandHandler::load(p);
2375 }
2376
2377 void MDSMonitor::on_restart()
2378 {
2379 // Clear out the leader-specific state.
2380 last_tick = utime_t();
2381 last_beacon.clear();
2382 }
2383