]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MDSMonitor.cc
update sources to v12.1.3
[ceph.git] / ceph / src / mon / MDSMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <sstream>
16 #include <boost/utility.hpp>
17 #include <boost/regex.hpp>
18
19 #include "MDSMonitor.h"
20 #include "FSCommands.h"
21 #include "Monitor.h"
22 #include "MonitorDBStore.h"
23 #include "OSDMonitor.h"
24 #include "PGMonitor.h"
25
26 #include "common/strtol.h"
27 #include "common/perf_counters.h"
28 #include "common/config.h"
29 #include "common/cmdparse.h"
30 #include "messages/MMDSMap.h"
31 #include "messages/MFSMap.h"
32 #include "messages/MFSMapUser.h"
33 #include "messages/MMDSLoadTargets.h"
34 #include "messages/MMonCommand.h"
35 #include "messages/MGenericMessage.h"
36
37 #include "include/assert.h"
38 #include "include/str_list.h"
39 #include "include/stringify.h"
40 #include "mds/mdstypes.h"
41 #include "Session.h"
42
43 #define dout_subsys ceph_subsys_mon
44 #undef dout_prefix
45 #define dout_prefix _prefix(_dout, mon, fsmap)
46 static ostream& _prefix(std::ostream *_dout, Monitor *mon, FSMap const& fsmap) {
47 return *_dout << "mon." << mon->name << "@" << mon->rank
48 << "(" << mon->get_state_name()
49 << ").mds e" << fsmap.get_epoch() << " ";
50 }
51
52 /*
53 * Specialized implementation of cmd_getval to allow us to parse
54 * out strongly-typedef'd types
55 */
56 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
57 const std::string& k, mds_gid_t &val)
58 {
59 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
60 }
61
62 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
63 const std::string& k, mds_rank_t &val)
64 {
65 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
66 }
67
68 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
69 const std::string& k, MDSMap::DaemonState &val)
70 {
71 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
72 }
73
74 static const string MDS_METADATA_PREFIX("mds_metadata");
75
76
77 // my methods
78
79 void MDSMonitor::print_map(FSMap &m, int dbl)
80 {
81 dout(dbl) << "print_map\n";
82 m.print(*_dout);
83 *_dout << dendl;
84 }
85
86 // service methods
87 void MDSMonitor::create_initial()
88 {
89 dout(10) << "create_initial" << dendl;
90 }
91
92
93 void MDSMonitor::update_from_paxos(bool *need_bootstrap)
94 {
95 version_t version = get_last_committed();
96 if (version == fsmap.epoch)
97 return;
98
99 dout(10) << __func__ << " version " << version
100 << ", my e " << fsmap.epoch << dendl;
101 assert(version > fsmap.epoch);
102
103 load_health();
104
105 // read and decode
106 bufferlist fsmap_bl;
107 fsmap_bl.clear();
108 int err = get_version(version, fsmap_bl);
109 assert(err == 0);
110
111 assert(fsmap_bl.length() > 0);
112 dout(10) << __func__ << " got " << version << dendl;
113 fsmap.decode(fsmap_bl);
114
115 // new map
116 dout(4) << "new map" << dendl;
117 print_map(fsmap, 0);
118 if (!g_conf->mon_mds_skip_sanity) {
119 fsmap.sanity();
120 }
121
122 check_subs();
123 update_logger();
124 }
125
126 void MDSMonitor::init()
127 {
128 (void)load_metadata(pending_metadata);
129 }
130
131 void MDSMonitor::create_pending()
132 {
133 pending_fsmap = fsmap;
134 pending_fsmap.epoch++;
135
136 dout(10) << "create_pending e" << pending_fsmap.epoch << dendl;
137 }
138
139 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
140 {
141 dout(10) << "encode_pending e" << pending_fsmap.epoch << dendl;
142
143
144 // print map iff 'debug mon = 30' or higher
145 print_map(pending_fsmap, 30);
146 if (!g_conf->mon_mds_skip_sanity) {
147 pending_fsmap.sanity();
148 }
149
150 // Set 'modified' on maps modified this epoch
151 for (auto &i : fsmap.filesystems) {
152 if (i.second->mds_map.epoch == fsmap.epoch) {
153 i.second->mds_map.modified = ceph_clock_now();
154 }
155 }
156
157 // apply to paxos
158 assert(get_last_committed() + 1 == pending_fsmap.epoch);
159 bufferlist fsmap_bl;
160 pending_fsmap.encode(fsmap_bl, mon->get_quorum_con_features());
161
162 /* put everything in the transaction */
163 put_version(t, pending_fsmap.epoch, fsmap_bl);
164 put_last_committed(t, pending_fsmap.epoch);
165
166 // Encode MDSHealth data
167 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
168 i != pending_daemon_health.end(); ++i) {
169 bufferlist bl;
170 i->second.encode(bl);
171 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
172 }
173
174 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
175 i != pending_daemon_health_rm.end(); ++i) {
176 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
177 }
178 pending_daemon_health_rm.clear();
179 remove_from_metadata(t);
180
181 // health
182 health_check_map_t new_checks;
183 const auto info_map = pending_fsmap.get_mds_info();
184 for (const auto &i : info_map) {
185 const auto &gid = i.first;
186 const auto &info = i.second;
187 if (pending_daemon_health_rm.count(gid)) {
188 continue;
189 }
190 MDSHealth health;
191 auto p = pending_daemon_health.find(gid);
192 if (p != pending_daemon_health.end()) {
193 health = p->second;
194 } else {
195 bufferlist bl;
196 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
197 if (!bl.length()) {
198 derr << "Missing health data for MDS " << gid << dendl;
199 continue;
200 }
201 bufferlist::iterator bl_i = bl.begin();
202 health.decode(bl_i);
203 }
204 for (const auto &metric : health.metrics) {
205 const int rank = info.rank;
206 health_check_t *check = &new_checks.get_or_add(
207 mds_metric_name(metric.type),
208 metric.sev,
209 mds_metric_summary(metric.type));
210 ostringstream ss;
211 ss << "mds" << info.name << "(mds." << rank << "): " << metric.message;
212 for (auto p = metric.metadata.begin();
213 p != metric.metadata.end();
214 ++p) {
215 if (p != metric.metadata.begin()) {
216 ss << ", ";
217 }
218 ss << p->first << ": " << p->second;
219 }
220 check->detail.push_back(ss.str());
221 }
222 }
223 pending_fsmap.get_health_checks(&new_checks);
224 for (auto& p : new_checks.checks) {
225 p.second.summary = boost::regex_replace(
226 p.second.summary,
227 boost::regex("%num%"),
228 stringify(p.second.detail.size()));
229 p.second.summary = boost::regex_replace(
230 p.second.summary,
231 boost::regex("%plurals%"),
232 p.second.detail.size() > 1 ? "s" : "");
233 p.second.summary = boost::regex_replace(
234 p.second.summary,
235 boost::regex("%isorare%"),
236 p.second.detail.size() > 1 ? "are" : "is");
237 }
238 encode_health(new_checks, t);
239 }
240
241 version_t MDSMonitor::get_trim_to()
242 {
243 version_t floor = 0;
244 if (g_conf->mon_mds_force_trim_to > 0 &&
245 g_conf->mon_mds_force_trim_to < (int)get_last_committed()) {
246 floor = g_conf->mon_mds_force_trim_to;
247 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
248 << floor << dendl;
249 }
250
251 unsigned max = g_conf->mon_max_mdsmap_epochs;
252 version_t last = get_last_committed();
253
254 if (last - get_first_committed() > max && floor < last - max)
255 return last - max;
256 return floor;
257 }
258
259 void MDSMonitor::update_logger()
260 {
261 dout(10) << "update_logger" << dendl;
262
263 uint64_t up = 0;
264 uint64_t in = 0;
265 uint64_t failed = 0;
266 for (const auto &i : fsmap.filesystems) {
267 const MDSMap &mds_map = i.second->mds_map;
268
269 up += mds_map.get_num_up_mds();
270 in += mds_map.get_num_in_mds();
271 failed += mds_map.get_num_failed_mds();
272 }
273 mon->cluster_logger->set(l_cluster_num_mds_up, up);
274 mon->cluster_logger->set(l_cluster_num_mds_in, in);
275 mon->cluster_logger->set(l_cluster_num_mds_failed, failed);
276 mon->cluster_logger->set(l_cluster_mds_epoch, fsmap.get_epoch());
277 }
278
279 bool MDSMonitor::preprocess_query(MonOpRequestRef op)
280 {
281 op->mark_mdsmon_event(__func__);
282 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
283 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
284
285 switch (m->get_type()) {
286
287 case MSG_MDS_BEACON:
288 return preprocess_beacon(op);
289
290 case MSG_MON_COMMAND:
291 return preprocess_command(op);
292
293 case MSG_MDS_OFFLOAD_TARGETS:
294 return preprocess_offload_targets(op);
295
296 default:
297 ceph_abort();
298 return true;
299 }
300 }
301
302 void MDSMonitor::_note_beacon(MMDSBeacon *m)
303 {
304 mds_gid_t gid = mds_gid_t(m->get_global_id());
305 version_t seq = m->get_seq();
306
307 dout(15) << "_note_beacon " << *m << " noting time" << dendl;
308 last_beacon[gid].stamp = ceph_clock_now();
309 last_beacon[gid].seq = seq;
310 }
311
312 bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
313 {
314 op->mark_mdsmon_event(__func__);
315 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
316 MDSMap::DaemonState state = m->get_state();
317 mds_gid_t gid = m->get_global_id();
318 version_t seq = m->get_seq();
319 MDSMap::mds_info_t info;
320 epoch_t effective_epoch = 0;
321
322 // check privileges, ignore if fails
323 MonSession *session = m->get_session();
324 assert(session);
325 if (!session->is_capable("mds", MON_CAP_X)) {
326 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
327 << session->caps << dendl;
328 goto ignore;
329 }
330
331 if (m->get_fsid() != mon->monmap->fsid) {
332 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
333 goto ignore;
334 }
335
336 dout(12) << "preprocess_beacon " << *m
337 << " from " << m->get_orig_source_inst()
338 << " " << m->get_compat()
339 << dendl;
340
341 // make sure the address has a port
342 if (m->get_orig_source_addr().get_port() == 0) {
343 dout(1) << " ignoring boot message without a port" << dendl;
344 goto ignore;
345 }
346
347 // check compat
348 if (!m->get_compat().writeable(fsmap.compat)) {
349 dout(1) << " mds " << m->get_source_inst() << " can't write to fsmap " << fsmap.compat << dendl;
350 goto ignore;
351 }
352
353 // fw to leader?
354 if (!mon->is_leader())
355 return false;
356
357 // booted, but not in map?
358 if (!pending_fsmap.gid_exists(gid)) {
359 if (state != MDSMap::STATE_BOOT) {
360 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
361 << ceph_mds_state_name(state) << ")" << dendl;
362
363 MDSMap null_map;
364 null_map.epoch = fsmap.epoch;
365 null_map.compat = fsmap.compat;
366 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
367 return true;
368 } else {
369 return false; // not booted yet.
370 }
371 }
372 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
373 info = pending_fsmap.get_info_gid(gid);
374
375 // old seq?
376 if (info.state_seq > seq) {
377 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
378 goto ignore;
379 }
380
381 // Work out the latest epoch that this daemon should have seen
382 {
383 fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
384 if (fscid == FS_CLUSTER_ID_NONE) {
385 effective_epoch = pending_fsmap.standby_epochs.at(gid);
386 } else {
387 effective_epoch = pending_fsmap.get_filesystem(fscid)->mds_map.epoch;
388 }
389 if (effective_epoch != m->get_last_epoch_seen()) {
390 dout(10) << "mds_beacon " << *m
391 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
392 goto reply;
393 }
394 }
395
396 if (info.laggy()) {
397 _note_beacon(m);
398 return false; // no longer laggy, need to update map.
399 }
400 if (state == MDSMap::STATE_BOOT) {
401 // ignore, already booted.
402 goto ignore;
403 }
404 // is there a state change here?
405 if (info.state != state) {
406 // legal state change?
407 if ((info.state == MDSMap::STATE_STANDBY ||
408 info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
409 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
410 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
411 goto reply;
412 }
413
414 if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
415 && info.rank != MDS_RANK_NONE)
416 {
417 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
418 "held rank " << info.rank << " while requesting state "
419 << ceph_mds_state_name(state) << dendl;
420 goto reply;
421 }
422
423 _note_beacon(m);
424 return false;
425 }
426
427 // Comparing known daemon health with m->get_health()
428 // and return false (i.e. require proposal) if they
429 // do not match, to update our stored
430 if (!(pending_daemon_health[gid] == m->get_health())) {
431 dout(20) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
432 _note_beacon(m);
433 return false;
434 }
435
436 reply:
437 // note time and reply
438 assert(effective_epoch > 0);
439 _note_beacon(m);
440 mon->send_reply(op,
441 new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
442 effective_epoch, state, seq,
443 CEPH_FEATURES_SUPPORTED_DEFAULT));
444 return true;
445
446 ignore:
447 // I won't reply this beacon, drop it.
448 mon->no_reply(op);
449 return true;
450 }
451
452 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
453 {
454 op->mark_mdsmon_event(__func__);
455 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
456 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
457
458 // check privileges, ignore message if fails
459 MonSession *session = m->get_session();
460 if (!session)
461 goto done;
462 if (!session->is_capable("mds", MON_CAP_X)) {
463 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
464 << session->caps << dendl;
465 goto done;
466 }
467
468 if (fsmap.gid_exists(m->global_id) &&
469 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
470 goto done;
471
472 return false;
473
474 done:
475 return true;
476 }
477
478
479 bool MDSMonitor::prepare_update(MonOpRequestRef op)
480 {
481 op->mark_mdsmon_event(__func__);
482 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
483 dout(7) << "prepare_update " << *m << dendl;
484
485 switch (m->get_type()) {
486
487 case MSG_MDS_BEACON:
488 return prepare_beacon(op);
489
490 case MSG_MON_COMMAND:
491 return prepare_command(op);
492
493 case MSG_MDS_OFFLOAD_TARGETS:
494 return prepare_offload_targets(op);
495
496 default:
497 ceph_abort();
498 }
499
500 return true;
501 }
502
503 bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
504 {
505 op->mark_mdsmon_event(__func__);
506 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
507 // -- this is an update --
508 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source_inst() << dendl;
509 entity_addr_t addr = m->get_orig_source_inst().addr;
510 mds_gid_t gid = m->get_global_id();
511 MDSMap::DaemonState state = m->get_state();
512 version_t seq = m->get_seq();
513
514 dout(20) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
515
516 // Calculate deltas of health metrics created and removed
517 // Do this by type rather than MDSHealthMetric equality, because messages can
518 // change a lot when they include e.g. a number of items.
519 const auto &old_health = pending_daemon_health[gid].metrics;
520 const auto &new_health = m->get_health().metrics;
521
522 std::set<mds_metric_t> old_types;
523 for (const auto &i : old_health) {
524 old_types.insert(i.type);
525 }
526
527 std::set<mds_metric_t> new_types;
528 for (const auto &i : new_health) {
529 new_types.insert(i.type);
530 }
531
532 for (const auto &new_metric: new_health) {
533 if (old_types.count(new_metric.type) == 0) {
534 std::stringstream msg;
535 msg << "MDS health message (" << m->get_orig_source_inst().name << "): "
536 << new_metric.message;
537 if (new_metric.sev == HEALTH_ERR) {
538 mon->clog->error() << msg.str();
539 } else if (new_metric.sev == HEALTH_WARN) {
540 mon->clog->warn() << msg.str();
541 } else {
542 mon->clog->info() << msg.str();
543 }
544 }
545 }
546
547 // Log the disappearance of health messages at INFO
548 for (const auto &old_metric : old_health) {
549 if (new_types.count(old_metric.type) == 0) {
550 mon->clog->info() << "MDS health message cleared ("
551 << m->get_orig_source_inst().name << "): " << old_metric.message;
552 }
553 }
554
555 // Store health
556 pending_daemon_health[gid] = m->get_health();
557
558 // boot?
559 if (state == MDSMap::STATE_BOOT) {
560 // zap previous instance of this name?
561 if (g_conf->mds_enforce_unique_name) {
562 bool failed_mds = false;
563 while (mds_gid_t existing = pending_fsmap.find_mds_gid_by_name(m->get_name())) {
564 if (!mon->osdmon()->is_writeable()) {
565 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
566 return false;
567 }
568 const MDSMap::mds_info_t &existing_info =
569 pending_fsmap.get_info_gid(existing);
570 mon->clog->info() << existing_info.human_name() << " restarted";
571 fail_mds_gid(existing);
572 failed_mds = true;
573 }
574 if (failed_mds) {
575 assert(mon->osdmon()->is_writeable());
576 request_proposal(mon->osdmon());
577 }
578 }
579
580 // Add this daemon to the map
581 if (pending_fsmap.mds_roles.count(gid) == 0) {
582 MDSMap::mds_info_t new_info;
583 new_info.global_id = gid;
584 new_info.name = m->get_name();
585 new_info.addr = addr;
586 new_info.mds_features = m->get_mds_features();
587 new_info.state = MDSMap::STATE_STANDBY;
588 new_info.state_seq = seq;
589 new_info.standby_for_rank = m->get_standby_for_rank();
590 new_info.standby_for_name = m->get_standby_for_name();
591 new_info.standby_for_fscid = m->get_standby_for_fscid();
592 new_info.standby_replay = m->get_standby_replay();
593 pending_fsmap.insert(new_info);
594 }
595
596 // Resolve standby_for_name to a rank
597 const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
598 if (!info.standby_for_name.empty()) {
599 const MDSMap::mds_info_t *leaderinfo = fsmap.find_by_name(
600 info.standby_for_name);
601 if (leaderinfo && (leaderinfo->rank >= 0)) {
602 auto fscid = pending_fsmap.mds_roles.at(leaderinfo->global_id);
603 auto fs = pending_fsmap.get_filesystem(fscid);
604
605 pending_fsmap.modify_daemon(gid, [fscid, leaderinfo](
606 MDSMap::mds_info_t *info) {
607 info->standby_for_rank = leaderinfo->rank;
608 info->standby_for_fscid = fscid;
609 });
610 }
611 }
612
613 // initialize the beacon timer
614 last_beacon[gid].stamp = ceph_clock_now();
615 last_beacon[gid].seq = seq;
616
617 // new incompat?
618 if (!pending_fsmap.compat.writeable(m->get_compat())) {
619 dout(10) << " fsmap " << pending_fsmap.compat
620 << " can't write to new mds' " << m->get_compat()
621 << ", updating fsmap and killing old mds's"
622 << dendl;
623 pending_fsmap.update_compat(m->get_compat());
624 }
625
626 update_metadata(m->get_global_id(), m->get_sys_info());
627 } else {
628 // state update
629 const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
630 // Old MDS daemons don't mention that they're standby replay until
631 // after they've sent their boot beacon, so update this field.
632 if (info.standby_replay != m->get_standby_replay()) {
633 pending_fsmap.modify_daemon(info.global_id, [&m](
634 MDSMap::mds_info_t *i)
635 {
636 i->standby_replay = m->get_standby_replay();
637 });
638 }
639
640 if (info.state == MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPED ) {
641 // we can't transition to any other states from STOPPING
642 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
643 << dendl;
644 _note_beacon(m);
645 return true;
646 }
647
648 if (info.laggy()) {
649 dout(10) << "prepare_beacon clearing laggy flag on " << addr << dendl;
650 pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
651 {
652 info->clear_laggy();
653 }
654 );
655 }
656
657 dout(10) << "prepare_beacon mds." << info.rank
658 << " " << ceph_mds_state_name(info.state)
659 << " -> " << ceph_mds_state_name(state)
660 << " standby_for_rank=" << m->get_standby_for_rank()
661 << dendl;
662 if (state == MDSMap::STATE_STOPPED) {
663 const auto fscid = pending_fsmap.mds_roles.at(gid);
664 auto fs = pending_fsmap.get_filesystem(fscid);
665 mon->clog->info() << info.human_name() << " finished "
666 << "deactivating rank " << info.rank << " in filesystem "
667 << fs->mds_map.fs_name << " (now has "
668 << fs->mds_map.get_num_in_mds() << " ranks)";
669
670 auto erased = pending_fsmap.stop(gid);
671 erased.push_back(gid);
672
673 for (const auto &erased_gid : erased) {
674 last_beacon.erase(erased_gid);
675 if (pending_daemon_health.count(erased_gid)) {
676 pending_daemon_health.erase(erased_gid);
677 pending_daemon_health_rm.insert(erased_gid);
678 }
679 }
680
681
682 } else if (state == MDSMap::STATE_DAMAGED) {
683 if (!mon->osdmon()->is_writeable()) {
684 dout(4) << __func__ << ": DAMAGED from rank " << info.rank
685 << " waiting for osdmon writeable to blacklist it" << dendl;
686 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
687 return false;
688 }
689
690 // Record this MDS rank as damaged, so that other daemons
691 // won't try to run it.
692 dout(4) << __func__ << ": marking rank "
693 << info.rank << " damaged" << dendl;
694
695 utime_t until = ceph_clock_now();
696 until += g_conf->mds_blacklist_interval;
697 const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
698 request_proposal(mon->osdmon());
699 pending_fsmap.damaged(gid, blacklist_epoch);
700 last_beacon.erase(gid);
701
702 // Respond to MDS, so that it knows it can continue to shut down
703 mon->send_reply(op,
704 new MMDSBeacon(
705 mon->monmap->fsid, m->get_global_id(),
706 m->get_name(), fsmap.get_epoch(), state, seq,
707 CEPH_FEATURES_SUPPORTED_DEFAULT));
708 } else if (state == MDSMap::STATE_DNE) {
709 if (!mon->osdmon()->is_writeable()) {
710 dout(4) << __func__ << ": DNE from rank " << info.rank
711 << " waiting for osdmon writeable to blacklist it" << dendl;
712 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
713 return false;
714 }
715
716 fail_mds_gid(gid);
717 assert(mon->osdmon()->is_writeable());
718 request_proposal(mon->osdmon());
719
720 // Respond to MDS, so that it knows it can continue to shut down
721 mon->send_reply(op,
722 new MMDSBeacon(
723 mon->monmap->fsid, m->get_global_id(),
724 m->get_name(), fsmap.get_epoch(), state, seq,
725 CEPH_FEATURES_SUPPORTED_DEFAULT));
726 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
727 // Standby daemons should never modify their own
728 // state. Reject any attempts to do so.
729 derr << "standby " << gid << " attempted to change state to "
730 << ceph_mds_state_name(state) << ", rejecting" << dendl;
731 return true;
732 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
733 !MDSMap::state_transition_valid(info.state, state)) {
734 // Validate state transitions for daemons that hold a rank
735 derr << "daemon " << gid << " (rank " << info.rank << ") "
736 << "reported invalid state transition "
737 << ceph_mds_state_name(info.state) << " -> "
738 << ceph_mds_state_name(state) << dendl;
739 return true;
740 } else {
741 // Made it through special cases and validations, record the
742 // daemon's reported state to the FSMap.
743 pending_fsmap.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
744 info->state = state;
745 info->state_seq = seq;
746 });
747
748 if (state == MDSMap::STATE_ACTIVE) {
749 auto fscid = pending_fsmap.mds_roles.at(gid);
750 auto fs = pending_fsmap.get_filesystem(fscid);
751 mon->clog->info() << info.human_name() << " is now active in "
752 << "filesystem " << fs->mds_map.fs_name << " as rank "
753 << info.rank;
754 }
755 }
756 }
757
758 dout(7) << "prepare_beacon pending map now:" << dendl;
759 print_map(pending_fsmap);
760
761 wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
762 if (r >= 0)
763 _updated(op); // success
764 else if (r == -ECANCELED) {
765 mon->no_reply(op);
766 } else {
767 dispatch(op); // try again
768 }
769 }));
770
771 return true;
772 }
773
774 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
775 {
776 op->mark_mdsmon_event(__func__);
777 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
778 mds_gid_t gid = m->global_id;
779 if (pending_fsmap.gid_has_rank(gid)) {
780 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
781 pending_fsmap.update_export_targets(gid, m->targets);
782 } else {
783 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
784 }
785 return true;
786 }
787
788 bool MDSMonitor::should_propose(double& delay)
789 {
790 // delegate to PaxosService to assess whether we should propose
791 return PaxosService::should_propose(delay);
792 }
793
794 void MDSMonitor::_updated(MonOpRequestRef op)
795 {
796 op->mark_mdsmon_event(__func__);
797 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
798 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
799 mon->clog->debug() << m->get_orig_source_inst() << " "
800 << ceph_mds_state_name(m->get_state());
801
802 if (m->get_state() == MDSMap::STATE_STOPPED) {
803 // send the map manually (they're out of the map, so they won't get it automatic)
804 MDSMap null_map;
805 null_map.epoch = fsmap.epoch;
806 null_map.compat = fsmap.compat;
807 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
808 } else {
809 mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid,
810 m->get_global_id(),
811 m->get_name(),
812 fsmap.get_epoch(),
813 m->get_state(),
814 m->get_seq(),
815 CEPH_FEATURES_SUPPORTED_DEFAULT));
816 }
817 }
818
819 void MDSMonitor::on_active()
820 {
821 tick();
822 update_logger();
823
824 if (mon->is_leader()) {
825 mon->clog->debug() << "fsmap " << fsmap;
826 }
827 }
828
829 void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
830 list<pair<health_status_t, string> > *detail,
831 CephContext* cct) const
832 {
833 fsmap.get_health(summary, detail);
834
835 // For each MDS GID...
836 const auto info_map = fsmap.get_mds_info();
837 for (const auto &i : info_map) {
838 const auto &gid = i.first;
839 const auto &info = i.second;
840
841 // Decode MDSHealth
842 bufferlist bl;
843 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
844 if (!bl.length()) {
845 derr << "Missing health data for MDS " << gid << dendl;
846 continue;
847 }
848 MDSHealth health;
849 bufferlist::iterator bl_i = bl.begin();
850 health.decode(bl_i);
851
852 for (const auto &metric : health.metrics) {
853 const int rank = info.rank;
854 std::ostringstream message;
855 message << "mds" << rank << ": " << metric.message;
856 summary.push_back(std::make_pair(metric.sev, message.str()));
857
858 if (detail) {
859 // There is no way for us to clealy associate detail entries with summary entries (#7192), so
860 // we duplicate the summary message in the detail string and tag the metadata on.
861 std::ostringstream detail_message;
862 detail_message << message.str();
863 if (metric.metadata.size()) {
864 detail_message << "(";
865 auto k = metric.metadata.begin();
866 while (k != metric.metadata.end()) {
867 detail_message << k->first << ": " << k->second;
868 if (boost::next(k) != metric.metadata.end()) {
869 detail_message << ", ";
870 }
871 ++k;
872 }
873 detail_message << ")";
874 }
875 detail->push_back(std::make_pair(metric.sev, detail_message.str()));
876 }
877 }
878 }
879 }
880
881 void MDSMonitor::dump_info(Formatter *f)
882 {
883 f->open_object_section("fsmap");
884 fsmap.dump(f);
885 f->close_section();
886
887 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
888 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
889 }
890
891 bool MDSMonitor::preprocess_command(MonOpRequestRef op)
892 {
893 op->mark_mdsmon_event(__func__);
894 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
895 int r = -1;
896 bufferlist rdata;
897 stringstream ss, ds;
898
899 map<string, cmd_vartype> cmdmap;
900 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
901 // ss has reason for failure
902 string rs = ss.str();
903 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
904 return true;
905 }
906
907 string prefix;
908 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
909 string format;
910 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
911 boost::scoped_ptr<Formatter> f(Formatter::create(format));
912
913 MonSession *session = m->get_session();
914 if (!session) {
915 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
916 return true;
917 }
918
919 if (prefix == "mds stat") {
920 if (f) {
921 f->open_object_section("mds_stat");
922 dump_info(f.get());
923 f->close_section();
924 f->flush(ds);
925 } else {
926 ds << fsmap;
927 }
928 r = 0;
929 } else if (prefix == "mds dump") {
930 int64_t epocharg;
931 epoch_t epoch;
932
933 FSMap *p = &fsmap;
934 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
935 epoch = epocharg;
936 bufferlist b;
937 int err = get_version(epoch, b);
938 if (err == -ENOENT) {
939 p = 0;
940 r = -ENOENT;
941 } else {
942 assert(err == 0);
943 assert(b.length());
944 p = new FSMap;
945 p->decode(b);
946 }
947 }
948 if (p) {
949 stringstream ds;
950 const MDSMap *mdsmap = nullptr;
951 MDSMap blank;
952 blank.epoch = fsmap.epoch;
953 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
954 mdsmap = &(fsmap.filesystems[fsmap.legacy_client_fscid]->mds_map);
955 } else {
956 mdsmap = &blank;
957 }
958 if (f != NULL) {
959 f->open_object_section("mdsmap");
960 mdsmap->dump(f.get());
961 f->close_section();
962 f->flush(ds);
963 r = 0;
964 } else {
965 mdsmap->print(ds);
966 r = 0;
967 }
968
969 rdata.append(ds);
970 ss << "dumped fsmap epoch " << p->get_epoch();
971
972 if (p != &fsmap) {
973 delete p;
974 }
975 }
976 } else if (prefix == "fs dump") {
977 int64_t epocharg;
978 epoch_t epoch;
979
980 FSMap *p = &fsmap;
981 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
982 epoch = epocharg;
983 bufferlist b;
984 int err = get_version(epoch, b);
985 if (err == -ENOENT) {
986 p = 0;
987 r = -ENOENT;
988 } else {
989 assert(err == 0);
990 assert(b.length());
991 p = new FSMap;
992 p->decode(b);
993 }
994 }
995 if (p) {
996 stringstream ds;
997 if (f != NULL) {
998 f->open_object_section("fsmap");
999 p->dump(f.get());
1000 f->close_section();
1001 f->flush(ds);
1002 r = 0;
1003 } else {
1004 p->print(ds);
1005 r = 0;
1006 }
1007
1008 rdata.append(ds);
1009 ss << "dumped fsmap epoch " << p->get_epoch();
1010
1011 if (p != &fsmap)
1012 delete p;
1013 }
1014 } else if (prefix == "mds metadata") {
1015 if (!f)
1016 f.reset(Formatter::create("json-pretty"));
1017
1018 string who;
1019 bool all = !cmd_getval(g_ceph_context, cmdmap, "who", who);
1020 dout(1) << "all = " << all << dendl;
1021 if (all) {
1022 r = 0;
1023 // Dump all MDSs' metadata
1024 const auto all_info = fsmap.get_mds_info();
1025
1026 f->open_array_section("mds_metadata");
1027 for(const auto &i : all_info) {
1028 const auto &info = i.second;
1029
1030 f->open_object_section("mds");
1031 f->dump_string("name", info.name);
1032 std::ostringstream get_err;
1033 r = dump_metadata(info.name, f.get(), get_err);
1034 if (r == -EINVAL || r == -ENOENT) {
1035 // Drop error, list what metadata we do have
1036 dout(1) << get_err.str() << dendl;
1037 r = 0;
1038 } else if (r != 0) {
1039 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
1040 << dendl;
1041 ss << get_err.str();
1042 f->close_section();
1043 break;
1044 }
1045 f->close_section();
1046 }
1047 f->close_section();
1048 } else {
1049 // Dump a single daemon's metadata
1050 f->open_object_section("mds_metadata");
1051 r = dump_metadata(who, f.get(), ss);
1052 f->close_section();
1053 }
1054 f->flush(ds);
1055 } else if (prefix == "mds versions") {
1056 if (!f)
1057 f.reset(Formatter::create("json-pretty"));
1058 count_metadata("ceph_version", f.get());
1059 f->flush(ds);
1060 r = 0;
1061 } else if (prefix == "mds count-metadata") {
1062 if (!f)
1063 f.reset(Formatter::create("json-pretty"));
1064 string field;
1065 cmd_getval(g_ceph_context, cmdmap, "property", field);
1066 count_metadata(field, f.get());
1067 f->flush(ds);
1068 r = 0;
1069 } else if (prefix == "mds getmap") {
1070 epoch_t e;
1071 int64_t epocharg;
1072 bufferlist b;
1073 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
1074 e = epocharg;
1075 int err = get_version(e, b);
1076 if (err == -ENOENT) {
1077 r = -ENOENT;
1078 } else {
1079 assert(err == 0);
1080 assert(b.length());
1081 FSMap mm;
1082 mm.decode(b);
1083 mm.encode(rdata, m->get_connection()->get_features());
1084 ss << "got fsmap epoch " << mm.get_epoch();
1085 r = 0;
1086 }
1087 } else {
1088 fsmap.encode(rdata, m->get_connection()->get_features());
1089 ss << "got fsmap epoch " << fsmap.get_epoch();
1090 r = 0;
1091 }
1092 } else if (prefix == "mds compat show") {
1093 if (f) {
1094 f->open_object_section("mds_compat");
1095 fsmap.compat.dump(f.get());
1096 f->close_section();
1097 f->flush(ds);
1098 } else {
1099 ds << fsmap.compat;
1100 }
1101 r = 0;
1102 } else if (prefix == "fs get") {
1103 string fs_name;
1104 cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
1105 auto fs = fsmap.get_filesystem(fs_name);
1106 if (fs == nullptr) {
1107 ss << "filesystem '" << fs_name << "' not found";
1108 r = -ENOENT;
1109 } else {
1110 if (f != nullptr) {
1111 f->open_object_section("filesystem");
1112 fs->dump(f.get());
1113 f->close_section();
1114 f->flush(ds);
1115 r = 0;
1116 } else {
1117 fs->print(ds);
1118 r = 0;
1119 }
1120 }
1121 } else if (prefix == "fs ls") {
1122 if (f) {
1123 f->open_array_section("filesystems");
1124 {
1125 for (const auto i : fsmap.filesystems) {
1126 const auto fs = i.second;
1127 f->open_object_section("filesystem");
1128 {
1129 const MDSMap &mds_map = fs->mds_map;
1130 f->dump_string("name", mds_map.fs_name);
1131 /* Output both the names and IDs of pools, for use by
1132 * humans and machines respectively */
1133 f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
1134 mds_map.metadata_pool));
1135 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1136 f->open_array_section("data_pool_ids");
1137 {
1138 for (auto dpi = mds_map.data_pools.begin();
1139 dpi != mds_map.data_pools.end(); ++dpi) {
1140 f->dump_int("data_pool_id", *dpi);
1141 }
1142 }
1143 f->close_section();
1144
1145 f->open_array_section("data_pools");
1146 {
1147 for (auto dpi = mds_map.data_pools.begin();
1148 dpi != mds_map.data_pools.end(); ++dpi) {
1149 const auto &name = mon->osdmon()->osdmap.get_pool_name(
1150 *dpi);
1151 f->dump_string("data_pool", name);
1152 }
1153 }
1154
1155 f->close_section();
1156 }
1157 f->close_section();
1158 }
1159 }
1160 f->close_section();
1161 f->flush(ds);
1162 } else {
1163 for (const auto i : fsmap.filesystems) {
1164 const auto fs = i.second;
1165 const MDSMap &mds_map = fs->mds_map;
1166 const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
1167 mds_map.metadata_pool);
1168
1169 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1170 << md_pool_name << ", data pools: [";
1171 for (auto dpi : mds_map.data_pools) {
1172 const string &pool_name = mon->osdmon()->osdmap.get_pool_name(dpi);
1173 ds << pool_name << " ";
1174 }
1175 ds << "]" << std::endl;
1176 }
1177
1178 if (fsmap.filesystems.empty()) {
1179 ds << "No filesystems enabled" << std::endl;
1180 }
1181 }
1182 r = 0;
1183 }
1184
1185 if (r != -1) {
1186 rdata.append(ds);
1187 string rs;
1188 getline(ss, rs);
1189 mon->reply_command(op, r, rs, rdata, get_last_committed());
1190 return true;
1191 } else
1192 return false;
1193 }
1194
1195 bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
1196 {
1197 const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
1198 dout(10) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
1199
1200 epoch_t blacklist_epoch = 0;
1201 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1202 utime_t until = ceph_clock_now();
1203 until += g_conf->mds_blacklist_interval;
1204 blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
1205 }
1206
1207 pending_fsmap.erase(gid, blacklist_epoch);
1208 last_beacon.erase(gid);
1209 if (pending_daemon_health.count(gid)) {
1210 pending_daemon_health.erase(gid);
1211 pending_daemon_health_rm.insert(gid);
1212 }
1213
1214 return blacklist_epoch != 0;
1215 }
1216
1217 mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
1218 {
1219 const FSMap *relevant_fsmap = mon->is_leader() ? &pending_fsmap : &fsmap;
1220
1221 // Try parsing as a role
1222 mds_role_t role;
1223 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1224 int r = parse_role(arg, &role, ignore_err);
1225 if (r == 0) {
1226 // See if a GID is assigned to this role
1227 auto fs = relevant_fsmap->get_filesystem(role.fscid);
1228 assert(fs != nullptr); // parse_role ensures it exists
1229 if (fs->mds_map.is_up(role.rank)) {
1230 dout(10) << __func__ << ": validated rank/GID " << role
1231 << " as a rank" << dendl;
1232 return fs->mds_map.get_mds_info(role.rank).global_id;
1233 }
1234 }
1235
1236 // Try parsing as a gid
1237 std::string err;
1238 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1239 if (!err.empty()) {
1240 // Not a role or a GID, try as a daemon name
1241 const MDSMap::mds_info_t *mds_info = relevant_fsmap->find_by_name(arg);
1242 if (!mds_info) {
1243 ss << "MDS named '" << arg
1244 << "' does not exist, or is not up";
1245 return MDS_GID_NONE;
1246 }
1247 dout(10) << __func__ << ": resolved MDS name '" << arg
1248 << "' to GID " << mds_info->global_id << dendl;
1249 return mds_info->global_id;
1250 } else {
1251 // Not a role, but parses as a an integer, might be a GID
1252 dout(10) << __func__ << ": treating MDS reference '" << arg
1253 << "' as an integer " << maybe_gid << dendl;
1254
1255 if (relevant_fsmap->gid_exists(mds_gid_t(maybe_gid))) {
1256 return mds_gid_t(maybe_gid);
1257 }
1258 }
1259
1260 dout(1) << __func__ << ": rank/GID " << arg
1261 << " not a existent rank or GID" << dendl;
1262 return MDS_GID_NONE;
1263 }
1264
1265 int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg,
1266 MDSMap::mds_info_t *failed_info)
1267 {
1268 assert(failed_info != nullptr);
1269
1270 mds_gid_t gid = gid_from_arg(arg, ss);
1271 if (gid == MDS_GID_NONE) {
1272 return 0;
1273 }
1274 if (!mon->osdmon()->is_writeable()) {
1275 return -EAGAIN;
1276 }
1277
1278 // Take a copy of the info before removing the MDS from the map,
1279 // so that the caller knows which mds (if any) they ended up removing.
1280 *failed_info = pending_fsmap.get_info_gid(gid);
1281
1282 fail_mds_gid(gid);
1283 ss << "failed mds gid " << gid;
1284 assert(mon->osdmon()->is_writeable());
1285 request_proposal(mon->osdmon());
1286 return 0;
1287 }
1288
1289 bool MDSMonitor::prepare_command(MonOpRequestRef op)
1290 {
1291 op->mark_mdsmon_event(__func__);
1292 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
1293 int r = -EINVAL;
1294 stringstream ss;
1295 bufferlist rdata;
1296
1297 map<string, cmd_vartype> cmdmap;
1298 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1299 string rs = ss.str();
1300 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1301 return true;
1302 }
1303
1304 string prefix;
1305 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
1306
1307 /* Refuse access if message not associated with a valid session */
1308 MonSession *session = m->get_session();
1309 if (!session) {
1310 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1311 return true;
1312 }
1313
1314 bool batched_propose = false;
1315 for (auto h : handlers) {
1316 if (h->can_handle(prefix)) {
1317 batched_propose = h->batched_propose();
1318 if (batched_propose) {
1319 paxos->plug();
1320 }
1321 r = h->handle(mon, pending_fsmap, op, cmdmap, ss);
1322 if (batched_propose) {
1323 paxos->unplug();
1324 }
1325
1326 if (r == -EAGAIN) {
1327 // message has been enqueued for retry; return.
1328 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1329 return false;
1330 } else {
1331 if (r == 0) {
1332 // On successful updates, print the updated map
1333 print_map(pending_fsmap);
1334 }
1335 // Successful or not, we're done: respond.
1336 goto out;
1337 }
1338 }
1339 }
1340
1341 r = filesystem_command(op, prefix, cmdmap, ss);
1342 if (r >= 0) {
1343 goto out;
1344 } else if (r == -EAGAIN) {
1345 // Do not reply, the message has been enqueued for retry
1346 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1347 return false;
1348 } else if (r != -ENOSYS) {
1349 goto out;
1350 }
1351
1352 // Only handle legacy commands if there is a filesystem configured
1353 if (pending_fsmap.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1354 if (pending_fsmap.filesystems.size() == 0) {
1355 ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
1356 } else {
1357 ss << "No filesystem set for use with legacy commands";
1358 }
1359 r = -EINVAL;
1360 goto out;
1361 }
1362
1363 r = legacy_filesystem_command(op, prefix, cmdmap, ss);
1364
1365 if (r == -ENOSYS && ss.str().empty()) {
1366 ss << "unrecognized command";
1367 }
1368
1369 out:
1370 dout(4) << __func__ << " done, r=" << r << dendl;
1371 /* Compose response */
1372 string rs;
1373 getline(ss, rs);
1374
1375 if (r >= 0) {
1376 // success.. delay reply
1377 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1378 get_last_committed() + 1));
1379 if (batched_propose) {
1380 force_immediate_propose();
1381 }
1382 return true;
1383 } else {
1384 // reply immediately
1385 mon->reply_command(op, r, rs, rdata, get_last_committed());
1386 return false;
1387 }
1388 }
1389
1390
1391 /**
1392 * Given one of the following forms:
1393 * <fs name>:<rank>
1394 * <fs id>:<rank>
1395 * <rank>
1396 *
1397 * Parse into a mds_role_t. The rank-only form is only valid
1398 * if legacy_client_ns is set.
1399 */
1400 int MDSMonitor::parse_role(
1401 const std::string &role_str,
1402 mds_role_t *role,
1403 std::ostream &ss)
1404 {
1405 const FSMap *relevant_fsmap = &fsmap;
1406 if (mon->is_leader()) {
1407 relevant_fsmap = &pending_fsmap;
1408 }
1409 return relevant_fsmap->parse_role(role_str, role, ss);
1410 }
1411
1412 int MDSMonitor::filesystem_command(
1413 MonOpRequestRef op,
1414 std::string const &prefix,
1415 map<string, cmd_vartype> &cmdmap,
1416 std::stringstream &ss)
1417 {
1418 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1419 op->mark_mdsmon_event(__func__);
1420 int r = 0;
1421 string whostr;
1422 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1423
1424 if (prefix == "mds stop" ||
1425 prefix == "mds deactivate") {
1426
1427 mds_role_t role;
1428 r = parse_role(whostr, &role, ss);
1429 if (r < 0 ) {
1430 return r;
1431 }
1432 auto fs = pending_fsmap.get_filesystem(role.fscid);
1433
1434 if (!fs->mds_map.is_active(role.rank)) {
1435 r = -EEXIST;
1436 ss << "mds." << role << " not active ("
1437 << ceph_mds_state_name(fs->mds_map.get_state(role.rank)) << ")";
1438 } else if (fs->mds_map.get_root() == role.rank ||
1439 fs->mds_map.get_tableserver() == role.rank) {
1440 r = -EINVAL;
1441 ss << "can't tell the root (" << fs->mds_map.get_root()
1442 << ") or tableserver (" << fs->mds_map.get_tableserver()
1443 << ") to deactivate";
1444 } else if (role.rank != fs->mds_map.get_last_in_mds()) {
1445 r = -EINVAL;
1446 ss << "mds." << role << " doesn't have the max rank ("
1447 << fs->mds_map.get_last_in_mds() << ")";
1448 } else if (fs->mds_map.get_num_in_mds() <= size_t(fs->mds_map.get_max_mds())) {
1449 r = -EBUSY;
1450 ss << "must decrease max_mds or else MDS will immediately reactivate";
1451 } else {
1452 r = 0;
1453 mds_gid_t gid = fs->mds_map.up.at(role.rank);
1454 ss << "telling mds." << role << " "
1455 << pending_fsmap.get_info_gid(gid).addr << " to deactivate";
1456
1457 pending_fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
1458 info->state = MDSMap::STATE_STOPPING;
1459 });
1460 }
1461 } else if (prefix == "mds set_state") {
1462 mds_gid_t gid;
1463 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1464 ss << "error parsing 'gid' value '"
1465 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1466 return -EINVAL;
1467 }
1468 MDSMap::DaemonState state;
1469 if (!cmd_getval(g_ceph_context, cmdmap, "state", state)) {
1470 ss << "error parsing 'state' string value '"
1471 << cmd_vartype_stringify(cmdmap["state"]) << "'";
1472 return -EINVAL;
1473 }
1474 if (pending_fsmap.gid_exists(gid)) {
1475 pending_fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
1476 info->state = state;
1477 });
1478 ss << "set mds gid " << gid << " to state " << state << " "
1479 << ceph_mds_state_name(state);
1480 return 0;
1481 }
1482 } else if (prefix == "mds fail") {
1483 string who;
1484 cmd_getval(g_ceph_context, cmdmap, "who", who);
1485
1486 MDSMap::mds_info_t failed_info;
1487 r = fail_mds(ss, who, &failed_info);
1488 if (r < 0 && r == -EAGAIN) {
1489 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1490 return -EAGAIN; // don't propose yet; wait for message to be retried
1491 } else if (r == 0) {
1492 // Only log if we really did something (not when was already gone)
1493 if (failed_info.global_id != MDS_GID_NONE) {
1494 mon->clog->info() << failed_info.human_name() << " marked failed by "
1495 << op->get_session()->entity_name;
1496 }
1497 }
1498 } else if (prefix == "mds rm") {
1499 mds_gid_t gid;
1500 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1501 ss << "error parsing 'gid' value '"
1502 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1503 return -EINVAL;
1504 }
1505 if (!pending_fsmap.gid_exists(gid)) {
1506 ss << "mds gid " << gid << " dne";
1507 r = 0;
1508 } else {
1509 MDSMap::DaemonState state = pending_fsmap.get_info_gid(gid).state;
1510 if (state > 0) {
1511 ss << "cannot remove active mds." << pending_fsmap.get_info_gid(gid).name
1512 << " rank " << pending_fsmap.get_info_gid(gid).rank;
1513 return -EBUSY;
1514 } else {
1515 pending_fsmap.erase(gid, {});
1516 ss << "removed mds gid " << gid;
1517 return 0;
1518 }
1519 }
1520 } else if (prefix == "mds rmfailed") {
1521 string confirm;
1522 if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
1523 confirm != "--yes-i-really-mean-it") {
1524 ss << "WARNING: this can make your filesystem inaccessible! "
1525 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1526 return -EPERM;
1527 }
1528
1529 std::string role_str;
1530 cmd_getval(g_ceph_context, cmdmap, "who", role_str);
1531 mds_role_t role;
1532 int r = parse_role(role_str, &role, ss);
1533 if (r < 0) {
1534 ss << "invalid role '" << role_str << "'";
1535 return -EINVAL;
1536 }
1537
1538 pending_fsmap.modify_filesystem(
1539 role.fscid,
1540 [role](std::shared_ptr<Filesystem> fs)
1541 {
1542 fs->mds_map.failed.erase(role.rank);
1543 });
1544
1545 ss << "removed failed mds." << role;
1546 return 0;
1547 } else if (prefix == "mds compat rm_compat") {
1548 int64_t f;
1549 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1550 ss << "error parsing feature value '"
1551 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1552 return -EINVAL;
1553 }
1554 if (pending_fsmap.compat.compat.contains(f)) {
1555 ss << "removing compat feature " << f;
1556 CompatSet modified = pending_fsmap.compat;
1557 modified.compat.remove(f);
1558 pending_fsmap.update_compat(modified);
1559 } else {
1560 ss << "compat feature " << f << " not present in " << pending_fsmap.compat;
1561 }
1562 r = 0;
1563 } else if (prefix == "mds compat rm_incompat") {
1564 int64_t f;
1565 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1566 ss << "error parsing feature value '"
1567 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1568 return -EINVAL;
1569 }
1570 if (pending_fsmap.compat.incompat.contains(f)) {
1571 ss << "removing incompat feature " << f;
1572 CompatSet modified = pending_fsmap.compat;
1573 modified.incompat.remove(f);
1574 pending_fsmap.update_compat(modified);
1575 } else {
1576 ss << "incompat feature " << f << " not present in " << pending_fsmap.compat;
1577 }
1578 r = 0;
1579 } else if (prefix == "mds repaired") {
1580 std::string role_str;
1581 cmd_getval(g_ceph_context, cmdmap, "rank", role_str);
1582 mds_role_t role;
1583 r = parse_role(role_str, &role, ss);
1584 if (r < 0) {
1585 return r;
1586 }
1587
1588 bool modified = pending_fsmap.undamaged(role.fscid, role.rank);
1589 if (modified) {
1590 dout(4) << "repaired: restoring rank " << role << dendl;
1591 } else {
1592 dout(4) << "repaired: no-op on rank " << role << dendl;
1593 }
1594
1595 r = 0;
1596 } else {
1597 return -ENOSYS;
1598 }
1599
1600 return r;
1601 }
1602
1603 /**
1604 * Helper to legacy_filesystem_command
1605 */
1606 void MDSMonitor::modify_legacy_filesystem(
1607 std::function<void(std::shared_ptr<Filesystem> )> fn)
1608 {
1609 pending_fsmap.modify_filesystem(
1610 pending_fsmap.legacy_client_fscid,
1611 fn
1612 );
1613 }
1614
1615
1616
1617 /**
1618 * Handle a command that affects the filesystem (i.e. a filesystem
1619 * must exist for the command to act upon).
1620 *
1621 * @retval 0 Command was successfully handled and has side effects
1622 * @retval -EAGAIN Messages has been requeued for retry
1623 * @retval -ENOSYS Unknown command
1624 * @retval < 0 An error has occurred; **ss** may have been set.
1625 */
1626 int MDSMonitor::legacy_filesystem_command(
1627 MonOpRequestRef op,
1628 std::string const &prefix,
1629 map<string, cmd_vartype> &cmdmap,
1630 std::stringstream &ss)
1631 {
1632 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1633 op->mark_mdsmon_event(__func__);
1634 int r = 0;
1635 string whostr;
1636 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1637
1638 assert (pending_fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
1639
1640 if (prefix == "mds set_max_mds") {
1641 // NOTE: deprecated by "fs set max_mds"
1642 int64_t maxmds;
1643 if (!cmd_getval(g_ceph_context, cmdmap, "maxmds", maxmds) || maxmds <= 0) {
1644 return -EINVAL;
1645 }
1646
1647 const MDSMap& mdsmap =
1648 pending_fsmap.filesystems.at(pending_fsmap.legacy_client_fscid)->mds_map;
1649
1650 if (!mdsmap.allows_multimds() &&
1651 maxmds > mdsmap.get_max_mds() &&
1652 maxmds > 1) {
1653 ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
1654 return -EINVAL;
1655 }
1656
1657 if (maxmds > MAX_MDS) {
1658 ss << "may not have more than " << MAX_MDS << " MDS ranks";
1659 return -EINVAL;
1660 }
1661
1662 modify_legacy_filesystem(
1663 [maxmds](std::shared_ptr<Filesystem> fs)
1664 {
1665 fs->mds_map.set_max_mds(maxmds);
1666 });
1667
1668 r = 0;
1669 ss << "max_mds = " << maxmds;
1670 } else if (prefix == "mds cluster_down") {
1671 // NOTE: deprecated by "fs set cluster_down"
1672 modify_legacy_filesystem(
1673 [](std::shared_ptr<Filesystem> fs)
1674 {
1675 fs->mds_map.set_flag(CEPH_MDSMAP_DOWN);
1676 });
1677 ss << "marked fsmap DOWN";
1678 r = 0;
1679 } else if (prefix == "mds cluster_up") {
1680 // NOTE: deprecated by "fs set cluster_up"
1681 modify_legacy_filesystem(
1682 [](std::shared_ptr<Filesystem> fs)
1683 {
1684 fs->mds_map.clear_flag(CEPH_MDSMAP_DOWN);
1685 });
1686 ss << "unmarked fsmap DOWN";
1687 r = 0;
1688 } else {
1689 return -ENOSYS;
1690 }
1691
1692 return r;
1693 }
1694
1695
1696 void MDSMonitor::check_subs()
1697 {
1698 std::list<std::string> types;
1699
1700 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1701 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1702 // filesystems. Build a list of all the types we service
1703 // subscriptions for.
1704 types.push_back("fsmap");
1705 types.push_back("fsmap.user");
1706 types.push_back("mdsmap");
1707 for (const auto &i : fsmap.filesystems) {
1708 auto fscid = i.first;
1709 std::ostringstream oss;
1710 oss << "mdsmap." << fscid;
1711 types.push_back(oss.str());
1712 }
1713
1714 for (const auto &type : types) {
1715 if (mon->session_map.subs.count(type) == 0)
1716 continue;
1717 xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
1718 while (!p.end()) {
1719 Subscription *sub = *p;
1720 ++p;
1721 check_sub(sub);
1722 }
1723 }
1724 }
1725
1726
1727 void MDSMonitor::check_sub(Subscription *sub)
1728 {
1729 dout(20) << __func__ << ": " << sub->type << dendl;
1730
1731 if (sub->type == "fsmap") {
1732 if (sub->next <= fsmap.get_epoch()) {
1733 sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
1734 if (sub->onetime) {
1735 mon->session_map.remove_sub(sub);
1736 } else {
1737 sub->next = fsmap.get_epoch() + 1;
1738 }
1739 }
1740 } else if (sub->type == "fsmap.user") {
1741 if (sub->next <= fsmap.get_epoch()) {
1742 FSMapUser fsmap_u;
1743 fsmap_u.epoch = fsmap.get_epoch();
1744 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1745 for (auto p = fsmap.filesystems.begin();
1746 p != fsmap.filesystems.end();
1747 ++p) {
1748 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p->first];
1749 fs_info.cid = p->first;
1750 fs_info.name= p->second->mds_map.fs_name;
1751 }
1752 sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
1753 if (sub->onetime) {
1754 mon->session_map.remove_sub(sub);
1755 } else {
1756 sub->next = fsmap.get_epoch() + 1;
1757 }
1758 }
1759 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
1760 if (sub->next > fsmap.get_epoch()) {
1761 return;
1762 }
1763
1764 const bool is_mds = sub->session->inst.name.is_mds();
1765 mds_gid_t mds_gid = MDS_GID_NONE;
1766 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1767 if (is_mds) {
1768 // What (if any) namespace are you assigned to?
1769 auto mds_info = fsmap.get_mds_info();
1770 for (const auto &i : mds_info) {
1771 if (i.second.addr == sub->session->inst.addr) {
1772 mds_gid = i.first;
1773 fscid = fsmap.mds_roles.at(mds_gid);
1774 }
1775 }
1776 } else {
1777 // You're a client. Did you request a particular
1778 // namespace?
1779 if (sub->type.find("mdsmap.") == 0) {
1780 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1781 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1782 std::string err;
1783 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1784 if (!err.empty()) {
1785 // Client asked for a non-existent namespace, send them nothing
1786 dout(1) << "Invalid client subscription '" << sub->type
1787 << "'" << dendl;
1788 return;
1789 }
1790 if (fsmap.filesystems.count(fscid) == 0) {
1791 // Client asked for a non-existent namespace, send them nothing
1792 // TODO: something more graceful for when a client has a filesystem
1793 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1794 // flag to MMDSMap?
1795 dout(1) << "Client subscribed to non-existent namespace '" <<
1796 fscid << "'" << dendl;
1797 return;
1798 }
1799 } else {
1800 // Unqualified request for "mdsmap": give it the one marked
1801 // for use by legacy clients.
1802 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1803 fscid = fsmap.legacy_client_fscid;
1804 } else {
1805 dout(1) << "Client subscribed for legacy filesystem but "
1806 "none is configured" << dendl;
1807 return;
1808 }
1809 }
1810 }
1811 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
1812
1813 // Work out the effective latest epoch
1814 MDSMap *mds_map = nullptr;
1815 MDSMap null_map;
1816 null_map.compat = fsmap.compat;
1817 if (fscid == FS_CLUSTER_ID_NONE) {
1818 // For a client, we should have already dropped out
1819 assert(is_mds);
1820
1821 if (fsmap.standby_daemons.count(mds_gid)) {
1822 // For an MDS, we need to feed it an MDSMap with its own state in
1823 null_map.mds_info[mds_gid] = fsmap.standby_daemons[mds_gid];
1824 null_map.epoch = fsmap.standby_epochs[mds_gid];
1825 } else {
1826 null_map.epoch = fsmap.epoch;
1827 }
1828 mds_map = &null_map;
1829 } else {
1830 // Check the effective epoch
1831 mds_map = &(fsmap.filesystems.at(fscid)->mds_map);
1832 }
1833
1834 assert(mds_map != nullptr);
1835 dout(10) << __func__ << " selected MDS map epoch " <<
1836 mds_map->epoch << " for namespace " << fscid << " for subscriber "
1837 << sub->session->inst.name << " who wants epoch " << sub->next << dendl;
1838
1839 if (sub->next > mds_map->epoch) {
1840 return;
1841 }
1842 auto msg = new MMDSMap(mon->monmap->fsid, mds_map);
1843
1844 sub->session->con->send_message(msg);
1845 if (sub->onetime) {
1846 mon->session_map.remove_sub(sub);
1847 } else {
1848 sub->next = mds_map->get_epoch() + 1;
1849 }
1850 }
1851 }
1852
1853
1854 void MDSMonitor::update_metadata(mds_gid_t gid,
1855 const map<string, string>& metadata)
1856 {
1857 if (metadata.empty()) {
1858 return;
1859 }
1860 pending_metadata[gid] = metadata;
1861
1862 MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
1863 bufferlist bl;
1864 ::encode(pending_metadata, bl);
1865 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1866 paxos->trigger_propose();
1867 }
1868
1869 void MDSMonitor::remove_from_metadata(MonitorDBStore::TransactionRef t)
1870 {
1871 bool update = false;
1872 for (map<mds_gid_t, Metadata>::iterator i = pending_metadata.begin();
1873 i != pending_metadata.end(); ) {
1874 if (!pending_fsmap.gid_exists(i->first)) {
1875 pending_metadata.erase(i++);
1876 update = true;
1877 } else {
1878 ++i;
1879 }
1880 }
1881 if (!update)
1882 return;
1883 bufferlist bl;
1884 ::encode(pending_metadata, bl);
1885 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1886 }
1887
1888 int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1889 {
1890 bufferlist bl;
1891 int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1892 if (r) {
1893 dout(1) << "Unable to load 'last_metadata'" << dendl;
1894 return r;
1895 }
1896
1897 bufferlist::iterator it = bl.begin();
1898 ::decode(m, it);
1899 return 0;
1900 }
1901
1902 void MDSMonitor::count_metadata(const string& field, map<string,int> *out)
1903 {
1904 map<mds_gid_t,Metadata> meta;
1905 load_metadata(meta);
1906 for (auto& p : meta) {
1907 auto q = p.second.find(field);
1908 if (q == p.second.end()) {
1909 (*out)["unknown"]++;
1910 } else {
1911 (*out)[q->second]++;
1912 }
1913 }
1914 }
1915
1916 void MDSMonitor::count_metadata(const string& field, Formatter *f)
1917 {
1918 map<string,int> by_val;
1919 count_metadata(field, &by_val);
1920 f->open_object_section(field.c_str());
1921 for (auto& p : by_val) {
1922 f->dump_int(p.first.c_str(), p.second);
1923 }
1924 f->close_section();
1925 }
1926
1927 int MDSMonitor::dump_metadata(const std::string &who, Formatter *f, ostream& err)
1928 {
1929 assert(f);
1930
1931 mds_gid_t gid = gid_from_arg(who, err);
1932 if (gid == MDS_GID_NONE) {
1933 return -EINVAL;
1934 }
1935
1936 map<mds_gid_t, Metadata> metadata;
1937 if (int r = load_metadata(metadata)) {
1938 err << "Unable to load 'last_metadata'";
1939 return r;
1940 }
1941
1942 if (!metadata.count(gid)) {
1943 return -ENOENT;
1944 }
1945 const Metadata& m = metadata[gid];
1946 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1947 f->dump_string(p->first.c_str(), p->second);
1948 }
1949 return 0;
1950 }
1951
1952 int MDSMonitor::print_nodes(Formatter *f)
1953 {
1954 assert(f);
1955
1956 map<mds_gid_t, Metadata> metadata;
1957 if (int r = load_metadata(metadata)) {
1958 return r;
1959 }
1960
1961 map<string, list<int> > mdses; // hostname => rank
1962 for (map<mds_gid_t, Metadata>::iterator it = metadata.begin();
1963 it != metadata.end(); ++it) {
1964 const Metadata& m = it->second;
1965 Metadata::const_iterator hostname = m.find("hostname");
1966 if (hostname == m.end()) {
1967 // not likely though
1968 continue;
1969 }
1970 const mds_gid_t gid = it->first;
1971 if (!fsmap.gid_exists(gid)) {
1972 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1973 continue;
1974 }
1975 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
1976 // FIXME: include filesystem name with rank here
1977 mdses[hostname->second].push_back(mds_info.rank);
1978 }
1979
1980 dump_services(f, mdses, "mds");
1981 return 0;
1982 }
1983
1984 /**
1985 * If a cluster is undersized (with respect to max_mds), then
1986 * attempt to find daemons to grow it.
1987 */
1988 bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
1989 {
1990 bool do_propose = false;
1991
1992 if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
1993 return do_propose;
1994 }
1995
1996 while (fs->mds_map.get_num_in_mds() < size_t(fs->mds_map.get_max_mds()) &&
1997 !fs->mds_map.is_degraded()) {
1998 mds_rank_t mds = mds_rank_t(0);
1999 string name;
2000 while (fs->mds_map.is_in(mds)) {
2001 mds++;
2002 }
2003 mds_gid_t newgid = pending_fsmap.find_replacement_for({fs->fscid, mds},
2004 name, g_conf->mon_force_standby_active);
2005 if (newgid == MDS_GID_NONE) {
2006 break;
2007 }
2008
2009 const auto &new_info = pending_fsmap.get_info_gid(newgid);
2010 dout(1) << "assigned standby " << new_info.addr
2011 << " as mds." << mds << dendl;
2012
2013 mon->clog->info() << new_info.human_name() << " assigned to "
2014 "filesystem " << fs->mds_map.fs_name << " as rank "
2015 << mds << " (now has " << fs->mds_map.get_num_in_mds()
2016 << " ranks)";
2017 pending_fsmap.promote(newgid, fs, mds);
2018 do_propose = true;
2019 }
2020
2021 return do_propose;
2022 }
2023
2024
2025 /**
2026 * If a daemon is laggy, and a suitable replacement
2027 * is available, fail this daemon (remove from map) and pass its
2028 * role to another daemon.
2029 */
2030 void MDSMonitor::maybe_replace_gid(mds_gid_t gid, const MDSMap::mds_info_t& info,
2031 bool *mds_propose, bool *osd_propose)
2032 {
2033 assert(mds_propose != nullptr);
2034 assert(osd_propose != nullptr);
2035
2036 const auto fscid = pending_fsmap.mds_roles.at(gid);
2037
2038 // We will only take decisive action (replacing/removing a daemon)
2039 // if we have some indicating that some other daemon(s) are successfully
2040 // getting beacons through recently.
2041 utime_t latest_beacon;
2042 for (const auto & i : last_beacon) {
2043 latest_beacon = MAX(i.second.stamp, latest_beacon);
2044 }
2045 const bool may_replace = latest_beacon >
2046 (ceph_clock_now() -
2047 MAX(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5));
2048
2049 // are we in?
2050 // and is there a non-laggy standby that can take over for us?
2051 mds_gid_t sgid;
2052 if (info.rank >= 0 &&
2053 info.state != MDSMap::STATE_STANDBY &&
2054 info.state != MDSMap::STATE_STANDBY_REPLAY &&
2055 may_replace &&
2056 !pending_fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
2057 (sgid = pending_fsmap.find_replacement_for({fscid, info.rank}, info.name,
2058 g_conf->mon_force_standby_active)) != MDS_GID_NONE)
2059 {
2060
2061 MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
2062 dout(10) << " replacing " << gid << " " << info.addr << " mds."
2063 << info.rank << "." << info.inc
2064 << " " << ceph_mds_state_name(info.state)
2065 << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
2066
2067 mon->clog->warn() << info.human_name()
2068 << " is not responding, replacing it "
2069 << "as rank " << info.rank
2070 << " with standby " << si.human_name();
2071
2072 // Remember what NS the old one was in
2073 const fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
2074
2075 // Remove the old one
2076 *osd_propose |= fail_mds_gid(gid);
2077
2078 // Promote the replacement
2079 auto fs = pending_fsmap.filesystems.at(fscid);
2080 pending_fsmap.promote(sgid, fs, info.rank);
2081
2082 *mds_propose = true;
2083 } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
2084 info.state == MDSMap::STATE_STANDBY) && may_replace) {
2085 dout(10) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
2086 << "." << info.inc << " " << ceph_mds_state_name(info.state)
2087 << dendl;
2088 mon->clog->info() << "Standby " << info.human_name() << " is not "
2089 "responding, dropping it";
2090 fail_mds_gid(gid);
2091 *mds_propose = true;
2092 } else if (!info.laggy()) {
2093 dout(10) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
2094 << " " << ceph_mds_state_name(info.state)
2095 << " laggy" << dendl;
2096 pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
2097 info->laggy_since = ceph_clock_now();
2098 });
2099 *mds_propose = true;
2100 }
2101 }
2102
2103 bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
2104 {
2105 assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
2106
2107 bool do_propose = false;
2108
2109 // have a standby take over?
2110 set<mds_rank_t> failed;
2111 fs->mds_map.get_failed_mds_set(failed);
2112 if (!failed.empty()) {
2113 set<mds_rank_t>::iterator p = failed.begin();
2114 while (p != failed.end()) {
2115 mds_rank_t f = *p++;
2116 mds_gid_t sgid = pending_fsmap.find_replacement_for({fs->fscid, f}, {},
2117 g_conf->mon_force_standby_active);
2118 if (sgid) {
2119 const MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
2120 dout(0) << " taking over failed mds." << f << " with " << sgid
2121 << "/" << si.name << " " << si.addr << dendl;
2122 mon->clog->info() << "Standby " << si.human_name()
2123 << " assigned to filesystem " << fs->mds_map.fs_name
2124 << " as rank " << f;
2125
2126 pending_fsmap.promote(sgid, fs, f);
2127 do_propose = true;
2128 }
2129 }
2130 } else {
2131 // There were no failures to replace, so try using any available standbys
2132 // as standby-replay daemons.
2133
2134 // Take a copy of the standby GIDs so that we can iterate over
2135 // them while perhaps-modifying standby_daemons during the loop
2136 // (if we promote anyone they are removed from standby_daemons)
2137 std::vector<mds_gid_t> standby_gids;
2138 for (const auto &j : pending_fsmap.standby_daemons) {
2139 standby_gids.push_back(j.first);
2140 }
2141
2142 for (const auto &gid : standby_gids) {
2143 const auto &info = pending_fsmap.standby_daemons.at(gid);
2144 assert(info.state == MDSMap::STATE_STANDBY);
2145
2146 if (!info.standby_replay) {
2147 continue;
2148 }
2149
2150 /*
2151 * This mds is standby but has no rank assigned.
2152 * See if we can find it somebody to shadow
2153 */
2154 dout(20) << "gid " << gid << " is standby and following nobody" << dendl;
2155
2156 // standby for someone specific?
2157 if (info.standby_for_rank >= 0) {
2158 // The mds_info_t may or may not tell us exactly which filesystem
2159 // the standby_for_rank refers to: lookup via legacy_client_fscid
2160 mds_role_t target_role = {
2161 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
2162 pending_fsmap.legacy_client_fscid : info.standby_for_fscid,
2163 info.standby_for_rank};
2164
2165 // It is possible that the map contains a standby_for_fscid
2166 // that doesn't correspond to an existing filesystem, especially
2167 // if we loaded from a version with a bug (#17466)
2168 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
2169 && !pending_fsmap.filesystem_exists(info.standby_for_fscid)) {
2170 derr << "gid " << gid << " has invalid standby_for_fscid "
2171 << info.standby_for_fscid << dendl;
2172 continue;
2173 }
2174
2175 // If we managed to resolve a full target role
2176 if (target_role.fscid != FS_CLUSTER_ID_NONE) {
2177 auto fs = pending_fsmap.get_filesystem(target_role.fscid);
2178 if (fs->mds_map.is_followable(target_role.rank)) {
2179 do_propose |= try_standby_replay(
2180 info,
2181 *fs,
2182 fs->mds_map.get_info(target_role.rank));
2183 }
2184 }
2185
2186 continue;
2187 }
2188
2189 // check everyone
2190 for (auto fs_i : pending_fsmap.filesystems) {
2191 const MDSMap &mds_map = fs_i.second->mds_map;
2192 for (auto mds_i : mds_map.mds_info) {
2193 MDSMap::mds_info_t &cand_info = mds_i.second;
2194 if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
2195 if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
2196 info.standby_for_rank != MDS_RANK_NONE) {
2197 continue; // we're supposed to follow someone else
2198 }
2199
2200 if (try_standby_replay(info, *(fs_i.second), cand_info)) {
2201 do_propose = true;
2202 break;
2203 }
2204 continue;
2205 }
2206 }
2207 }
2208 }
2209 }
2210
2211 return do_propose;
2212 }
2213
2214 void MDSMonitor::tick()
2215 {
2216 // make sure mds's are still alive
2217 // ...if i am an active leader
2218 if (!is_active()) return;
2219
2220 dout(10) << fsmap << dendl;
2221
2222 bool do_propose = false;
2223
2224 if (!mon->is_leader()) return;
2225
2226 do_propose |= pending_fsmap.check_health();
2227
2228 // expand mds cluster (add new nodes to @in)?
2229 for (auto i : pending_fsmap.filesystems) {
2230 do_propose |= maybe_expand_cluster(i.second);
2231 }
2232
2233 const auto now = ceph_clock_now();
2234 if (last_tick.is_zero()) {
2235 last_tick = now;
2236 }
2237
2238 if (now - last_tick > (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
2239 // This case handles either local slowness (calls being delayed
2240 // for whatever reason) or cluster election slowness (a long gap
2241 // between calls while an election happened)
2242 dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
2243 "(slow election?) of " << now - last_tick << " seconds" << dendl;
2244 for (auto &i : last_beacon) {
2245 i.second.stamp = now;
2246 }
2247 }
2248
2249 last_tick = now;
2250
2251 // check beacon timestamps
2252 utime_t cutoff = now;
2253 cutoff -= g_conf->mds_beacon_grace;
2254
2255 // make sure last_beacon is fully populated
2256 for (const auto &p : pending_fsmap.mds_roles) {
2257 auto &gid = p.first;
2258 if (last_beacon.count(gid) == 0) {
2259 last_beacon[gid].stamp = now;
2260 last_beacon[gid].seq = 0;
2261 }
2262 }
2263
2264 bool propose_osdmap = false;
2265 bool osdmap_writeable = mon->osdmon()->is_writeable();
2266 auto p = last_beacon.begin();
2267 while (p != last_beacon.end()) {
2268 mds_gid_t gid = p->first;
2269 auto beacon_info = p->second;
2270 ++p;
2271
2272 if (!pending_fsmap.gid_exists(gid)) {
2273 // clean it out
2274 last_beacon.erase(gid);
2275 continue;
2276 }
2277
2278 if (beacon_info.stamp < cutoff) {
2279 auto &info = pending_fsmap.get_info_gid(gid);
2280 dout(1) << "no beacon from mds." << info.rank << "." << info.inc
2281 << " (gid: " << gid << " addr: " << info.addr
2282 << " state: " << ceph_mds_state_name(info.state) << ")"
2283 << " since " << beacon_info.stamp << dendl;
2284 // If the OSDMap is writeable, we can blacklist things, so we can
2285 // try failing any laggy MDS daemons. Consider each one for failure.
2286 if (osdmap_writeable) {
2287 maybe_replace_gid(gid, info, &do_propose, &propose_osdmap);
2288 }
2289 }
2290 }
2291 if (propose_osdmap) {
2292 request_proposal(mon->osdmon());
2293 }
2294
2295 for (auto i : pending_fsmap.filesystems) {
2296 auto fs = i.second;
2297 if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
2298 do_propose |= maybe_promote_standby(fs);
2299 }
2300 }
2301
2302 if (do_propose) {
2303 propose_pending();
2304 }
2305 }
2306
2307 /**
2308 * finfo: the would-be follower
2309 * leader_fs: the Filesystem containing the would-be leader
2310 * ainfo: the would-be leader
2311 */
2312 bool MDSMonitor::try_standby_replay(
2313 const MDSMap::mds_info_t& finfo,
2314 const Filesystem &leader_fs,
2315 const MDSMap::mds_info_t& ainfo)
2316 {
2317 // someone else already following?
2318 if (leader_fs.has_standby_replay(ainfo.global_id)) {
2319 dout(20) << " mds." << ainfo.rank << " already has a follower" << dendl;
2320 return false;
2321 } else {
2322 // Assign the new role to the standby
2323 dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
2324 pending_fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
2325 return true;
2326 }
2327 }
2328
2329 MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
2330 : PaxosService(mn, p, service_name)
2331 {
2332 handlers = FileSystemCommandHandler::load(p);
2333 }
2334
2335 void MDSMonitor::on_restart()
2336 {
2337 // Clear out the leader-specific state.
2338 last_tick = utime_t();
2339 last_beacon.clear();
2340 }
2341