]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/MDSMonitor.cc
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / mon / MDSMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include <sstream>
16#include <boost/utility.hpp>
17
18#include "MDSMonitor.h"
19#include "FSCommands.h"
20#include "Monitor.h"
21#include "MonitorDBStore.h"
22#include "OSDMonitor.h"
23#include "PGMonitor.h"
24
25#include "common/strtol.h"
26#include "common/perf_counters.h"
27#include "common/config.h"
28#include "common/cmdparse.h"
29#include "messages/MMDSMap.h"
30#include "messages/MFSMap.h"
31#include "messages/MFSMapUser.h"
32#include "messages/MMDSLoadTargets.h"
33#include "messages/MMonCommand.h"
34#include "messages/MGenericMessage.h"
35
36#include "include/assert.h"
37#include "include/str_list.h"
38#include "include/stringify.h"
39#include "mds/mdstypes.h"
40#include "Session.h"
41
42#define dout_subsys ceph_subsys_mon
43#undef dout_prefix
44#define dout_prefix _prefix(_dout, mon, fsmap)
45static ostream& _prefix(std::ostream *_dout, Monitor *mon, FSMap const& fsmap) {
46 return *_dout << "mon." << mon->name << "@" << mon->rank
47 << "(" << mon->get_state_name()
48 << ").mds e" << fsmap.get_epoch() << " ";
49}
50
51/*
52 * Specialized implementation of cmd_getval to allow us to parse
53 * out strongly-typedef'd types
54 */
55template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
56 std::string k, mds_gid_t &val)
57{
58 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
59}
60
61template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
62 std::string k, mds_rank_t &val)
63{
64 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
65}
66
67template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
68 std::string k, MDSMap::DaemonState &val)
69{
70 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
71}
72
// File-local string constant "mds_metadata".  Its users are outside the
// part of the file shown here; presumably a key prefix for stored MDS
// metadata -- confirm against load_metadata()/update_metadata().
static const string MDS_METADATA_PREFIX = "mds_metadata";
74
75
76// my methods
77
78void MDSMonitor::print_map(FSMap &m, int dbl)
79{
80 dout(dbl) << "print_map\n";
81 m.print(*_dout);
82 *_dout << dendl;
83}
84
85// service methods
86void MDSMonitor::create_initial()
87{
88 dout(10) << "create_initial" << dendl;
89}
90
91
92void MDSMonitor::update_from_paxos(bool *need_bootstrap)
93{
94 version_t version = get_last_committed();
95 if (version == fsmap.epoch)
96 return;
97
98 dout(10) << __func__ << " version " << version
99 << ", my e " << fsmap.epoch << dendl;
100 assert(version > fsmap.epoch);
101
102 // read and decode
103 bufferlist fsmap_bl;
104 fsmap_bl.clear();
105 int err = get_version(version, fsmap_bl);
106 assert(err == 0);
107
108 assert(fsmap_bl.length() > 0);
109 dout(10) << __func__ << " got " << version << dendl;
110 fsmap.decode(fsmap_bl);
111
112 // new map
113 dout(4) << "new map" << dendl;
114 print_map(fsmap, 0);
115 if (!g_conf->mon_mds_skip_sanity) {
116 fsmap.sanity();
117 }
118
119 check_subs();
120 update_logger();
121}
122
123void MDSMonitor::init()
124{
125 (void)load_metadata(pending_metadata);
126}
127
128void MDSMonitor::create_pending()
129{
130 pending_fsmap = fsmap;
131 pending_fsmap.epoch++;
132
133 dout(10) << "create_pending e" << pending_fsmap.epoch << dendl;
134}
135
136void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
137{
138 dout(10) << "encode_pending e" << pending_fsmap.epoch << dendl;
139
140
141 // print map iff 'debug mon = 30' or higher
142 print_map(pending_fsmap, 30);
143 if (!g_conf->mon_mds_skip_sanity) {
144 pending_fsmap.sanity();
145 }
146
147 // Set 'modified' on maps modified this epoch
148 for (auto &i : fsmap.filesystems) {
149 if (i.second->mds_map.epoch == fsmap.epoch) {
150 i.second->mds_map.modified = ceph_clock_now();
151 }
152 }
153
154 // apply to paxos
155 assert(get_last_committed() + 1 == pending_fsmap.epoch);
156 bufferlist fsmap_bl;
157 pending_fsmap.encode(fsmap_bl, mon->get_quorum_con_features());
158
159 /* put everything in the transaction */
160 put_version(t, pending_fsmap.epoch, fsmap_bl);
161 put_last_committed(t, pending_fsmap.epoch);
162
163 // Encode MDSHealth data
164 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
165 i != pending_daemon_health.end(); ++i) {
166 bufferlist bl;
167 i->second.encode(bl);
168 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
169 }
170
171 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
172 i != pending_daemon_health_rm.end(); ++i) {
173 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
174 }
175 pending_daemon_health_rm.clear();
176 remove_from_metadata(t);
177}
178
179version_t MDSMonitor::get_trim_to()
180{
181 version_t floor = 0;
182 if (g_conf->mon_mds_force_trim_to > 0 &&
183 g_conf->mon_mds_force_trim_to < (int)get_last_committed()) {
184 floor = g_conf->mon_mds_force_trim_to;
185 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
186 << floor << dendl;
187 }
188
189 unsigned max = g_conf->mon_max_mdsmap_epochs;
190 version_t last = get_last_committed();
191
192 if (last - get_first_committed() > max && floor < last - max)
193 return last - max;
194 return floor;
195}
196
197void MDSMonitor::update_logger()
198{
199 dout(10) << "update_logger" << dendl;
200
201 uint64_t up = 0;
202 uint64_t in = 0;
203 uint64_t failed = 0;
204 for (const auto &i : fsmap.filesystems) {
205 const MDSMap &mds_map = i.second->mds_map;
206
207 up += mds_map.get_num_up_mds();
208 in += mds_map.get_num_in_mds();
209 failed += mds_map.get_num_failed_mds();
210 }
211 mon->cluster_logger->set(l_cluster_num_mds_up, up);
212 mon->cluster_logger->set(l_cluster_num_mds_in, in);
213 mon->cluster_logger->set(l_cluster_num_mds_failed, failed);
214 mon->cluster_logger->set(l_cluster_mds_epoch, fsmap.get_epoch());
215}
216
217bool MDSMonitor::preprocess_query(MonOpRequestRef op)
218{
219 op->mark_mdsmon_event(__func__);
220 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
221 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
222
223 switch (m->get_type()) {
224
225 case MSG_MDS_BEACON:
226 return preprocess_beacon(op);
227
228 case MSG_MON_COMMAND:
229 return preprocess_command(op);
230
231 case MSG_MDS_OFFLOAD_TARGETS:
232 return preprocess_offload_targets(op);
233
234 default:
235 ceph_abort();
236 return true;
237 }
238}
239
240void MDSMonitor::_note_beacon(MMDSBeacon *m)
241{
242 mds_gid_t gid = mds_gid_t(m->get_global_id());
243 version_t seq = m->get_seq();
244
245 dout(15) << "_note_beacon " << *m << " noting time" << dendl;
246 last_beacon[gid].stamp = ceph_clock_now();
247 last_beacon[gid].seq = seq;
248}
249
// Read-only first look at an MDS beacon.
//
// Returns true when the beacon is finished with here: either a reply was
// sent (the "reply" label, or the null-map reply for an unknown daemon that
// is not booting) or it was dropped without a reply (the "ignore" label).
// Returns false when this monitor is not the leader, or when the beacon
// asks for something that needs a map update: boot of a gid not yet in the
// pending map, a beacon from a daemon currently marked laggy, an accepted
// state change, or changed health metrics.
250bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
251{
252 op->mark_mdsmon_event(__func__);
253 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
254 MDSMap::DaemonState state = m->get_state();
255 mds_gid_t gid = m->get_global_id();
256 version_t seq = m->get_seq();
 // info and effective_epoch are declared up here, before the first goto,
 // because the reply/ignore labels further down sit in the same scope.
257 MDSMap::mds_info_t info;
258 epoch_t effective_epoch = 0;
259
260 // check privileges, ignore if fails
261 MonSession *session = m->get_session();
 // Unlike preprocess_offload_targets(), a missing session is asserted on
 // here rather than silently dropped.
262 assert(session);
263 if (!session->is_capable("mds", MON_CAP_X)) {
264 dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
265 << session->caps << dendl;
266 goto ignore;
267 }
268
 // Drop beacons that carry a different fsid than this monitor's monmap.
269 if (m->get_fsid() != mon->monmap->fsid) {
270 dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
271 goto ignore;
272 }
273
274 dout(12) << "preprocess_beacon " << *m
275 << " from " << m->get_orig_source_inst()
276 << " " << m->get_compat()
277 << dendl;
278
279 // make sure the address has a port
280 if (m->get_orig_source_addr().get_port() == 0) {
281 dout(1) << " ignoring boot message without a port" << dendl;
282 goto ignore;
283 }
284
285 // check compat
286 if (!m->get_compat().writeable(fsmap.compat)) {
287 dout(1) << " mds " << m->get_source_inst() << " can't write to fsmap " << fsmap.compat << dendl;
288 goto ignore;
289 }
290
291 // fw to leader?
292 if (!mon->is_leader())
293 return false;
294
295 // booted, but not in map?
 // Membership is tested against pending_fsmap, not the committed fsmap.
296 if (!pending_fsmap.gid_exists(gid)) {
297 if (state != MDSMap::STATE_BOOT) {
298 dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
299 << ceph_mds_state_name(state) << ")" << dendl;
300
 // Reply with an otherwise empty MDSMap that carries only the current
 // epoch and compat set.
301 MDSMap null_map;
302 null_map.epoch = fsmap.epoch;
303 null_map.compat = fsmap.compat;
304 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
305 return true;
306 } else {
307 return false; // not booted yet.
308 }
309 }
310 dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
311 info = pending_fsmap.get_info_gid(gid);
312
313 // old seq?
314 if (info.state_seq > seq) {
315 dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
316 goto ignore;
317 }
318
319 // Work out the latest epoch that this daemon should have seen
320 {
 // NOTE(review): .at() throws if gid has no entry; this relies on
 // gid_exists() above implying an mds_roles entry (and a standby_epochs
 // entry for daemons with no filesystem) -- confirm in FSMap.
321 fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
322 if (fscid == FS_CLUSTER_ID_NONE) {
323 effective_epoch = pending_fsmap.standby_epochs.at(gid);
324 } else {
325 effective_epoch = pending_fsmap.get_filesystem(fscid)->mds_map.epoch;
326 }
327 if (effective_epoch != m->get_last_epoch_seen()) {
328 dout(10) << "mds_beacon " << *m
329 << " ignoring requested state, because mds hasn't seen latest map" << dendl;
330 goto reply;
331 }
332 }
333
334 if (info.laggy()) {
335 _note_beacon(m);
336 return false; // no longer laggy, need to update map.
337 }
338 if (state == MDSMap::STATE_BOOT) {
339 // ignore, already booted.
340 goto ignore;
341 }
342 // is there a state change here?
343 if (info.state != state) {
344 // legal state change?
 // NOTE(review): "state > 0" presumably selects the states a standby must
 // not move itself into; confirm against the DaemonState enum values.
345 if ((info.state == MDSMap::STATE_STANDBY ||
346 info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
347 dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
348 << " -> " << ceph_mds_state_name(state) << ")" << dendl;
349 goto reply;
350 }
351
352 if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
353 && info.rank != MDS_RANK_NONE)
354 {
355 dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
356 "held rank " << info.rank << " while requesting state "
357 << ceph_mds_state_name(state) << dendl;
358 goto reply;
359 }
360
 // Acceptable state change: note the beacon and hand over for a map
 // update.
361 _note_beacon(m);
362 return false;
363 }
364
365 // Comparing known daemon health with m->get_health()
366 // and return false (i.e. require proposal) if they
367 // do not match, to update our stored
 // pending_daemon_health is a std::map, so operator[] default-constructs an
 // entry for gid here if none exists yet.
368 if (!(pending_daemon_health[gid] == m->get_health())) {
369 dout(20) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
370 _note_beacon(m);
371 return false;
372 }
373
374 reply:
375 // note time and reply
 // Every path that reaches this label has passed the block above that
 // assigns effective_epoch.
376 assert(effective_epoch > 0);
377 _note_beacon(m);
 // The reply echoes the state and seq from the incoming beacon, with
 // effective_epoch as the epoch.
378 mon->send_reply(op,
379 new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
380 effective_epoch, state, seq,
381 CEPH_FEATURES_SUPPORTED_DEFAULT));
382 return true;
383
384 ignore:
385 // I won't reply this beacon, drop it.
386 mon->no_reply(op);
387 return true;
388}
389
390bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
391{
392 op->mark_mdsmon_event(__func__);
393 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
394 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
395
396 // check privileges, ignore message if fails
397 MonSession *session = m->get_session();
398 if (!session)
399 goto done;
400 if (!session->is_capable("mds", MON_CAP_X)) {
401 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
402 << session->caps << dendl;
403 goto done;
404 }
405
406 if (fsmap.gid_exists(m->global_id) &&
407 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
408 goto done;
409
410 return false;
411
412 done:
413 return true;
414}
415
416
417bool MDSMonitor::prepare_update(MonOpRequestRef op)
418{
419 op->mark_mdsmon_event(__func__);
420 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
421 dout(7) << "prepare_update " << *m << dendl;
422
423 switch (m->get_type()) {
424
425 case MSG_MDS_BEACON:
426 return prepare_beacon(op);
427
428 case MSG_MON_COMMAND:
429 return prepare_command(op);
430
431 case MSG_MDS_OFFLOAD_TARGETS:
432 return prepare_offload_targets(op);
433
434 default:
435 ceph_abort();
436 }
437
438 return true;
439}
440
441bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
442{
443 op->mark_mdsmon_event(__func__);
444 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
445 // -- this is an update --
446 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source_inst() << dendl;
447 entity_addr_t addr = m->get_orig_source_inst().addr;
448 mds_gid_t gid = m->get_global_id();
449 MDSMap::DaemonState state = m->get_state();
450 version_t seq = m->get_seq();
451
452 dout(20) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
453
454 // Calculate deltas of health metrics created and removed
455 // Do this by type rather than MDSHealthMetric equality, because messages can
456 // change a lot when they include e.g. a number of items.
457 const auto &old_health = pending_daemon_health[gid].metrics;
458 const auto &new_health = m->get_health().metrics;
459
460 std::set<mds_metric_t> old_types;
461 for (const auto &i : old_health) {
462 old_types.insert(i.type);
463 }
464
465 std::set<mds_metric_t> new_types;
466 for (const auto &i : new_health) {
467 new_types.insert(i.type);
468 }
469
470 for (const auto &new_metric: new_health) {
471 if (old_types.count(new_metric.type) == 0) {
472 std::stringstream msg;
473 msg << "MDS health message (" << m->get_orig_source_inst().name << "): "
474 << new_metric.message;
475 if (new_metric.sev == HEALTH_ERR) {
476 mon->clog->error() << msg.str();
477 } else if (new_metric.sev == HEALTH_WARN) {
478 mon->clog->warn() << msg.str();
479 } else {
480 mon->clog->info() << msg.str();
481 }
482 }
483 }
484
485 // Log the disappearance of health messages at INFO
486 for (const auto &old_metric : old_health) {
487 if (new_types.count(old_metric.type) == 0) {
488 mon->clog->info() << "MDS health message cleared ("
489 << m->get_orig_source_inst().name << "): " << old_metric.message;
490 }
491 }
492
493 // Store health
494 pending_daemon_health[gid] = m->get_health();
495
496 // boot?
497 if (state == MDSMap::STATE_BOOT) {
498 // zap previous instance of this name?
499 if (g_conf->mds_enforce_unique_name) {
500 bool failed_mds = false;
501 while (mds_gid_t existing = pending_fsmap.find_mds_gid_by_name(m->get_name())) {
502 if (!mon->osdmon()->is_writeable()) {
503 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
504 return false;
505 }
506 fail_mds_gid(existing);
507 failed_mds = true;
508 }
509 if (failed_mds) {
510 assert(mon->osdmon()->is_writeable());
511 request_proposal(mon->osdmon());
512 }
513 }
514
515 // Add this daemon to the map
516 if (pending_fsmap.mds_roles.count(gid) == 0) {
517 MDSMap::mds_info_t new_info;
518 new_info.global_id = gid;
519 new_info.name = m->get_name();
520 new_info.addr = addr;
521 new_info.mds_features = m->get_mds_features();
522 new_info.state = MDSMap::STATE_STANDBY;
523 new_info.state_seq = seq;
524 new_info.standby_for_rank = m->get_standby_for_rank();
525 new_info.standby_for_name = m->get_standby_for_name();
526 new_info.standby_for_fscid = m->get_standby_for_fscid();
527 new_info.standby_replay = m->get_standby_replay();
528 pending_fsmap.insert(new_info);
529 }
530
531 // Resolve standby_for_name to a rank
532 const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
533 if (!info.standby_for_name.empty()) {
534 const MDSMap::mds_info_t *leaderinfo = fsmap.find_by_name(
535 info.standby_for_name);
536 if (leaderinfo && (leaderinfo->rank >= 0)) {
537 auto fscid = pending_fsmap.mds_roles.at(leaderinfo->global_id);
538 auto fs = pending_fsmap.get_filesystem(fscid);
539 bool followable = fs->mds_map.is_followable(leaderinfo->rank);
540
541 pending_fsmap.modify_daemon(gid, [fscid, leaderinfo, followable](
542 MDSMap::mds_info_t *info) {
543 info->standby_for_rank = leaderinfo->rank;
544 info->standby_for_fscid = fscid;
545 });
546 }
547 }
548
549 // initialize the beacon timer
550 last_beacon[gid].stamp = ceph_clock_now();
551 last_beacon[gid].seq = seq;
552
553 // new incompat?
554 if (!pending_fsmap.compat.writeable(m->get_compat())) {
555 dout(10) << " fsmap " << pending_fsmap.compat
556 << " can't write to new mds' " << m->get_compat()
557 << ", updating fsmap and killing old mds's"
558 << dendl;
559 pending_fsmap.update_compat(m->get_compat());
560 }
561
562 update_metadata(m->get_global_id(), m->get_sys_info());
563 } else {
564 // state update
565 const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
566 // Old MDS daemons don't mention that they're standby replay until
567 // after they've sent their boot beacon, so update this field.
568 if (info.standby_replay != m->get_standby_replay()) {
569 pending_fsmap.modify_daemon(info.global_id, [&m](
570 MDSMap::mds_info_t *i)
571 {
572 i->standby_replay = m->get_standby_replay();
573 });
574 }
575
576 if (info.state == MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPED ) {
577 // we can't transition to any other states from STOPPING
578 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
579 << dendl;
580 _note_beacon(m);
581 return true;
582 }
583
584 if (info.laggy()) {
585 dout(10) << "prepare_beacon clearing laggy flag on " << addr << dendl;
586 pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
587 {
588 info->clear_laggy();
589 }
590 );
591 }
592
593 dout(10) << "prepare_beacon mds." << info.rank
594 << " " << ceph_mds_state_name(info.state)
595 << " -> " << ceph_mds_state_name(state)
596 << " standby_for_rank=" << m->get_standby_for_rank()
597 << dendl;
598 if (state == MDSMap::STATE_STOPPED) {
599 auto erased = pending_fsmap.stop(gid);
600 erased.push_back(gid);
601
602 for (const auto &erased_gid : erased) {
603 last_beacon.erase(erased_gid);
604 if (pending_daemon_health.count(erased_gid)) {
605 pending_daemon_health.erase(erased_gid);
606 pending_daemon_health_rm.insert(erased_gid);
607 }
608 }
609 } else if (state == MDSMap::STATE_DAMAGED) {
610 if (!mon->osdmon()->is_writeable()) {
611 dout(4) << __func__ << ": DAMAGED from rank " << info.rank
612 << " waiting for osdmon writeable to blacklist it" << dendl;
613 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
614 return false;
615 }
616
617 // Record this MDS rank as damaged, so that other daemons
618 // won't try to run it.
619 dout(4) << __func__ << ": marking rank "
620 << info.rank << " damaged" << dendl;
621
622 utime_t until = ceph_clock_now();
623 until += g_conf->mds_blacklist_interval;
624 const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
625 request_proposal(mon->osdmon());
626 pending_fsmap.damaged(gid, blacklist_epoch);
627 last_beacon.erase(gid);
628
629 // Respond to MDS, so that it knows it can continue to shut down
630 mon->send_reply(op,
631 new MMDSBeacon(
632 mon->monmap->fsid, m->get_global_id(),
633 m->get_name(), fsmap.get_epoch(), state, seq,
634 CEPH_FEATURES_SUPPORTED_DEFAULT));
635 } else if (state == MDSMap::STATE_DNE) {
636 if (!mon->osdmon()->is_writeable()) {
637 dout(4) << __func__ << ": DNE from rank " << info.rank
638 << " waiting for osdmon writeable to blacklist it" << dendl;
639 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
640 return false;
641 }
642
643 fail_mds_gid(gid);
644 assert(mon->osdmon()->is_writeable());
645 request_proposal(mon->osdmon());
646
647 // Respond to MDS, so that it knows it can continue to shut down
648 mon->send_reply(op,
649 new MMDSBeacon(
650 mon->monmap->fsid, m->get_global_id(),
651 m->get_name(), fsmap.get_epoch(), state, seq,
652 CEPH_FEATURES_SUPPORTED_DEFAULT));
653 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
654 // Standby daemons should never modify their own
655 // state. Reject any attempts to do so.
656 derr << "standby " << gid << " attempted to change state to "
657 << ceph_mds_state_name(state) << ", rejecting" << dendl;
658 return true;
659 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
660 !MDSMap::state_transition_valid(info.state, state)) {
661 // Validate state transitions for daemons that hold a rank
662 derr << "daemon " << gid << " (rank " << info.rank << ") "
663 << "reported invalid state transition "
664 << ceph_mds_state_name(info.state) << " -> "
665 << ceph_mds_state_name(state) << dendl;
666 return true;
667 } else {
668 // Made it through special cases and validations, record the
669 // daemon's reported state to the FSMap.
670 pending_fsmap.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
671 info->state = state;
672 info->state_seq = seq;
673 });
674 }
675 }
676
677 dout(7) << "prepare_beacon pending map now:" << dendl;
678 print_map(pending_fsmap);
679
680 wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
681 if (r >= 0)
682 _updated(op); // success
683 else if (r == -ECANCELED) {
684 mon->no_reply(op);
685 } else {
686 dispatch(op); // try again
687 }
688 }));
689
690 return true;
691}
692
693bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
694{
695 op->mark_mdsmon_event(__func__);
696 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
697 mds_gid_t gid = m->global_id;
698 if (pending_fsmap.gid_has_rank(gid)) {
699 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
700 pending_fsmap.update_export_targets(gid, m->targets);
701 } else {
702 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
703 }
704 return true;
705}
706
707bool MDSMonitor::should_propose(double& delay)
708{
709 // delegate to PaxosService to assess whether we should propose
710 return PaxosService::should_propose(delay);
711}
712
713void MDSMonitor::_updated(MonOpRequestRef op)
714{
715 op->mark_mdsmon_event(__func__);
716 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
717 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
718 mon->clog->info() << m->get_orig_source_inst() << " "
719 << ceph_mds_state_name(m->get_state());
720
721 if (m->get_state() == MDSMap::STATE_STOPPED) {
722 // send the map manually (they're out of the map, so they won't get it automatic)
723 MDSMap null_map;
724 null_map.epoch = fsmap.epoch;
725 null_map.compat = fsmap.compat;
726 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
727 } else {
728 mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid,
729 m->get_global_id(),
730 m->get_name(),
731 fsmap.get_epoch(),
732 m->get_state(),
733 m->get_seq(),
734 CEPH_FEATURES_SUPPORTED_DEFAULT));
735 }
736}
737
738void MDSMonitor::on_active()
739{
740 tick();
741 update_logger();
742
743 if (mon->is_leader())
744 mon->clog->info() << "fsmap " << fsmap;
745}
746
747void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
748 list<pair<health_status_t, string> > *detail,
749 CephContext* cct) const
750{
751 fsmap.get_health(summary, detail);
752
753 // For each MDS GID...
754 const auto info_map = fsmap.get_mds_info();
755 for (const auto &i : info_map) {
756 const auto &gid = i.first;
757 const auto &info = i.second;
758
759 // Decode MDSHealth
760 bufferlist bl;
761 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
762 if (!bl.length()) {
763 derr << "Missing health data for MDS " << gid << dendl;
764 continue;
765 }
766 MDSHealth health;
767 bufferlist::iterator bl_i = bl.begin();
768 health.decode(bl_i);
769
770 for (const auto &metric : health.metrics) {
771 int const rank = info.rank;
772 std::ostringstream message;
773 message << "mds" << rank << ": " << metric.message;
774 summary.push_back(std::make_pair(metric.sev, message.str()));
775
776 if (detail) {
777 // There is no way for us to clealy associate detail entries with summary entries (#7192), so
778 // we duplicate the summary message in the detail string and tag the metadata on.
779 std::ostringstream detail_message;
780 detail_message << message.str();
781 if (metric.metadata.size()) {
782 detail_message << "(";
783 auto k = metric.metadata.begin();
784 while (k != metric.metadata.end()) {
785 detail_message << k->first << ": " << k->second;
786 if (boost::next(k) != metric.metadata.end()) {
787 detail_message << ", ";
788 }
789 ++k;
790 }
791 detail_message << ")";
792 }
793 detail->push_back(std::make_pair(metric.sev, detail_message.str()));
794 }
795 }
796 }
797}
798
799void MDSMonitor::dump_info(Formatter *f)
800{
801 f->open_object_section("fsmap");
802 fsmap.dump(f);
803 f->close_section();
804
805 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
806 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
807}
808
809bool MDSMonitor::preprocess_command(MonOpRequestRef op)
810{
811 op->mark_mdsmon_event(__func__);
812 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
813 int r = -1;
814 bufferlist rdata;
815 stringstream ss, ds;
816
817 map<string, cmd_vartype> cmdmap;
818 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
819 // ss has reason for failure
820 string rs = ss.str();
821 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
822 return true;
823 }
824
825 string prefix;
826 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
827 string format;
828 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
829 boost::scoped_ptr<Formatter> f(Formatter::create(format));
830
831 MonSession *session = m->get_session();
832 if (!session) {
833 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
834 return true;
835 }
836
837 if (prefix == "mds stat") {
838 if (f) {
839 f->open_object_section("mds_stat");
840 dump_info(f.get());
841 f->close_section();
842 f->flush(ds);
843 } else {
844 ds << fsmap;
845 }
846 r = 0;
847 } else if (prefix == "mds dump") {
848 int64_t epocharg;
849 epoch_t epoch;
850
851 FSMap *p = &fsmap;
852 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
853 epoch = epocharg;
854 bufferlist b;
855 int err = get_version(epoch, b);
856 if (err == -ENOENT) {
857 p = 0;
858 r = -ENOENT;
859 } else {
860 assert(err == 0);
861 assert(b.length());
862 p = new FSMap;
863 p->decode(b);
864 }
865 }
866 if (p) {
867 stringstream ds;
868 const MDSMap *mdsmap = nullptr;
869 MDSMap blank;
870 blank.epoch = fsmap.epoch;
871 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
872 mdsmap = &(fsmap.filesystems[fsmap.legacy_client_fscid]->mds_map);
873 } else {
874 mdsmap = &blank;
875 }
876 if (f != NULL) {
877 f->open_object_section("mdsmap");
878 mdsmap->dump(f.get());
879 f->close_section();
880 f->flush(ds);
881 r = 0;
882 } else {
883 mdsmap->print(ds);
884 r = 0;
885 }
886 if (r == 0) {
887 rdata.append(ds);
888 ss << "dumped fsmap epoch " << p->get_epoch();
889 }
890 if (p != &fsmap) {
891 delete p;
892 }
893 }
894 } else if (prefix == "fs dump") {
895 int64_t epocharg;
896 epoch_t epoch;
897
898 FSMap *p = &fsmap;
899 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
900 epoch = epocharg;
901 bufferlist b;
902 int err = get_version(epoch, b);
903 if (err == -ENOENT) {
904 p = 0;
905 r = -ENOENT;
906 } else {
907 assert(err == 0);
908 assert(b.length());
909 p = new FSMap;
910 p->decode(b);
911 }
912 }
913 if (p) {
914 stringstream ds;
915 if (f != NULL) {
916 f->open_object_section("fsmap");
917 p->dump(f.get());
918 f->close_section();
919 f->flush(ds);
920 r = 0;
921 } else {
922 p->print(ds);
923 r = 0;
924 }
925 if (r == 0) {
926 rdata.append(ds);
927 ss << "dumped fsmap epoch " << p->get_epoch();
928 }
929 if (p != &fsmap)
930 delete p;
931 }
932 } else if (prefix == "mds metadata") {
933 if (!f)
934 f.reset(Formatter::create("json-pretty"));
935
936 string who;
937 bool all = !cmd_getval(g_ceph_context, cmdmap, "who", who);
938 dout(1) << "all = " << all << dendl;
939 if (all) {
940 r = 0;
941 // Dump all MDSs' metadata
942 const auto all_info = fsmap.get_mds_info();
943
944 f->open_array_section("mds_metadata");
945 for(const auto &i : all_info) {
946 const auto &info = i.second;
947
948 f->open_object_section("mds");
949 f->dump_string("name", info.name);
950 std::ostringstream get_err;
951 r = dump_metadata(info.name, f.get(), get_err);
952 if (r == -EINVAL || r == -ENOENT) {
953 // Drop error, list what metadata we do have
954 dout(1) << get_err.str() << dendl;
955 r = 0;
956 } else if (r != 0) {
957 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
958 << dendl;
959 ss << get_err.str();
960 break;
961 }
962 f->close_section();
963 }
964 f->close_section();
965 } else {
966 // Dump a single daemon's metadata
967 f->open_object_section("mds_metadata");
968 r = dump_metadata(who, f.get(), ss);
969 f->close_section();
970 }
971 f->flush(ds);
972 } else if (prefix == "mds getmap") {
973 epoch_t e;
974 int64_t epocharg;
975 bufferlist b;
976 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
977 e = epocharg;
978 int err = get_version(e, b);
979 if (err == -ENOENT) {
980 r = -ENOENT;
981 } else {
982 assert(err == 0);
983 assert(b.length());
984 FSMap mm;
985 mm.decode(b);
986 mm.encode(rdata, m->get_connection()->get_features());
987 ss << "got fsmap epoch " << mm.get_epoch();
988 r = 0;
989 }
990 } else {
991 fsmap.encode(rdata, m->get_connection()->get_features());
992 ss << "got fsmap epoch " << fsmap.get_epoch();
993 r = 0;
994 }
995 } else if (prefix == "mds compat show") {
996 if (f) {
997 f->open_object_section("mds_compat");
998 fsmap.compat.dump(f.get());
999 f->close_section();
1000 f->flush(ds);
1001 } else {
1002 ds << fsmap.compat;
1003 }
1004 r = 0;
1005 } else if (prefix == "fs get") {
1006 string fs_name;
1007 cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
1008 auto fs = fsmap.get_filesystem(fs_name);
1009 if (fs == nullptr) {
1010 ss << "filesystem '" << fs_name << "' not found";
1011 r = -ENOENT;
1012 } else {
1013 if (f != nullptr) {
1014 f->open_object_section("filesystem");
1015 fs->dump(f.get());
1016 f->close_section();
1017 f->flush(ds);
1018 r = 0;
1019 } else {
1020 fs->print(ds);
1021 r = 0;
1022 }
1023 }
1024 } else if (prefix == "fs ls") {
1025 if (f) {
1026 f->open_array_section("filesystems");
1027 {
1028 for (const auto i : fsmap.filesystems) {
1029 const auto fs = i.second;
1030 f->open_object_section("filesystem");
1031 {
1032 const MDSMap &mds_map = fs->mds_map;
1033 f->dump_string("name", mds_map.fs_name);
1034 /* Output both the names and IDs of pools, for use by
1035 * humans and machines respectively */
1036 f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
1037 mds_map.metadata_pool));
1038 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1039 f->open_array_section("data_pool_ids");
1040 {
1041 for (auto dpi = mds_map.data_pools.begin();
1042 dpi != mds_map.data_pools.end(); ++dpi) {
1043 f->dump_int("data_pool_id", *dpi);
1044 }
1045 }
1046 f->close_section();
1047
1048 f->open_array_section("data_pools");
1049 {
1050 for (auto dpi = mds_map.data_pools.begin();
1051 dpi != mds_map.data_pools.end(); ++dpi) {
1052 const auto &name = mon->osdmon()->osdmap.get_pool_name(
1053 *dpi);
1054 f->dump_string("data_pool", name);
1055 }
1056 }
1057
1058 f->close_section();
1059 }
1060 f->close_section();
1061 }
1062 }
1063 f->close_section();
1064 f->flush(ds);
1065 } else {
1066 for (const auto i : fsmap.filesystems) {
1067 const auto fs = i.second;
1068 const MDSMap &mds_map = fs->mds_map;
1069 const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
1070 mds_map.metadata_pool);
1071
1072 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1073 << md_pool_name << ", data pools: [";
1074 for (std::set<int64_t>::iterator dpi = mds_map.data_pools.begin();
1075 dpi != mds_map.data_pools.end(); ++dpi) {
1076 const string &pool_name = mon->osdmon()->osdmap.get_pool_name(*dpi);
1077 ds << pool_name << " ";
1078 }
1079 ds << "]" << std::endl;
1080 }
1081
1082 if (fsmap.filesystems.empty()) {
1083 ds << "No filesystems enabled" << std::endl;
1084 }
1085 }
1086 r = 0;
1087 }
1088
1089 if (r != -1) {
1090 rdata.append(ds);
1091 string rs;
1092 getline(ss, rs);
1093 mon->reply_command(op, r, rs, rdata, get_last_committed());
1094 return true;
1095 } else
1096 return false;
1097}
1098
1099bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
1100{
1101 const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
1102 dout(10) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
1103
1104 epoch_t blacklist_epoch = 0;
1105 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1106 utime_t until = ceph_clock_now();
1107 until += g_conf->mds_blacklist_interval;
1108 blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
1109 }
1110
1111 pending_fsmap.erase(gid, blacklist_epoch);
1112 last_beacon.erase(gid);
1113 if (pending_daemon_health.count(gid)) {
1114 pending_daemon_health.erase(gid);
1115 pending_daemon_health_rm.insert(gid);
1116 }
1117
1118 return blacklist_epoch != 0;
1119}
1120
1121mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
1122{
1123 // Try parsing as a role
1124 mds_role_t role;
1125 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1126 int r = parse_role(arg, &role, ignore_err);
1127 if (r == 0) {
1128 // See if a GID is assigned to this role
1129 auto fs = pending_fsmap.get_filesystem(role.fscid);
1130 assert(fs != nullptr); // parse_role ensures it exists
1131 if (fs->mds_map.is_up(role.rank)) {
1132 dout(10) << __func__ << ": validated rank/GID " << role
1133 << " as a rank" << dendl;
1134 return fs->mds_map.get_mds_info(role.rank).global_id;
1135 }
1136 }
1137
1138 // Try parsing as a gid
1139 std::string err;
1140 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1141 if (!err.empty()) {
1142 // Not a role or a GID, try as a daemon name
1143 const MDSMap::mds_info_t *mds_info = fsmap.find_by_name(arg);
1144 if (!mds_info) {
1145 ss << "MDS named '" << arg
1146 << "' does not exist, or is not up";
1147 return MDS_GID_NONE;
1148 }
1149 dout(10) << __func__ << ": resolved MDS name '" << arg
1150 << "' to GID " << mds_info->global_id << dendl;
1151 return mds_info->global_id;
1152 } else {
1153 // Not a role, but parses as a an integer, might be a GID
1154 dout(10) << __func__ << ": treating MDS reference '" << arg
1155 << "' as an integer " << maybe_gid << dendl;
1156 if (mon->is_leader()) {
1157 if (pending_fsmap.gid_exists(mds_gid_t(maybe_gid))) {
1158 return mds_gid_t(maybe_gid);
1159 }
1160 } else {
1161 if (fsmap.gid_exists(mds_gid_t(maybe_gid))) {
1162 return mds_gid_t(maybe_gid);
1163 }
1164 }
1165 }
1166
1167 dout(1) << __func__ << ": rank/GID " << arg
1168 << " not a existent rank or GID" << dendl;
1169 return MDS_GID_NONE;
1170}
1171
1172int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
1173{
1174 mds_gid_t gid = gid_from_arg(arg, ss);
1175 if (gid == MDS_GID_NONE) {
1176 return 0;
1177 }
1178 if (!mon->osdmon()->is_writeable()) {
1179 return -EAGAIN;
1180 }
1181 fail_mds_gid(gid);
1182 ss << "failed mds gid " << gid;
1183 assert(mon->osdmon()->is_writeable());
1184 request_proposal(mon->osdmon());
1185 return 0;
1186}
1187
1188bool MDSMonitor::prepare_command(MonOpRequestRef op)
1189{
1190 op->mark_mdsmon_event(__func__);
1191 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
1192 int r = -EINVAL;
1193 stringstream ss;
1194 bufferlist rdata;
1195
1196 map<string, cmd_vartype> cmdmap;
1197 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1198 string rs = ss.str();
1199 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1200 return true;
1201 }
1202
1203 string prefix;
1204 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
1205
1206 /* Refuse access if message not associated with a valid session */
1207 MonSession *session = m->get_session();
1208 if (!session) {
1209 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1210 return true;
1211 }
1212
1213 for (auto h : handlers) {
1214 if (h->can_handle(prefix)) {
1215 r = h->handle(mon, pending_fsmap, op, cmdmap, ss);
1216 if (r == -EAGAIN) {
1217 // message has been enqueued for retry; return.
1218 dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
1219 return false;
1220 } else {
1221 if (r == 0) {
1222 // On successful updates, print the updated map
1223 print_map(pending_fsmap);
1224 }
1225 // Successful or not, we're done: respond.
1226 goto out;
1227 }
1228 }
1229 }
1230
1231 r = filesystem_command(op, prefix, cmdmap, ss);
1232 if (r >= 0) {
1233 goto out;
1234 } else if (r == -EAGAIN) {
1235 // Do not reply, the message has been enqueued for retry
1236 dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
1237 return false;
1238 } else if (r != -ENOSYS) {
1239 goto out;
1240 }
1241
1242 // Only handle legacy commands if there is a filesystem configured
1243 if (pending_fsmap.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
1244 if (pending_fsmap.filesystems.size() == 0) {
1245 ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
1246 } else {
1247 ss << "No filesystem set for use with legacy commands";
1248 }
1249 r = -EINVAL;
1250 goto out;
1251 }
1252
1253 r = legacy_filesystem_command(op, prefix, cmdmap, ss);
1254
1255 if (r == -ENOSYS && ss.str().empty()) {
1256 ss << "unrecognized command";
1257 }
1258
1259out:
1260 dout(4) << __func__ << " done, r=" << r << dendl;
1261 /* Compose response */
1262 string rs;
1263 getline(ss, rs);
1264
1265 if (r >= 0) {
1266 // success.. delay reply
1267 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1268 get_last_committed() + 1));
1269 return true;
1270 } else {
1271 // reply immediately
1272 mon->reply_command(op, r, rs, rdata, get_last_committed());
1273 return false;
1274 }
1275}
1276
1277
1278/**
1279 * Given one of the following forms:
1280 * <fs name>:<rank>
1281 * <fs id>:<rank>
1282 * <rank>
1283 *
1284 * Parse into a mds_role_t. The rank-only form is only valid
1285 * if legacy_client_ns is set.
1286 */
1287int MDSMonitor::parse_role(
1288 const std::string &role_str,
1289 mds_role_t *role,
1290 std::ostream &ss)
1291{
1292 const FSMap *relevant_fsmap = &fsmap;
1293 if (mon->is_leader()) {
1294 relevant_fsmap = &pending_fsmap;
1295 }
1296 return relevant_fsmap->parse_role(role_str, role, ss);
1297}
1298
1299int MDSMonitor::filesystem_command(
1300 MonOpRequestRef op,
1301 std::string const &prefix,
1302 map<string, cmd_vartype> &cmdmap,
1303 std::stringstream &ss)
1304{
1305 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1306 op->mark_mdsmon_event(__func__);
1307 int r = 0;
1308 string whostr;
1309 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1310
1311 if (prefix == "mds stop" ||
1312 prefix == "mds deactivate") {
1313
1314 mds_role_t role;
1315 r = parse_role(whostr, &role, ss);
1316 if (r < 0 ) {
1317 return r;
1318 }
1319 auto fs = pending_fsmap.get_filesystem(role.fscid);
1320
1321 if (!fs->mds_map.is_active(role.rank)) {
1322 r = -EEXIST;
1323 ss << "mds." << role << " not active ("
1324 << ceph_mds_state_name(fs->mds_map.get_state(role.rank)) << ")";
1325 } else if (fs->mds_map.get_root() == role.rank ||
1326 fs->mds_map.get_tableserver() == role.rank) {
1327 r = -EINVAL;
1328 ss << "can't tell the root (" << fs->mds_map.get_root()
1329 << ") or tableserver (" << fs->mds_map.get_tableserver()
1330 << ") to deactivate";
1331 } else if (fs->mds_map.get_num_in_mds() <= size_t(fs->mds_map.get_max_mds())) {
1332 r = -EBUSY;
1333 ss << "must decrease max_mds or else MDS will immediately reactivate";
1334 } else {
1335 r = 0;
1336 mds_gid_t gid = fs->mds_map.up.at(role.rank);
1337 ss << "telling mds." << role << " "
1338 << pending_fsmap.get_info_gid(gid).addr << " to deactivate";
1339
1340 pending_fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
1341 info->state = MDSMap::STATE_STOPPING;
1342 });
1343 }
1344 } else if (prefix == "mds set_state") {
1345 mds_gid_t gid;
1346 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1347 ss << "error parsing 'gid' value '"
1348 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1349 return -EINVAL;
1350 }
1351 MDSMap::DaemonState state;
1352 if (!cmd_getval(g_ceph_context, cmdmap, "state", state)) {
1353 ss << "error parsing 'state' string value '"
1354 << cmd_vartype_stringify(cmdmap["state"]) << "'";
1355 return -EINVAL;
1356 }
1357 if (pending_fsmap.gid_exists(gid)) {
1358 pending_fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
1359 info->state = state;
1360 });
1361 ss << "set mds gid " << gid << " to state " << state << " "
1362 << ceph_mds_state_name(state);
1363 return 0;
1364 }
1365 } else if (prefix == "mds fail") {
1366 string who;
1367 cmd_getval(g_ceph_context, cmdmap, "who", who);
1368 r = fail_mds(ss, who);
1369 if (r < 0 && r == -EAGAIN) {
1370 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1371 return -EAGAIN; // don't propose yet; wait for message to be retried
1372 }
1373 } else if (prefix == "mds rm") {
1374 mds_gid_t gid;
1375 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1376 ss << "error parsing 'gid' value '"
1377 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1378 return -EINVAL;
1379 }
1380 if (!pending_fsmap.gid_exists(gid)) {
1381 ss << "mds gid " << gid << " dne";
1382 r = 0;
1383 } else {
1384 MDSMap::DaemonState state = pending_fsmap.get_info_gid(gid).state;
1385 if (state > 0) {
1386 ss << "cannot remove active mds." << pending_fsmap.get_info_gid(gid).name
1387 << " rank " << pending_fsmap.get_info_gid(gid).rank;
1388 return -EBUSY;
1389 } else {
1390 pending_fsmap.erase(gid, {});
1391 ss << "removed mds gid " << gid;
1392 return 0;
1393 }
1394 }
1395 } else if (prefix == "mds rmfailed") {
1396 string confirm;
1397 if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
1398 confirm != "--yes-i-really-mean-it") {
1399 ss << "WARNING: this can make your filesystem inaccessible! "
1400 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1401 return -EPERM;
1402 }
1403
1404 std::string role_str;
1405 cmd_getval(g_ceph_context, cmdmap, "who", role_str);
1406 mds_role_t role;
1407 int r = parse_role(role_str, &role, ss);
1408 if (r < 0) {
1409 ss << "invalid role '" << role_str << "'";
1410 return -EINVAL;
1411 }
1412
1413 pending_fsmap.modify_filesystem(
1414 role.fscid,
1415 [role](std::shared_ptr<Filesystem> fs)
1416 {
1417 fs->mds_map.failed.erase(role.rank);
1418 });
1419
1420 ss << "removed failed mds." << role;
1421 return 0;
1422 } else if (prefix == "mds compat rm_compat") {
1423 int64_t f;
1424 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1425 ss << "error parsing feature value '"
1426 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1427 return -EINVAL;
1428 }
1429 if (pending_fsmap.compat.compat.contains(f)) {
1430 ss << "removing compat feature " << f;
1431 CompatSet modified = pending_fsmap.compat;
1432 modified.compat.remove(f);
1433 pending_fsmap.update_compat(modified);
1434 } else {
1435 ss << "compat feature " << f << " not present in " << pending_fsmap.compat;
1436 }
1437 r = 0;
1438 } else if (prefix == "mds compat rm_incompat") {
1439 int64_t f;
1440 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1441 ss << "error parsing feature value '"
1442 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1443 return -EINVAL;
1444 }
1445 if (pending_fsmap.compat.incompat.contains(f)) {
1446 ss << "removing incompat feature " << f;
1447 CompatSet modified = pending_fsmap.compat;
1448 modified.incompat.remove(f);
1449 pending_fsmap.update_compat(modified);
1450 } else {
1451 ss << "incompat feature " << f << " not present in " << pending_fsmap.compat;
1452 }
1453 r = 0;
1454 } else if (prefix == "mds repaired") {
1455 std::string role_str;
1456 cmd_getval(g_ceph_context, cmdmap, "rank", role_str);
1457 mds_role_t role;
1458 r = parse_role(role_str, &role, ss);
1459 if (r < 0) {
1460 return r;
1461 }
1462
1463 bool modified = pending_fsmap.undamaged(role.fscid, role.rank);
1464 if (modified) {
1465 dout(4) << "repaired: restoring rank " << role << dendl;
1466 } else {
1467 dout(4) << "repaired: no-op on rank " << role << dendl;
1468 }
1469
1470 r = 0;
1471 } else {
1472 return -ENOSYS;
1473 }
1474
1475 return r;
1476}
1477
1478/**
1479 * Helper to legacy_filesystem_command
1480 */
1481void MDSMonitor::modify_legacy_filesystem(
1482 std::function<void(std::shared_ptr<Filesystem> )> fn)
1483{
1484 pending_fsmap.modify_filesystem(
1485 pending_fsmap.legacy_client_fscid,
1486 fn
1487 );
1488}
1489
1490
1491
1492/**
1493 * Handle a command that affects the filesystem (i.e. a filesystem
1494 * must exist for the command to act upon).
1495 *
1496 * @retval 0 Command was successfully handled and has side effects
1497 * @retval -EAGAIN Messages has been requeued for retry
1498 * @retval -ENOSYS Unknown command
1499 * @retval < 0 An error has occurred; **ss** may have been set.
1500 */
1501int MDSMonitor::legacy_filesystem_command(
1502 MonOpRequestRef op,
1503 std::string const &prefix,
1504 map<string, cmd_vartype> &cmdmap,
1505 std::stringstream &ss)
1506{
1507 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1508 op->mark_mdsmon_event(__func__);
1509 int r = 0;
1510 string whostr;
1511 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1512
1513 assert (pending_fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
1514
1515 if (prefix == "mds set_max_mds") {
1516 // NOTE: deprecated by "fs set max_mds"
1517 int64_t maxmds;
1518 if (!cmd_getval(g_ceph_context, cmdmap, "maxmds", maxmds) || maxmds <= 0) {
1519 return -EINVAL;
1520 }
1521
1522 const MDSMap& mdsmap =
1523 pending_fsmap.filesystems.at(pending_fsmap.legacy_client_fscid)->mds_map;
1524
1525 if (!mdsmap.allows_multimds() &&
1526 maxmds > mdsmap.get_max_mds() &&
1527 maxmds > 1) {
1528 ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
1529 return -EINVAL;
1530 }
1531
1532 if (maxmds > MAX_MDS) {
1533 ss << "may not have more than " << MAX_MDS << " MDS ranks";
1534 return -EINVAL;
1535 }
1536
1537 modify_legacy_filesystem(
1538 [maxmds](std::shared_ptr<Filesystem> fs)
1539 {
1540 fs->mds_map.set_max_mds(maxmds);
1541 });
1542
1543 r = 0;
1544 ss << "max_mds = " << maxmds;
1545 } else if (prefix == "mds cluster_down") {
1546 // NOTE: deprecated by "fs set cluster_down"
1547 modify_legacy_filesystem(
1548 [](std::shared_ptr<Filesystem> fs)
1549 {
1550 fs->mds_map.set_flag(CEPH_MDSMAP_DOWN);
1551 });
1552 ss << "marked fsmap DOWN";
1553 r = 0;
1554 } else if (prefix == "mds cluster_up") {
1555 // NOTE: deprecated by "fs set cluster_up"
1556 modify_legacy_filesystem(
1557 [](std::shared_ptr<Filesystem> fs)
1558 {
1559 fs->mds_map.clear_flag(CEPH_MDSMAP_DOWN);
1560 });
1561 ss << "unmarked fsmap DOWN";
1562 r = 0;
1563 } else {
1564 return -ENOSYS;
1565 }
1566
1567 return r;
1568}
1569
1570
1571void MDSMonitor::check_subs()
1572{
1573 std::list<std::string> types;
1574
1575 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1576 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1577 // filesystems. Build a list of all the types we service
1578 // subscriptions for.
1579 types.push_back("fsmap");
1580 types.push_back("fsmap.user");
1581 types.push_back("mdsmap");
1582 for (const auto &i : fsmap.filesystems) {
1583 auto fscid = i.first;
1584 std::ostringstream oss;
1585 oss << "mdsmap." << fscid;
1586 types.push_back(oss.str());
1587 }
1588
1589 for (const auto &type : types) {
1590 if (mon->session_map.subs.count(type) == 0)
1591 continue;
1592 xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
1593 while (!p.end()) {
1594 Subscription *sub = *p;
1595 ++p;
1596 check_sub(sub);
1597 }
1598 }
1599}
1600
1601
1602void MDSMonitor::check_sub(Subscription *sub)
1603{
1604 dout(20) << __func__ << ": " << sub->type << dendl;
1605
1606 if (sub->type == "fsmap") {
1607 if (sub->next <= fsmap.get_epoch()) {
1608 sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
1609 if (sub->onetime) {
1610 mon->session_map.remove_sub(sub);
1611 } else {
1612 sub->next = fsmap.get_epoch() + 1;
1613 }
1614 }
1615 } else if (sub->type == "fsmap.user") {
1616 if (sub->next <= fsmap.get_epoch()) {
1617 FSMapUser fsmap_u;
1618 fsmap_u.epoch = fsmap.get_epoch();
1619 fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
1620 for (auto p = fsmap.filesystems.begin();
1621 p != fsmap.filesystems.end();
1622 ++p) {
1623 FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p->first];
1624 fs_info.cid = p->first;
1625 fs_info.name= p->second->mds_map.fs_name;
1626 }
1627 sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
1628 if (sub->onetime) {
1629 mon->session_map.remove_sub(sub);
1630 } else {
1631 sub->next = fsmap.get_epoch() + 1;
1632 }
1633 }
1634 } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
1635 if (sub->next > fsmap.get_epoch()) {
1636 return;
1637 }
1638
1639 const bool is_mds = sub->session->inst.name.is_mds();
1640 mds_gid_t mds_gid = MDS_GID_NONE;
1641 fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
1642 if (is_mds) {
1643 // What (if any) namespace are you assigned to?
1644 auto mds_info = fsmap.get_mds_info();
1645 for (const auto &i : mds_info) {
1646 if (i.second.addr == sub->session->inst.addr) {
1647 mds_gid = i.first;
1648 fscid = fsmap.mds_roles.at(mds_gid);
1649 }
1650 }
1651 } else {
1652 // You're a client. Did you request a particular
1653 // namespace?
1654 if (sub->type.find("mdsmap.") == 0) {
1655 auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
1656 dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
1657 std::string err;
1658 fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
1659 if (!err.empty()) {
1660 // Client asked for a non-existent namespace, send them nothing
1661 dout(1) << "Invalid client subscription '" << sub->type
1662 << "'" << dendl;
1663 return;
1664 }
1665 if (fsmap.filesystems.count(fscid) == 0) {
1666 // Client asked for a non-existent namespace, send them nothing
1667 // TODO: something more graceful for when a client has a filesystem
1668 // mounted, and the fileysstem is deleted. Add a "shut down you fool"
1669 // flag to MMDSMap?
1670 dout(1) << "Client subscribed to non-existent namespace '" <<
1671 fscid << "'" << dendl;
1672 return;
1673 }
1674 } else {
1675 // Unqualified request for "mdsmap": give it the one marked
1676 // for use by legacy clients.
1677 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
1678 fscid = fsmap.legacy_client_fscid;
1679 } else {
1680 dout(1) << "Client subscribed for legacy filesystem but "
1681 "none is configured" << dendl;
1682 return;
1683 }
1684 }
1685 }
1686 dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;
1687
1688 // Work out the effective latest epoch
1689 MDSMap *mds_map = nullptr;
1690 MDSMap null_map;
1691 null_map.compat = fsmap.compat;
1692 if (fscid == FS_CLUSTER_ID_NONE) {
1693 // For a client, we should have already dropped out
1694 assert(is_mds);
1695
1696 if (fsmap.standby_daemons.count(mds_gid)) {
1697 // For an MDS, we need to feed it an MDSMap with its own state in
1698 null_map.mds_info[mds_gid] = fsmap.standby_daemons[mds_gid];
1699 null_map.epoch = fsmap.standby_epochs[mds_gid];
1700 } else {
1701 null_map.epoch = fsmap.epoch;
1702 }
1703 mds_map = &null_map;
1704 } else {
1705 // Check the effective epoch
1706 mds_map = &(fsmap.filesystems.at(fscid)->mds_map);
1707 }
1708
1709 assert(mds_map != nullptr);
1710 dout(10) << __func__ << " selected MDS map epoch " <<
1711 mds_map->epoch << " for namespace " << fscid << " for subscriber "
1712 << sub->session->inst.name << " who wants epoch " << sub->next << dendl;
1713
1714 if (sub->next > mds_map->epoch) {
1715 return;
1716 }
1717 auto msg = new MMDSMap(mon->monmap->fsid, mds_map);
1718
1719 sub->session->con->send_message(msg);
1720 if (sub->onetime) {
1721 mon->session_map.remove_sub(sub);
1722 } else {
1723 sub->next = mds_map->get_epoch() + 1;
1724 }
1725 }
1726}
1727
1728
1729void MDSMonitor::update_metadata(mds_gid_t gid,
1730 const map<string, string>& metadata)
1731{
1732 if (metadata.empty()) {
1733 return;
1734 }
1735 pending_metadata[gid] = metadata;
1736
1737 MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
1738 bufferlist bl;
1739 ::encode(pending_metadata, bl);
1740 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1741 paxos->trigger_propose();
1742}
1743
1744void MDSMonitor::remove_from_metadata(MonitorDBStore::TransactionRef t)
1745{
1746 bool update = false;
1747 for (map<mds_gid_t, Metadata>::iterator i = pending_metadata.begin();
1748 i != pending_metadata.end(); ) {
1749 if (!pending_fsmap.gid_exists(i->first)) {
1750 pending_metadata.erase(i++);
1751 update = true;
1752 } else {
1753 ++i;
1754 }
1755 }
1756 if (!update)
1757 return;
1758 bufferlist bl;
1759 ::encode(pending_metadata, bl);
1760 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1761}
1762
1763int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1764{
1765 bufferlist bl;
1766 int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1767 if (r) {
1768 dout(1) << "Unable to load 'last_metadata'" << dendl;
1769 return r;
1770 }
1771
1772 bufferlist::iterator it = bl.begin();
1773 ::decode(m, it);
1774 return 0;
1775}
1776
1777int MDSMonitor::dump_metadata(const std::string &who, Formatter *f, ostream& err)
1778{
1779 assert(f);
1780
1781 mds_gid_t gid = gid_from_arg(who, err);
1782 if (gid == MDS_GID_NONE) {
1783 return -EINVAL;
1784 }
1785
1786 map<mds_gid_t, Metadata> metadata;
1787 if (int r = load_metadata(metadata)) {
1788 err << "Unable to load 'last_metadata'";
1789 return r;
1790 }
1791
1792 if (!metadata.count(gid)) {
1793 return -ENOENT;
1794 }
1795 const Metadata& m = metadata[gid];
1796 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1797 f->dump_string(p->first.c_str(), p->second);
1798 }
1799 return 0;
1800}
1801
1802int MDSMonitor::print_nodes(Formatter *f)
1803{
1804 assert(f);
1805
1806 map<mds_gid_t, Metadata> metadata;
1807 if (int r = load_metadata(metadata)) {
1808 return r;
1809 }
1810
1811 map<string, list<int> > mdses; // hostname => rank
1812 for (map<mds_gid_t, Metadata>::iterator it = metadata.begin();
1813 it != metadata.end(); ++it) {
1814 const Metadata& m = it->second;
1815 Metadata::const_iterator hostname = m.find("hostname");
1816 if (hostname == m.end()) {
1817 // not likely though
1818 continue;
1819 }
1820 const mds_gid_t gid = it->first;
1821 if (!fsmap.gid_exists(gid)) {
1822 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1823 continue;
1824 }
1825 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
1826 // FIXME: include filesystem name with rank here
1827 mdses[hostname->second].push_back(mds_info.rank);
1828 }
1829
1830 dump_services(f, mdses, "mds");
1831 return 0;
1832}
1833
1834/**
1835 * If a cluster is undersized (with respect to max_mds), then
1836 * attempt to find daemons to grow it.
1837 */
1838bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
1839{
1840 bool do_propose = false;
1841
1842 if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
1843 return do_propose;
1844 }
1845
1846 while (fs->mds_map.get_num_in_mds() < size_t(fs->mds_map.get_max_mds()) &&
1847 !fs->mds_map.is_degraded()) {
1848 mds_rank_t mds = mds_rank_t(0);
1849 string name;
1850 while (fs->mds_map.is_in(mds)) {
1851 mds++;
1852 }
1853 mds_gid_t newgid = pending_fsmap.find_replacement_for({fs->fscid, mds},
1854 name, g_conf->mon_force_standby_active);
1855 if (newgid == MDS_GID_NONE) {
1856 break;
1857 }
1858
1859 dout(1) << "adding standby " << pending_fsmap.get_info_gid(newgid).addr
1860 << " as mds." << mds << dendl;
1861 pending_fsmap.promote(newgid, fs, mds);
1862 do_propose = true;
1863 }
1864
1865 return do_propose;
1866}
1867
1868
1869/**
1870 * If a daemon is laggy, and a suitable replacement
1871 * is available, fail this daemon (remove from map) and pass its
1872 * role to another daemon.
1873 */
1874void MDSMonitor::maybe_replace_gid(mds_gid_t gid,
1875 const beacon_info_t &beacon,
1876 bool *mds_propose, bool *osd_propose)
1877{
1878 assert(mds_propose != nullptr);
1879 assert(osd_propose != nullptr);
1880
1881 const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
1882 const auto fscid = pending_fsmap.mds_roles.at(gid);
1883
1884 dout(10) << "no beacon from " << gid << " " << info.addr << " mds."
1885 << info.rank << "." << info.inc
1886 << " " << ceph_mds_state_name(info.state)
1887 << " since " << beacon.stamp << dendl;
1888
1889 // are we in?
1890 // and is there a non-laggy standby that can take over for us?
1891 mds_gid_t sgid;
1892 if (info.rank >= 0 &&
1893 info.state != MDSMap::STATE_STANDBY &&
1894 info.state != MDSMap::STATE_STANDBY_REPLAY &&
1895 !pending_fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
1896 (sgid = pending_fsmap.find_replacement_for({fscid, info.rank}, info.name,
1897 g_conf->mon_force_standby_active)) != MDS_GID_NONE)
1898 {
1899
1900 MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
1901 dout(10) << " replacing " << gid << " " << info.addr << " mds."
1902 << info.rank << "." << info.inc
1903 << " " << ceph_mds_state_name(info.state)
1904 << " with " << sgid << "/" << si.name << " " << si.addr << dendl;
1905
1906 // Remember what NS the old one was in
1907 const fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
1908
1909 // Remove the old one
1910 *osd_propose |= fail_mds_gid(gid);
1911
1912 // Promote the replacement
1913 auto fs = pending_fsmap.filesystems.at(fscid);
1914 pending_fsmap.promote(sgid, fs, info.rank);
1915
1916 *mds_propose = true;
1917 } else if (info.state == MDSMap::STATE_STANDBY_REPLAY ||
1918 info.state == MDSMap::STATE_STANDBY) {
1919 dout(10) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
1920 << "." << info.inc << " " << ceph_mds_state_name(info.state)
1921 << dendl;
1922 fail_mds_gid(gid);
1923 *mds_propose = true;
1924 } else if (!info.laggy()) {
1925 dout(10) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
1926 << " " << ceph_mds_state_name(info.state)
1927 << " laggy" << dendl;
1928 pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
1929 info->laggy_since = ceph_clock_now();
1930 });
1931 *mds_propose = true;
1932 }
1933}
1934
1935bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
1936{
1937 assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
1938
1939 bool do_propose = false;
1940
1941 // have a standby take over?
1942 set<mds_rank_t> failed;
1943 fs->mds_map.get_failed_mds_set(failed);
1944 if (!failed.empty()) {
1945 set<mds_rank_t>::iterator p = failed.begin();
1946 while (p != failed.end()) {
1947 mds_rank_t f = *p++;
1948 mds_gid_t sgid = pending_fsmap.find_replacement_for({fs->fscid, f}, {},
1949 g_conf->mon_force_standby_active);
1950 if (sgid) {
1951 const MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
1952 dout(0) << " taking over failed mds." << f << " with " << sgid
1953 << "/" << si.name << " " << si.addr << dendl;
1954 pending_fsmap.promote(sgid, fs, f);
1955 do_propose = true;
1956 }
1957 }
1958 } else {
1959 // There were no failures to replace, so try using any available standbys
1960 // as standby-replay daemons.
1961
1962 // Take a copy of the standby GIDs so that we can iterate over
1963 // them while perhaps-modifying standby_daemons during the loop
1964 // (if we promote anyone they are removed from standby_daemons)
1965 std::vector<mds_gid_t> standby_gids;
1966 for (const auto &j : pending_fsmap.standby_daemons) {
1967 standby_gids.push_back(j.first);
1968 }
1969
1970 for (const auto &gid : standby_gids) {
1971 const auto &info = pending_fsmap.standby_daemons.at(gid);
1972 assert(info.state == MDSMap::STATE_STANDBY);
1973
1974 if (!info.standby_replay) {
1975 continue;
1976 }
1977
1978 /*
1979 * This mds is standby but has no rank assigned.
1980 * See if we can find it somebody to shadow
1981 */
1982 dout(20) << "gid " << gid << " is standby and following nobody" << dendl;
1983
1984 // standby for someone specific?
1985 if (info.standby_for_rank >= 0) {
1986 // The mds_info_t may or may not tell us exactly which filesystem
1987 // the standby_for_rank refers to: lookup via legacy_client_fscid
1988 mds_role_t target_role = {
1989 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
1990 pending_fsmap.legacy_client_fscid : info.standby_for_fscid,
1991 info.standby_for_rank};
1992
1993 // It is possible that the map contains a standby_for_fscid
1994 // that doesn't correspond to an existing filesystem, especially
1995 // if we loaded from a version with a bug (#17466)
1996 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
1997 && !pending_fsmap.filesystem_exists(info.standby_for_fscid)) {
1998 derr << "gid " << gid << " has invalid standby_for_fscid "
1999 << info.standby_for_fscid << dendl;
2000 continue;
2001 }
2002
2003 // If we managed to resolve a full target role
2004 if (target_role.fscid != FS_CLUSTER_ID_NONE) {
2005 auto fs = pending_fsmap.get_filesystem(target_role.fscid);
2006 if (fs->mds_map.is_followable(target_role.rank)) {
2007 do_propose |= try_standby_replay(
2008 info,
2009 *fs,
2010 fs->mds_map.get_info(target_role.rank));
2011 }
2012 }
2013
2014 continue;
2015 }
2016
2017 // check everyone
2018 for (auto fs_i : pending_fsmap.filesystems) {
2019 const MDSMap &mds_map = fs_i.second->mds_map;
2020 for (auto mds_i : mds_map.mds_info) {
2021 MDSMap::mds_info_t &cand_info = mds_i.second;
2022 if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
2023 if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
2024 info.standby_for_rank != MDS_RANK_NONE) {
2025 continue; // we're supposed to follow someone else
2026 }
2027
2028 if (try_standby_replay(info, *(fs_i.second), cand_info)) {
2029 do_propose = true;
2030 break;
2031 }
2032 continue;
2033 }
2034 }
2035 }
2036 }
2037 }
2038
2039 return do_propose;
2040}
2041
2042void MDSMonitor::tick()
2043{
2044 // make sure mds's are still alive
2045 // ...if i am an active leader
2046 if (!is_active()) return;
2047
2048 dout(10) << fsmap << dendl;
2049
2050 bool do_propose = false;
2051
2052 if (!mon->is_leader()) return;
2053
2054 do_propose |= pending_fsmap.check_health();
2055
2056 // expand mds cluster (add new nodes to @in)?
2057 for (auto i : pending_fsmap.filesystems) {
2058 do_propose |= maybe_expand_cluster(i.second);
2059 }
2060
2061 const auto now = ceph_clock_now();
2062 if (last_tick.is_zero()) {
2063 last_tick = now;
2064 }
2065
2066 if (now - last_tick > (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
2067 // This case handles either local slowness (calls being delayed
2068 // for whatever reason) or cluster election slowness (a long gap
2069 // between calls while an election happened)
2070 dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
2071 "(slow election?) of " << now - last_tick << " seconds" << dendl;
2072 for (auto &i : last_beacon) {
2073 i.second.stamp = now;
2074 }
2075 }
2076
2077 last_tick = now;
2078
2079 // check beacon timestamps
2080 utime_t cutoff = now;
2081 cutoff -= g_conf->mds_beacon_grace;
2082
2083 // make sure last_beacon is fully populated
2084 for (const auto &p : pending_fsmap.mds_roles) {
2085 auto &gid = p.first;
2086 if (last_beacon.count(gid) == 0) {
2087 last_beacon[gid].stamp = now;
2088 last_beacon[gid].seq = 0;
2089 }
2090 }
2091
2092 // If the OSDMap is writeable, we can blacklist things, so we can
2093 // try failing any laggy MDS daemons. Consider each one for failure.
2094 if (mon->osdmon()->is_writeable()) {
2095 bool propose_osdmap = false;
2096
2097 map<mds_gid_t, beacon_info_t>::iterator p = last_beacon.begin();
2098 while (p != last_beacon.end()) {
2099 mds_gid_t gid = p->first;
2100 auto beacon_info = p->second;
2101 ++p;
2102
2103 if (!pending_fsmap.gid_exists(gid)) {
2104 // clean it out
2105 last_beacon.erase(gid);
2106 continue;
2107 }
2108
2109 if (beacon_info.stamp < cutoff) {
2110 maybe_replace_gid(gid, beacon_info, &do_propose, &propose_osdmap);
2111 }
2112 }
2113
2114 if (propose_osdmap) {
2115 request_proposal(mon->osdmon());
2116 }
2117 }
2118
2119 for (auto i : pending_fsmap.filesystems) {
2120 auto fs = i.second;
2121 if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
2122 do_propose |= maybe_promote_standby(fs);
2123 }
2124 }
2125
2126 if (do_propose) {
2127 propose_pending();
2128 }
2129}
2130
2131/**
2132 * finfo: the would-be follower
2133 * leader_fs: the Filesystem containing the would-be leader
2134 * ainfo: the would-be leader
2135 */
2136bool MDSMonitor::try_standby_replay(
2137 const MDSMap::mds_info_t& finfo,
2138 const Filesystem &leader_fs,
2139 const MDSMap::mds_info_t& ainfo)
2140{
2141 // someone else already following?
2142 if (leader_fs.has_standby_replay(ainfo.global_id)) {
2143 dout(20) << " mds." << ainfo.rank << " already has a follower" << dendl;
2144 return false;
2145 } else {
2146 // Assign the new role to the standby
2147 dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
2148 pending_fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
2149 return true;
2150 }
2151}
2152
2153MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
2154 : PaxosService(mn, p, service_name)
2155{
2156 handlers = FileSystemCommandHandler::load();
2157}
2158
2159void MDSMonitor::on_restart()
2160{
2161 // Clear out the leader-specific state.
2162 last_tick = utime_t();
2163 last_beacon.clear();
2164}
2165