]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MDSMonitor.cc
608e1aeedc3e8d43c19e5c29af4264d1f955b6b8
[ceph.git] / ceph / src / mon / MDSMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include <sstream>
16 #include <boost/utility.hpp>
17
18 #include "MDSMonitor.h"
19 #include "FSCommands.h"
20 #include "Monitor.h"
21 #include "MonitorDBStore.h"
22 #include "OSDMonitor.h"
23 #include "PGMonitor.h"
24
25 #include "common/strtol.h"
26 #include "common/perf_counters.h"
27 #include "common/config.h"
28 #include "common/cmdparse.h"
29 #include "messages/MMDSMap.h"
30 #include "messages/MFSMap.h"
31 #include "messages/MFSMapUser.h"
32 #include "messages/MMDSLoadTargets.h"
33 #include "messages/MMonCommand.h"
34 #include "messages/MGenericMessage.h"
35
36 #include "include/assert.h"
37 #include "include/str_list.h"
38 #include "include/stringify.h"
39 #include "mds/mdstypes.h"
40 #include "Session.h"
41
42 #define dout_subsys ceph_subsys_mon
43 #undef dout_prefix
44 #define dout_prefix _prefix(_dout, mon, fsmap)
45 static ostream& _prefix(std::ostream *_dout, Monitor *mon, FSMap const& fsmap) {
46 return *_dout << "mon." << mon->name << "@" << mon->rank
47 << "(" << mon->get_state_name()
48 << ").mds e" << fsmap.get_epoch() << " ";
49 }
50
51 /*
52 * Specialized implementation of cmd_getval to allow us to parse
53 * out strongly-typedef'd types
54 */
55 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
56 const std::string& k, mds_gid_t &val)
57 {
58 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
59 }
60
61 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
62 const std::string& k, mds_rank_t &val)
63 {
64 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
65 }
66
67 template<> bool cmd_getval(CephContext *cct, const cmdmap_t& cmdmap,
68 const std::string& k, MDSMap::DaemonState &val)
69 {
70 return cmd_getval(cct, cmdmap, k, (int64_t&)val);
71 }
72
// String constant "mds_metadata".
// NOTE(review): presumably the store prefix under which per-daemon metadata
// is kept; its users are outside this view -- confirm against them.
static const string MDS_METADATA_PREFIX("mds_metadata");
74
75
76 // my methods
77
// Log FSMap `m` at debug level `dbl`.
// The map is printed straight into `_dout`, the stream made available by the
// preceding dout() statement, and the entry is only terminated by the final
// dendl.  The three statements therefore have to stay together in this order.
void MDSMonitor::print_map(FSMap &m, int dbl)
{
  dout(dbl) << "print_map\n";
  m.print(*_dout);
  *_dout << dendl;
}
84
85 // service methods
86 void MDSMonitor::create_initial()
87 {
88 dout(10) << "create_initial" << dendl;
89 }
90
91
92 void MDSMonitor::update_from_paxos(bool *need_bootstrap)
93 {
94 version_t version = get_last_committed();
95 if (version == fsmap.epoch)
96 return;
97
98 dout(10) << __func__ << " version " << version
99 << ", my e " << fsmap.epoch << dendl;
100 assert(version > fsmap.epoch);
101
102 // read and decode
103 bufferlist fsmap_bl;
104 fsmap_bl.clear();
105 int err = get_version(version, fsmap_bl);
106 assert(err == 0);
107
108 assert(fsmap_bl.length() > 0);
109 dout(10) << __func__ << " got " << version << dendl;
110 fsmap.decode(fsmap_bl);
111
112 // new map
113 dout(4) << "new map" << dendl;
114 print_map(fsmap, 0);
115 if (!g_conf->mon_mds_skip_sanity) {
116 fsmap.sanity();
117 }
118
119 check_subs();
120 update_logger();
121 }
122
123 void MDSMonitor::init()
124 {
125 (void)load_metadata(pending_metadata);
126 }
127
128 void MDSMonitor::create_pending()
129 {
130 pending_fsmap = fsmap;
131 pending_fsmap.epoch++;
132
133 dout(10) << "create_pending e" << pending_fsmap.epoch << dendl;
134 }
135
136 void MDSMonitor::encode_pending(MonitorDBStore::TransactionRef t)
137 {
138 dout(10) << "encode_pending e" << pending_fsmap.epoch << dendl;
139
140
141 // print map iff 'debug mon = 30' or higher
142 print_map(pending_fsmap, 30);
143 if (!g_conf->mon_mds_skip_sanity) {
144 pending_fsmap.sanity();
145 }
146
147 // Set 'modified' on maps modified this epoch
148 for (auto &i : fsmap.filesystems) {
149 if (i.second->mds_map.epoch == fsmap.epoch) {
150 i.second->mds_map.modified = ceph_clock_now();
151 }
152 }
153
154 // apply to paxos
155 assert(get_last_committed() + 1 == pending_fsmap.epoch);
156 bufferlist fsmap_bl;
157 pending_fsmap.encode(fsmap_bl, mon->get_quorum_con_features());
158
159 /* put everything in the transaction */
160 put_version(t, pending_fsmap.epoch, fsmap_bl);
161 put_last_committed(t, pending_fsmap.epoch);
162
163 // Encode MDSHealth data
164 for (std::map<uint64_t, MDSHealth>::iterator i = pending_daemon_health.begin();
165 i != pending_daemon_health.end(); ++i) {
166 bufferlist bl;
167 i->second.encode(bl);
168 t->put(MDS_HEALTH_PREFIX, stringify(i->first), bl);
169 }
170
171 for (std::set<uint64_t>::iterator i = pending_daemon_health_rm.begin();
172 i != pending_daemon_health_rm.end(); ++i) {
173 t->erase(MDS_HEALTH_PREFIX, stringify(*i));
174 }
175 pending_daemon_health_rm.clear();
176 remove_from_metadata(t);
177 }
178
179 version_t MDSMonitor::get_trim_to()
180 {
181 version_t floor = 0;
182 if (g_conf->mon_mds_force_trim_to > 0 &&
183 g_conf->mon_mds_force_trim_to < (int)get_last_committed()) {
184 floor = g_conf->mon_mds_force_trim_to;
185 dout(10) << __func__ << " explicit mon_mds_force_trim_to = "
186 << floor << dendl;
187 }
188
189 unsigned max = g_conf->mon_max_mdsmap_epochs;
190 version_t last = get_last_committed();
191
192 if (last - get_first_committed() > max && floor < last - max)
193 return last - max;
194 return floor;
195 }
196
197 void MDSMonitor::update_logger()
198 {
199 dout(10) << "update_logger" << dendl;
200
201 uint64_t up = 0;
202 uint64_t in = 0;
203 uint64_t failed = 0;
204 for (const auto &i : fsmap.filesystems) {
205 const MDSMap &mds_map = i.second->mds_map;
206
207 up += mds_map.get_num_up_mds();
208 in += mds_map.get_num_in_mds();
209 failed += mds_map.get_num_failed_mds();
210 }
211 mon->cluster_logger->set(l_cluster_num_mds_up, up);
212 mon->cluster_logger->set(l_cluster_num_mds_in, in);
213 mon->cluster_logger->set(l_cluster_num_mds_failed, failed);
214 mon->cluster_logger->set(l_cluster_mds_epoch, fsmap.get_epoch());
215 }
216
217 bool MDSMonitor::preprocess_query(MonOpRequestRef op)
218 {
219 op->mark_mdsmon_event(__func__);
220 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
221 dout(10) << "preprocess_query " << *m << " from " << m->get_orig_source_inst() << dendl;
222
223 switch (m->get_type()) {
224
225 case MSG_MDS_BEACON:
226 return preprocess_beacon(op);
227
228 case MSG_MON_COMMAND:
229 return preprocess_command(op);
230
231 case MSG_MDS_OFFLOAD_TARGETS:
232 return preprocess_offload_targets(op);
233
234 default:
235 ceph_abort();
236 return true;
237 }
238 }
239
240 void MDSMonitor::_note_beacon(MMDSBeacon *m)
241 {
242 mds_gid_t gid = mds_gid_t(m->get_global_id());
243 version_t seq = m->get_seq();
244
245 dout(15) << "_note_beacon " << *m << " noting time" << dendl;
246 last_beacon[gid].stamp = ceph_clock_now();
247 last_beacon[gid].seq = seq;
248 }
249
// Read-only first pass over an MDS beacon.
//
// Returns true when the beacon has been fully handled here (either answered
// or explicitly dropped) and false when it must go on to the update path
// (prepare_beacon), which also covers the "not the leader" case.
//
// Control flow uses two shared exits:
//   reply:  note the beacon time and answer with an MMDSBeacon that carries
//           `effective_epoch`;
//   ignore: tell the monitor no reply will be sent for this op.
// Every `goto reply` sits after `effective_epoch` has been assigned, which
// is what the assert at the `reply` label relies on.
bool MDSMonitor::preprocess_beacon(MonOpRequestRef op)
{
  op->mark_mdsmon_event(__func__);
  MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
  MDSMap::DaemonState state = m->get_state();
  mds_gid_t gid = m->get_global_id();
  version_t seq = m->get_seq();
  // Declared up front (not at first use) because the gotos below must not
  // jump over an initialisation.
  MDSMap::mds_info_t info;
  epoch_t effective_epoch = 0;

  // check privileges, ignore if fails
  MonSession *session = m->get_session();
  assert(session);
  if (!session->is_capable("mds", MON_CAP_X)) {
    dout(0) << "preprocess_beacon got MMDSBeacon from entity with insufficient privileges "
            << session->caps << dendl;
    goto ignore;
  }

  // Beacon addressed to a different cluster fsid: drop it.
  if (m->get_fsid() != mon->monmap->fsid) {
    dout(0) << "preprocess_beacon on fsid " << m->get_fsid() << " != " << mon->monmap->fsid << dendl;
    goto ignore;
  }

  dout(12) << "preprocess_beacon " << *m
           << " from " << m->get_orig_source_inst()
           << " " << m->get_compat()
           << dendl;

  // make sure the address has a port
  if (m->get_orig_source_addr().get_port() == 0) {
    dout(1) << " ignoring boot message without a port" << dendl;
    goto ignore;
  }

  // check compat
  if (!m->get_compat().writeable(fsmap.compat)) {
    dout(1) << " mds " << m->get_source_inst() << " can't write to fsmap " << fsmap.compat << dendl;
    goto ignore;
  }

  // fw to leader?
  if (!mon->is_leader())
    return false;

  // booted, but not in map?
  if (!pending_fsmap.gid_exists(gid)) {
    if (state != MDSMap::STATE_BOOT) {
      dout(7) << "mds_beacon " << *m << " is not in fsmap (state "
              << ceph_mds_state_name(state) << ")" << dendl;

      // Answer with an otherwise empty MDSMap carrying only the current
      // fsmap epoch and compat set.
      MDSMap null_map;
      null_map.epoch = fsmap.epoch;
      null_map.compat = fsmap.compat;
      mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
      return true;
    } else {
      return false;  // not booted yet.
    }
  }
  dout(10) << __func__ << ": GID exists in map: " << gid << dendl;
  info = pending_fsmap.get_info_gid(gid);

  // old seq?
  if (info.state_seq > seq) {
    dout(7) << "mds_beacon " << *m << " has old seq, ignoring" << dendl;
    goto ignore;
  }

  // Work out the latest epoch that this daemon should have seen
  {
    // Daemons without a filesystem use their standby epoch, the others use
    // the epoch of their filesystem's MDSMap.
    fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);
    if (fscid == FS_CLUSTER_ID_NONE) {
      effective_epoch = pending_fsmap.standby_epochs.at(gid);
    } else {
      effective_epoch = pending_fsmap.get_filesystem(fscid)->mds_map.epoch;
    }
    if (effective_epoch != m->get_last_epoch_seen()) {
      dout(10) << "mds_beacon " << *m
               << " ignoring requested state, because mds hasn't seen latest map" << dendl;
      goto reply;
    }
  }

  if (info.laggy()) {
    _note_beacon(m);
    return false;  // no longer laggy, need to update map.
  }
  if (state == MDSMap::STATE_BOOT) {
    // ignore, already booted.
    goto ignore;
  }
  // is there a state change here?
  if (info.state != state) {
    // legal state change?
    if ((info.state == MDSMap::STATE_STANDBY ||
         info.state == MDSMap::STATE_STANDBY_REPLAY) && state > 0) {
      dout(10) << "mds_beacon mds can't activate itself (" << ceph_mds_state_name(info.state)
               << " -> " << ceph_mds_state_name(state) << ")" << dendl;
      goto reply;
    }

    if ((state == MDSMap::STATE_STANDBY || state == MDSMap::STATE_STANDBY_REPLAY)
        && info.rank != MDS_RANK_NONE)
    {
      dout(4) << "mds_beacon MDS can't go back into standby after taking rank: "
                 "held rank " << info.rank << " while requesting state "
              << ceph_mds_state_name(state) << dendl;
      goto reply;
    }

    // A state change that passed the checks above needs the update path.
    _note_beacon(m);
    return false;
  }

  // Comparing known daemon health with m->get_health()
  // and return false (i.e. require proposal) if they
  // do not match, to update our stored
  // Note: operator[] on the map default-inserts an entry for `gid` when
  // none exists yet.
  if (!(pending_daemon_health[gid] == m->get_health())) {
    dout(20) << __func__ << " health metrics for gid " << gid << " were updated" << dendl;
    _note_beacon(m);
    return false;
  }

 reply:
  // note time and reply
  assert(effective_epoch > 0);
  _note_beacon(m);
  mon->send_reply(op,
                  new MMDSBeacon(mon->monmap->fsid, m->get_global_id(), m->get_name(),
                                 effective_epoch, state, seq,
                                 CEPH_FEATURES_SUPPORTED_DEFAULT));
  return true;

 ignore:
  // I won't reply this beacon, drop it.
  mon->no_reply(op);
  return true;
}
389
390 bool MDSMonitor::preprocess_offload_targets(MonOpRequestRef op)
391 {
392 op->mark_mdsmon_event(__func__);
393 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
394 dout(10) << "preprocess_offload_targets " << *m << " from " << m->get_orig_source() << dendl;
395
396 // check privileges, ignore message if fails
397 MonSession *session = m->get_session();
398 if (!session)
399 goto done;
400 if (!session->is_capable("mds", MON_CAP_X)) {
401 dout(0) << "preprocess_offload_targets got MMDSLoadTargets from entity with insufficient caps "
402 << session->caps << dendl;
403 goto done;
404 }
405
406 if (fsmap.gid_exists(m->global_id) &&
407 m->targets == fsmap.get_info_gid(m->global_id).export_targets)
408 goto done;
409
410 return false;
411
412 done:
413 return true;
414 }
415
416
417 bool MDSMonitor::prepare_update(MonOpRequestRef op)
418 {
419 op->mark_mdsmon_event(__func__);
420 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
421 dout(7) << "prepare_update " << *m << dendl;
422
423 switch (m->get_type()) {
424
425 case MSG_MDS_BEACON:
426 return prepare_beacon(op);
427
428 case MSG_MON_COMMAND:
429 return prepare_command(op);
430
431 case MSG_MDS_OFFLOAD_TARGETS:
432 return prepare_offload_targets(op);
433
434 default:
435 ceph_abort();
436 }
437
438 return true;
439 }
440
441 bool MDSMonitor::prepare_beacon(MonOpRequestRef op)
442 {
443 op->mark_mdsmon_event(__func__);
444 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
445 // -- this is an update --
446 dout(12) << "prepare_beacon " << *m << " from " << m->get_orig_source_inst() << dendl;
447 entity_addr_t addr = m->get_orig_source_inst().addr;
448 mds_gid_t gid = m->get_global_id();
449 MDSMap::DaemonState state = m->get_state();
450 version_t seq = m->get_seq();
451
452 dout(20) << __func__ << " got health from gid " << gid << " with " << m->get_health().metrics.size() << " metrics." << dendl;
453
454 // Calculate deltas of health metrics created and removed
455 // Do this by type rather than MDSHealthMetric equality, because messages can
456 // change a lot when they include e.g. a number of items.
457 const auto &old_health = pending_daemon_health[gid].metrics;
458 const auto &new_health = m->get_health().metrics;
459
460 std::set<mds_metric_t> old_types;
461 for (const auto &i : old_health) {
462 old_types.insert(i.type);
463 }
464
465 std::set<mds_metric_t> new_types;
466 for (const auto &i : new_health) {
467 new_types.insert(i.type);
468 }
469
470 for (const auto &new_metric: new_health) {
471 if (old_types.count(new_metric.type) == 0) {
472 std::stringstream msg;
473 msg << "MDS health message (" << m->get_orig_source_inst().name << "): "
474 << new_metric.message;
475 if (new_metric.sev == HEALTH_ERR) {
476 mon->clog->error() << msg.str();
477 } else if (new_metric.sev == HEALTH_WARN) {
478 mon->clog->warn() << msg.str();
479 } else {
480 mon->clog->info() << msg.str();
481 }
482 }
483 }
484
485 // Log the disappearance of health messages at INFO
486 for (const auto &old_metric : old_health) {
487 if (new_types.count(old_metric.type) == 0) {
488 mon->clog->info() << "MDS health message cleared ("
489 << m->get_orig_source_inst().name << "): " << old_metric.message;
490 }
491 }
492
493 // Store health
494 pending_daemon_health[gid] = m->get_health();
495
496 // boot?
497 if (state == MDSMap::STATE_BOOT) {
498 // zap previous instance of this name?
499 if (g_conf->mds_enforce_unique_name) {
500 bool failed_mds = false;
501 while (mds_gid_t existing = pending_fsmap.find_mds_gid_by_name(m->get_name())) {
502 if (!mon->osdmon()->is_writeable()) {
503 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
504 return false;
505 }
506 mon->clog->info() << "MDS daemon '" << m->get_name() << "' restarted";
507 fail_mds_gid(existing);
508 failed_mds = true;
509 }
510 if (failed_mds) {
511 assert(mon->osdmon()->is_writeable());
512 request_proposal(mon->osdmon());
513 }
514 }
515
516 // Add this daemon to the map
517 if (pending_fsmap.mds_roles.count(gid) == 0) {
518 MDSMap::mds_info_t new_info;
519 new_info.global_id = gid;
520 new_info.name = m->get_name();
521 new_info.addr = addr;
522 new_info.mds_features = m->get_mds_features();
523 new_info.state = MDSMap::STATE_STANDBY;
524 new_info.state_seq = seq;
525 new_info.standby_for_rank = m->get_standby_for_rank();
526 new_info.standby_for_name = m->get_standby_for_name();
527 new_info.standby_for_fscid = m->get_standby_for_fscid();
528 new_info.standby_replay = m->get_standby_replay();
529 pending_fsmap.insert(new_info);
530 }
531
532 // Resolve standby_for_name to a rank
533 const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
534 if (!info.standby_for_name.empty()) {
535 const MDSMap::mds_info_t *leaderinfo = fsmap.find_by_name(
536 info.standby_for_name);
537 if (leaderinfo && (leaderinfo->rank >= 0)) {
538 auto fscid = pending_fsmap.mds_roles.at(leaderinfo->global_id);
539 auto fs = pending_fsmap.get_filesystem(fscid);
540 bool followable = fs->mds_map.is_followable(leaderinfo->rank);
541
542 pending_fsmap.modify_daemon(gid, [fscid, leaderinfo, followable](
543 MDSMap::mds_info_t *info) {
544 info->standby_for_rank = leaderinfo->rank;
545 info->standby_for_fscid = fscid;
546 });
547 }
548 }
549
550 // initialize the beacon timer
551 last_beacon[gid].stamp = ceph_clock_now();
552 last_beacon[gid].seq = seq;
553
554 // new incompat?
555 if (!pending_fsmap.compat.writeable(m->get_compat())) {
556 dout(10) << " fsmap " << pending_fsmap.compat
557 << " can't write to new mds' " << m->get_compat()
558 << ", updating fsmap and killing old mds's"
559 << dendl;
560 pending_fsmap.update_compat(m->get_compat());
561 }
562
563 update_metadata(m->get_global_id(), m->get_sys_info());
564 } else {
565 // state update
566 const MDSMap::mds_info_t &info = pending_fsmap.get_info_gid(gid);
567 // Old MDS daemons don't mention that they're standby replay until
568 // after they've sent their boot beacon, so update this field.
569 if (info.standby_replay != m->get_standby_replay()) {
570 pending_fsmap.modify_daemon(info.global_id, [&m](
571 MDSMap::mds_info_t *i)
572 {
573 i->standby_replay = m->get_standby_replay();
574 });
575 }
576
577 if (info.state == MDSMap::STATE_STOPPING && state != MDSMap::STATE_STOPPED ) {
578 // we can't transition to any other states from STOPPING
579 dout(0) << "got beacon for MDS in STATE_STOPPING, ignoring requested state change"
580 << dendl;
581 _note_beacon(m);
582 return true;
583 }
584
585 if (info.laggy()) {
586 dout(10) << "prepare_beacon clearing laggy flag on " << addr << dendl;
587 pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info)
588 {
589 info->clear_laggy();
590 }
591 );
592 }
593
594 dout(10) << "prepare_beacon mds." << info.rank
595 << " " << ceph_mds_state_name(info.state)
596 << " -> " << ceph_mds_state_name(state)
597 << " standby_for_rank=" << m->get_standby_for_rank()
598 << dendl;
599 if (state == MDSMap::STATE_STOPPED) {
600 auto erased = pending_fsmap.stop(gid);
601 erased.push_back(gid);
602
603 for (const auto &erased_gid : erased) {
604 last_beacon.erase(erased_gid);
605 if (pending_daemon_health.count(erased_gid)) {
606 pending_daemon_health.erase(erased_gid);
607 pending_daemon_health_rm.insert(erased_gid);
608 }
609 }
610 } else if (state == MDSMap::STATE_DAMAGED) {
611 if (!mon->osdmon()->is_writeable()) {
612 dout(4) << __func__ << ": DAMAGED from rank " << info.rank
613 << " waiting for osdmon writeable to blacklist it" << dendl;
614 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
615 return false;
616 }
617
618 // Record this MDS rank as damaged, so that other daemons
619 // won't try to run it.
620 dout(4) << __func__ << ": marking rank "
621 << info.rank << " damaged" << dendl;
622
623 utime_t until = ceph_clock_now();
624 until += g_conf->mds_blacklist_interval;
625 const auto blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
626 request_proposal(mon->osdmon());
627 pending_fsmap.damaged(gid, blacklist_epoch);
628 last_beacon.erase(gid);
629
630 // Respond to MDS, so that it knows it can continue to shut down
631 mon->send_reply(op,
632 new MMDSBeacon(
633 mon->monmap->fsid, m->get_global_id(),
634 m->get_name(), fsmap.get_epoch(), state, seq,
635 CEPH_FEATURES_SUPPORTED_DEFAULT));
636 } else if (state == MDSMap::STATE_DNE) {
637 if (!mon->osdmon()->is_writeable()) {
638 dout(4) << __func__ << ": DNE from rank " << info.rank
639 << " waiting for osdmon writeable to blacklist it" << dendl;
640 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
641 return false;
642 }
643
644 fail_mds_gid(gid);
645 assert(mon->osdmon()->is_writeable());
646 request_proposal(mon->osdmon());
647
648 // Respond to MDS, so that it knows it can continue to shut down
649 mon->send_reply(op,
650 new MMDSBeacon(
651 mon->monmap->fsid, m->get_global_id(),
652 m->get_name(), fsmap.get_epoch(), state, seq,
653 CEPH_FEATURES_SUPPORTED_DEFAULT));
654 } else if (info.state == MDSMap::STATE_STANDBY && state != info.state) {
655 // Standby daemons should never modify their own
656 // state. Reject any attempts to do so.
657 derr << "standby " << gid << " attempted to change state to "
658 << ceph_mds_state_name(state) << ", rejecting" << dendl;
659 return true;
660 } else if (info.state != MDSMap::STATE_STANDBY && state != info.state &&
661 !MDSMap::state_transition_valid(info.state, state)) {
662 // Validate state transitions for daemons that hold a rank
663 derr << "daemon " << gid << " (rank " << info.rank << ") "
664 << "reported invalid state transition "
665 << ceph_mds_state_name(info.state) << " -> "
666 << ceph_mds_state_name(state) << dendl;
667 return true;
668 } else {
669 // Made it through special cases and validations, record the
670 // daemon's reported state to the FSMap.
671 pending_fsmap.modify_daemon(gid, [state, seq](MDSMap::mds_info_t *info) {
672 info->state = state;
673 info->state_seq = seq;
674 });
675 }
676 }
677
678 dout(7) << "prepare_beacon pending map now:" << dendl;
679 print_map(pending_fsmap);
680
681 wait_for_finished_proposal(op, new FunctionContext([op, this](int r){
682 if (r >= 0)
683 _updated(op); // success
684 else if (r == -ECANCELED) {
685 mon->no_reply(op);
686 } else {
687 dispatch(op); // try again
688 }
689 }));
690
691 return true;
692 }
693
694 bool MDSMonitor::prepare_offload_targets(MonOpRequestRef op)
695 {
696 op->mark_mdsmon_event(__func__);
697 MMDSLoadTargets *m = static_cast<MMDSLoadTargets*>(op->get_req());
698 mds_gid_t gid = m->global_id;
699 if (pending_fsmap.gid_has_rank(gid)) {
700 dout(10) << "prepare_offload_targets " << gid << " " << m->targets << dendl;
701 pending_fsmap.update_export_targets(gid, m->targets);
702 } else {
703 dout(10) << "prepare_offload_targets " << gid << " not in map" << dendl;
704 }
705 return true;
706 }
707
708 bool MDSMonitor::should_propose(double& delay)
709 {
710 // delegate to PaxosService to assess whether we should propose
711 return PaxosService::should_propose(delay);
712 }
713
714 void MDSMonitor::_updated(MonOpRequestRef op)
715 {
716 op->mark_mdsmon_event(__func__);
717 MMDSBeacon *m = static_cast<MMDSBeacon*>(op->get_req());
718 dout(10) << "_updated " << m->get_orig_source() << " " << *m << dendl;
719 mon->clog->info() << m->get_orig_source_inst() << " "
720 << ceph_mds_state_name(m->get_state());
721
722 if (m->get_state() == MDSMap::STATE_STOPPED) {
723 // send the map manually (they're out of the map, so they won't get it automatic)
724 MDSMap null_map;
725 null_map.epoch = fsmap.epoch;
726 null_map.compat = fsmap.compat;
727 mon->send_reply(op, new MMDSMap(mon->monmap->fsid, &null_map));
728 } else {
729 mon->send_reply(op, new MMDSBeacon(mon->monmap->fsid,
730 m->get_global_id(),
731 m->get_name(),
732 fsmap.get_epoch(),
733 m->get_state(),
734 m->get_seq(),
735 CEPH_FEATURES_SUPPORTED_DEFAULT));
736 }
737 }
738
739 void MDSMonitor::on_active()
740 {
741 tick();
742 update_logger();
743
744 if (mon->is_leader())
745 mon->clog->info() << "fsmap " << fsmap;
746 }
747
748 void MDSMonitor::get_health(list<pair<health_status_t, string> >& summary,
749 list<pair<health_status_t, string> > *detail,
750 CephContext* cct) const
751 {
752 fsmap.get_health(summary, detail);
753
754 // For each MDS GID...
755 const auto info_map = fsmap.get_mds_info();
756 for (const auto &i : info_map) {
757 const auto &gid = i.first;
758 const auto &info = i.second;
759
760 // Decode MDSHealth
761 bufferlist bl;
762 mon->store->get(MDS_HEALTH_PREFIX, stringify(gid), bl);
763 if (!bl.length()) {
764 derr << "Missing health data for MDS " << gid << dendl;
765 continue;
766 }
767 MDSHealth health;
768 bufferlist::iterator bl_i = bl.begin();
769 health.decode(bl_i);
770
771 for (const auto &metric : health.metrics) {
772 int const rank = info.rank;
773 std::ostringstream message;
774 message << "mds" << rank << ": " << metric.message;
775 summary.push_back(std::make_pair(metric.sev, message.str()));
776
777 if (detail) {
778 // There is no way for us to clealy associate detail entries with summary entries (#7192), so
779 // we duplicate the summary message in the detail string and tag the metadata on.
780 std::ostringstream detail_message;
781 detail_message << message.str();
782 if (metric.metadata.size()) {
783 detail_message << "(";
784 auto k = metric.metadata.begin();
785 while (k != metric.metadata.end()) {
786 detail_message << k->first << ": " << k->second;
787 if (boost::next(k) != metric.metadata.end()) {
788 detail_message << ", ";
789 }
790 ++k;
791 }
792 detail_message << ")";
793 }
794 detail->push_back(std::make_pair(metric.sev, detail_message.str()));
795 }
796 }
797 }
798 }
799
800 void MDSMonitor::dump_info(Formatter *f)
801 {
802 f->open_object_section("fsmap");
803 fsmap.dump(f);
804 f->close_section();
805
806 f->dump_unsigned("mdsmap_first_committed", get_first_committed());
807 f->dump_unsigned("mdsmap_last_committed", get_last_committed());
808 }
809
810 bool MDSMonitor::preprocess_command(MonOpRequestRef op)
811 {
812 op->mark_mdsmon_event(__func__);
813 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
814 int r = -1;
815 bufferlist rdata;
816 stringstream ss, ds;
817
818 map<string, cmd_vartype> cmdmap;
819 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
820 // ss has reason for failure
821 string rs = ss.str();
822 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
823 return true;
824 }
825
826 string prefix;
827 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
828 string format;
829 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
830 boost::scoped_ptr<Formatter> f(Formatter::create(format));
831
832 MonSession *session = m->get_session();
833 if (!session) {
834 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
835 return true;
836 }
837
838 if (prefix == "mds stat") {
839 if (f) {
840 f->open_object_section("mds_stat");
841 dump_info(f.get());
842 f->close_section();
843 f->flush(ds);
844 } else {
845 ds << fsmap;
846 }
847 r = 0;
848 } else if (prefix == "mds dump") {
849 int64_t epocharg;
850 epoch_t epoch;
851
852 FSMap *p = &fsmap;
853 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
854 epoch = epocharg;
855 bufferlist b;
856 int err = get_version(epoch, b);
857 if (err == -ENOENT) {
858 p = 0;
859 r = -ENOENT;
860 } else {
861 assert(err == 0);
862 assert(b.length());
863 p = new FSMap;
864 p->decode(b);
865 }
866 }
867 if (p) {
868 stringstream ds;
869 const MDSMap *mdsmap = nullptr;
870 MDSMap blank;
871 blank.epoch = fsmap.epoch;
872 if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
873 mdsmap = &(fsmap.filesystems[fsmap.legacy_client_fscid]->mds_map);
874 } else {
875 mdsmap = &blank;
876 }
877 if (f != NULL) {
878 f->open_object_section("mdsmap");
879 mdsmap->dump(f.get());
880 f->close_section();
881 f->flush(ds);
882 r = 0;
883 } else {
884 mdsmap->print(ds);
885 r = 0;
886 }
887 if (r == 0) {
888 rdata.append(ds);
889 ss << "dumped fsmap epoch " << p->get_epoch();
890 }
891 if (p != &fsmap) {
892 delete p;
893 }
894 }
895 } else if (prefix == "fs dump") {
896 int64_t epocharg;
897 epoch_t epoch;
898
899 FSMap *p = &fsmap;
900 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
901 epoch = epocharg;
902 bufferlist b;
903 int err = get_version(epoch, b);
904 if (err == -ENOENT) {
905 p = 0;
906 r = -ENOENT;
907 } else {
908 assert(err == 0);
909 assert(b.length());
910 p = new FSMap;
911 p->decode(b);
912 }
913 }
914 if (p) {
915 stringstream ds;
916 if (f != NULL) {
917 f->open_object_section("fsmap");
918 p->dump(f.get());
919 f->close_section();
920 f->flush(ds);
921 r = 0;
922 } else {
923 p->print(ds);
924 r = 0;
925 }
926 if (r == 0) {
927 rdata.append(ds);
928 ss << "dumped fsmap epoch " << p->get_epoch();
929 }
930 if (p != &fsmap)
931 delete p;
932 }
933 } else if (prefix == "mds metadata") {
934 if (!f)
935 f.reset(Formatter::create("json-pretty"));
936
937 string who;
938 bool all = !cmd_getval(g_ceph_context, cmdmap, "who", who);
939 dout(1) << "all = " << all << dendl;
940 if (all) {
941 r = 0;
942 // Dump all MDSs' metadata
943 const auto all_info = fsmap.get_mds_info();
944
945 f->open_array_section("mds_metadata");
946 for(const auto &i : all_info) {
947 const auto &info = i.second;
948
949 f->open_object_section("mds");
950 f->dump_string("name", info.name);
951 std::ostringstream get_err;
952 r = dump_metadata(info.name, f.get(), get_err);
953 if (r == -EINVAL || r == -ENOENT) {
954 // Drop error, list what metadata we do have
955 dout(1) << get_err.str() << dendl;
956 r = 0;
957 } else if (r != 0) {
958 derr << "Unexpected error reading metadata: " << cpp_strerror(r)
959 << dendl;
960 ss << get_err.str();
961 break;
962 }
963 f->close_section();
964 }
965 f->close_section();
966 } else {
967 // Dump a single daemon's metadata
968 f->open_object_section("mds_metadata");
969 r = dump_metadata(who, f.get(), ss);
970 f->close_section();
971 }
972 f->flush(ds);
973 } else if (prefix == "mds versions") {
974 if (!f)
975 f.reset(Formatter::create("json-pretty"));
976 count_metadata("ceph_version", f.get());
977 f->flush(ds);
978 r = 0;
979 } else if (prefix == "mds count-metadata") {
980 if (!f)
981 f.reset(Formatter::create("json-pretty"));
982 string field;
983 cmd_getval(g_ceph_context, cmdmap, "property", field);
984 count_metadata(field, f.get());
985 f->flush(ds);
986 r = 0;
987 } else if (prefix == "mds getmap") {
988 epoch_t e;
989 int64_t epocharg;
990 bufferlist b;
991 if (cmd_getval(g_ceph_context, cmdmap, "epoch", epocharg)) {
992 e = epocharg;
993 int err = get_version(e, b);
994 if (err == -ENOENT) {
995 r = -ENOENT;
996 } else {
997 assert(err == 0);
998 assert(b.length());
999 FSMap mm;
1000 mm.decode(b);
1001 mm.encode(rdata, m->get_connection()->get_features());
1002 ss << "got fsmap epoch " << mm.get_epoch();
1003 r = 0;
1004 }
1005 } else {
1006 fsmap.encode(rdata, m->get_connection()->get_features());
1007 ss << "got fsmap epoch " << fsmap.get_epoch();
1008 r = 0;
1009 }
1010 } else if (prefix == "mds compat show") {
1011 if (f) {
1012 f->open_object_section("mds_compat");
1013 fsmap.compat.dump(f.get());
1014 f->close_section();
1015 f->flush(ds);
1016 } else {
1017 ds << fsmap.compat;
1018 }
1019 r = 0;
1020 } else if (prefix == "fs get") {
1021 string fs_name;
1022 cmd_getval(g_ceph_context, cmdmap, "fs_name", fs_name);
1023 auto fs = fsmap.get_filesystem(fs_name);
1024 if (fs == nullptr) {
1025 ss << "filesystem '" << fs_name << "' not found";
1026 r = -ENOENT;
1027 } else {
1028 if (f != nullptr) {
1029 f->open_object_section("filesystem");
1030 fs->dump(f.get());
1031 f->close_section();
1032 f->flush(ds);
1033 r = 0;
1034 } else {
1035 fs->print(ds);
1036 r = 0;
1037 }
1038 }
1039 } else if (prefix == "fs ls") {
1040 if (f) {
1041 f->open_array_section("filesystems");
1042 {
1043 for (const auto i : fsmap.filesystems) {
1044 const auto fs = i.second;
1045 f->open_object_section("filesystem");
1046 {
1047 const MDSMap &mds_map = fs->mds_map;
1048 f->dump_string("name", mds_map.fs_name);
1049 /* Output both the names and IDs of pools, for use by
1050 * humans and machines respectively */
1051 f->dump_string("metadata_pool", mon->osdmon()->osdmap.get_pool_name(
1052 mds_map.metadata_pool));
1053 f->dump_int("metadata_pool_id", mds_map.metadata_pool);
1054 f->open_array_section("data_pool_ids");
1055 {
1056 for (auto dpi = mds_map.data_pools.begin();
1057 dpi != mds_map.data_pools.end(); ++dpi) {
1058 f->dump_int("data_pool_id", *dpi);
1059 }
1060 }
1061 f->close_section();
1062
1063 f->open_array_section("data_pools");
1064 {
1065 for (auto dpi = mds_map.data_pools.begin();
1066 dpi != mds_map.data_pools.end(); ++dpi) {
1067 const auto &name = mon->osdmon()->osdmap.get_pool_name(
1068 *dpi);
1069 f->dump_string("data_pool", name);
1070 }
1071 }
1072
1073 f->close_section();
1074 }
1075 f->close_section();
1076 }
1077 }
1078 f->close_section();
1079 f->flush(ds);
1080 } else {
1081 for (const auto i : fsmap.filesystems) {
1082 const auto fs = i.second;
1083 const MDSMap &mds_map = fs->mds_map;
1084 const string &md_pool_name = mon->osdmon()->osdmap.get_pool_name(
1085 mds_map.metadata_pool);
1086
1087 ds << "name: " << mds_map.fs_name << ", metadata pool: "
1088 << md_pool_name << ", data pools: [";
1089 for (auto dpi : mds_map.data_pools) {
1090 const string &pool_name = mon->osdmon()->osdmap.get_pool_name(dpi);
1091 ds << pool_name << " ";
1092 }
1093 ds << "]" << std::endl;
1094 }
1095
1096 if (fsmap.filesystems.empty()) {
1097 ds << "No filesystems enabled" << std::endl;
1098 }
1099 }
1100 r = 0;
1101 }
1102
1103 if (r != -1) {
1104 rdata.append(ds);
1105 string rs;
1106 getline(ss, rs);
1107 mon->reply_command(op, r, rs, rdata, get_last_committed());
1108 return true;
1109 } else
1110 return false;
1111 }
1112
1113 bool MDSMonitor::fail_mds_gid(mds_gid_t gid)
1114 {
1115 const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
1116 dout(10) << "fail_mds_gid " << gid << " mds." << info.name << " role " << info.rank << dendl;
1117
1118 epoch_t blacklist_epoch = 0;
1119 if (info.rank >= 0 && info.state != MDSMap::STATE_STANDBY_REPLAY) {
1120 utime_t until = ceph_clock_now();
1121 until += g_conf->mds_blacklist_interval;
1122 blacklist_epoch = mon->osdmon()->blacklist(info.addr, until);
1123 }
1124
1125 pending_fsmap.erase(gid, blacklist_epoch);
1126 last_beacon.erase(gid);
1127 if (pending_daemon_health.count(gid)) {
1128 pending_daemon_health.erase(gid);
1129 pending_daemon_health_rm.insert(gid);
1130 }
1131
1132 return blacklist_epoch != 0;
1133 }
1134
1135 mds_gid_t MDSMonitor::gid_from_arg(const std::string& arg, std::ostream &ss)
1136 {
1137 const FSMap *relevant_fsmap = mon->is_leader() ? &pending_fsmap : &fsmap;
1138
1139 // Try parsing as a role
1140 mds_role_t role;
1141 std::ostringstream ignore_err; // Don't spam 'ss' with parse_role errors
1142 int r = parse_role(arg, &role, ignore_err);
1143 if (r == 0) {
1144 // See if a GID is assigned to this role
1145 auto fs = relevant_fsmap->get_filesystem(role.fscid);
1146 assert(fs != nullptr); // parse_role ensures it exists
1147 if (fs->mds_map.is_up(role.rank)) {
1148 dout(10) << __func__ << ": validated rank/GID " << role
1149 << " as a rank" << dendl;
1150 return fs->mds_map.get_mds_info(role.rank).global_id;
1151 }
1152 }
1153
1154 // Try parsing as a gid
1155 std::string err;
1156 unsigned long long maybe_gid = strict_strtoll(arg.c_str(), 10, &err);
1157 if (!err.empty()) {
1158 // Not a role or a GID, try as a daemon name
1159 const MDSMap::mds_info_t *mds_info = relevant_fsmap->find_by_name(arg);
1160 if (!mds_info) {
1161 ss << "MDS named '" << arg
1162 << "' does not exist, or is not up";
1163 return MDS_GID_NONE;
1164 }
1165 dout(10) << __func__ << ": resolved MDS name '" << arg
1166 << "' to GID " << mds_info->global_id << dendl;
1167 return mds_info->global_id;
1168 } else {
1169 // Not a role, but parses as a an integer, might be a GID
1170 dout(10) << __func__ << ": treating MDS reference '" << arg
1171 << "' as an integer " << maybe_gid << dendl;
1172
1173 if (relevant_fsmap->gid_exists(mds_gid_t(maybe_gid))) {
1174 return mds_gid_t(maybe_gid);
1175 }
1176 }
1177
1178 dout(1) << __func__ << ": rank/GID " << arg
1179 << " not a existent rank or GID" << dendl;
1180 return MDS_GID_NONE;
1181 }
1182
1183 int MDSMonitor::fail_mds(std::ostream &ss, const std::string &arg)
1184 {
1185 mds_gid_t gid = gid_from_arg(arg, ss);
1186 if (gid == MDS_GID_NONE) {
1187 return 0;
1188 }
1189 if (!mon->osdmon()->is_writeable()) {
1190 return -EAGAIN;
1191 }
1192 fail_mds_gid(gid);
1193 ss << "failed mds gid " << gid;
1194 assert(mon->osdmon()->is_writeable());
1195 request_proposal(mon->osdmon());
1196 return 0;
1197 }
1198
/**
 * Handle a monitor command that may modify the pending FSMap.
 *
 * The command is offered, in order, to the registered `handlers`, to
 * filesystem_command() and finally to legacy_filesystem_command().
 *
 * @param op  the request wrapping an MMonCommand
 * @return true after replying to a malformed or session-less command,
 *         and on success (r >= 0), where the reply is deferred until the
 *         proposal finishes; false when the command was queued for retry
 *         (-EAGAIN) or an error reply was sent immediately.
 */
bool MDSMonitor::prepare_command(MonOpRequestRef op)
{
  op->mark_mdsmon_event(__func__);
  MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
  int r = -EINVAL;
  stringstream ss;
  bufferlist rdata;

  // Reject commands whose JSON cannot be parsed; the parser's message
  // (accumulated in ss) becomes the reply text.
  map<string, cmd_vartype> cmdmap;
  if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
    string rs = ss.str();
    mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
    return true;
  }

  string prefix;
  cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);

  /* Refuse access if message not associated with a valid session */
  MonSession *session = m->get_session();
  if (!session) {
    mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
    return true;
  }

  // First chance: the registered command handlers.  The first handler
  // that claims the prefix decides the outcome.
  for (auto h : handlers) {
    if (h->can_handle(prefix)) {
      r = h->handle(mon, pending_fsmap, op, cmdmap, ss);
      if (r == -EAGAIN) {
        // message has been enqueued for retry; return.
        dout(4) << __func__ << " enqueue for retry by prepare_command" << dendl;
        return false;
      } else {
        if (r == 0) {
          // On successful updates, print the updated map
          print_map(pending_fsmap);
        }
        // Successful or not, we're done: respond.
        goto out;
      }
    }
  }

  // Second chance: commands implemented in filesystem_command().  Only
  // -ENOSYS (prefix not recognised there) falls through to the legacy path.
  r = filesystem_command(op, prefix, cmdmap, ss);
  if (r >= 0) {
    goto out;
  } else if (r == -EAGAIN) {
    // Do not reply, the message has been enqueued for retry
    dout(4) << __func__ << " enqueue for retry by filesystem_command" << dendl;
    return false;
  } else if (r != -ENOSYS) {
    goto out;
  }

  // Only handle legacy commands if there is a filesystem configured
  if (pending_fsmap.legacy_client_fscid == FS_CLUSTER_ID_NONE) {
    if (pending_fsmap.filesystems.size() == 0) {
      ss << "No filesystem configured: use `ceph fs new` to create a filesystem";
    } else {
      ss << "No filesystem set for use with legacy commands";
    }
    r = -EINVAL;
    goto out;
  }

  r = legacy_filesystem_command(op, prefix, cmdmap, ss);

  // Nobody recognised the prefix and nobody left a message: say so.
  if (r == -ENOSYS && ss.str().empty()) {
    ss << "unrecognized command";
  }

out:
  dout(4) << __func__ << " done, r=" << r << dendl;
  /* Compose response */
  // Only the first line of the accumulated message is sent back.
  string rs;
  getline(ss, rs);

  if (r >= 0) {
    // success.. delay reply
    wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
                                                          get_last_committed() + 1));
    return true;
  } else {
    // reply immediately
    mon->reply_command(op, r, rs, rdata, get_last_committed());
    return false;
  }
}
1287
1288
1289 /**
1290 * Given one of the following forms:
1291 * <fs name>:<rank>
1292 * <fs id>:<rank>
1293 * <rank>
1294 *
1295 * Parse into a mds_role_t. The rank-only form is only valid
1296 * if legacy_client_ns is set.
1297 */
1298 int MDSMonitor::parse_role(
1299 const std::string &role_str,
1300 mds_role_t *role,
1301 std::ostream &ss)
1302 {
1303 const FSMap *relevant_fsmap = &fsmap;
1304 if (mon->is_leader()) {
1305 relevant_fsmap = &pending_fsmap;
1306 }
1307 return relevant_fsmap->parse_role(role_str, role, ss);
1308 }
1309
1310 int MDSMonitor::filesystem_command(
1311 MonOpRequestRef op,
1312 std::string const &prefix,
1313 map<string, cmd_vartype> &cmdmap,
1314 std::stringstream &ss)
1315 {
1316 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1317 op->mark_mdsmon_event(__func__);
1318 int r = 0;
1319 string whostr;
1320 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1321
1322 if (prefix == "mds stop" ||
1323 prefix == "mds deactivate") {
1324
1325 mds_role_t role;
1326 r = parse_role(whostr, &role, ss);
1327 if (r < 0 ) {
1328 return r;
1329 }
1330 auto fs = pending_fsmap.get_filesystem(role.fscid);
1331
1332 if (!fs->mds_map.is_active(role.rank)) {
1333 r = -EEXIST;
1334 ss << "mds." << role << " not active ("
1335 << ceph_mds_state_name(fs->mds_map.get_state(role.rank)) << ")";
1336 } else if (fs->mds_map.get_root() == role.rank ||
1337 fs->mds_map.get_tableserver() == role.rank) {
1338 r = -EINVAL;
1339 ss << "can't tell the root (" << fs->mds_map.get_root()
1340 << ") or tableserver (" << fs->mds_map.get_tableserver()
1341 << ") to deactivate";
1342 } else if (role.rank != fs->mds_map.get_last_in_mds()) {
1343 r = -EINVAL;
1344 ss << "mds." << role << " doesn't have the max rank ("
1345 << fs->mds_map.get_last_in_mds() << ")";
1346 } else if (fs->mds_map.get_num_in_mds() <= size_t(fs->mds_map.get_max_mds())) {
1347 r = -EBUSY;
1348 ss << "must decrease max_mds or else MDS will immediately reactivate";
1349 } else {
1350 r = 0;
1351 mds_gid_t gid = fs->mds_map.up.at(role.rank);
1352 ss << "telling mds." << role << " "
1353 << pending_fsmap.get_info_gid(gid).addr << " to deactivate";
1354
1355 pending_fsmap.modify_daemon(gid, [](MDSMap::mds_info_t *info) {
1356 info->state = MDSMap::STATE_STOPPING;
1357 });
1358 }
1359 } else if (prefix == "mds set_state") {
1360 mds_gid_t gid;
1361 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1362 ss << "error parsing 'gid' value '"
1363 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1364 return -EINVAL;
1365 }
1366 MDSMap::DaemonState state;
1367 if (!cmd_getval(g_ceph_context, cmdmap, "state", state)) {
1368 ss << "error parsing 'state' string value '"
1369 << cmd_vartype_stringify(cmdmap["state"]) << "'";
1370 return -EINVAL;
1371 }
1372 if (pending_fsmap.gid_exists(gid)) {
1373 pending_fsmap.modify_daemon(gid, [state](MDSMap::mds_info_t *info) {
1374 info->state = state;
1375 });
1376 ss << "set mds gid " << gid << " to state " << state << " "
1377 << ceph_mds_state_name(state);
1378 return 0;
1379 }
1380 } else if (prefix == "mds fail") {
1381 string who;
1382 cmd_getval(g_ceph_context, cmdmap, "who", who);
1383 r = fail_mds(ss, who);
1384 if (r < 0 && r == -EAGAIN) {
1385 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1386 return -EAGAIN; // don't propose yet; wait for message to be retried
1387 }
1388 } else if (prefix == "mds rm") {
1389 mds_gid_t gid;
1390 if (!cmd_getval(g_ceph_context, cmdmap, "gid", gid)) {
1391 ss << "error parsing 'gid' value '"
1392 << cmd_vartype_stringify(cmdmap["gid"]) << "'";
1393 return -EINVAL;
1394 }
1395 if (!pending_fsmap.gid_exists(gid)) {
1396 ss << "mds gid " << gid << " dne";
1397 r = 0;
1398 } else {
1399 MDSMap::DaemonState state = pending_fsmap.get_info_gid(gid).state;
1400 if (state > 0) {
1401 ss << "cannot remove active mds." << pending_fsmap.get_info_gid(gid).name
1402 << " rank " << pending_fsmap.get_info_gid(gid).rank;
1403 return -EBUSY;
1404 } else {
1405 pending_fsmap.erase(gid, {});
1406 ss << "removed mds gid " << gid;
1407 return 0;
1408 }
1409 }
1410 } else if (prefix == "mds rmfailed") {
1411 string confirm;
1412 if (!cmd_getval(g_ceph_context, cmdmap, "confirm", confirm) ||
1413 confirm != "--yes-i-really-mean-it") {
1414 ss << "WARNING: this can make your filesystem inaccessible! "
1415 "Add --yes-i-really-mean-it if you are sure you wish to continue.";
1416 return -EPERM;
1417 }
1418
1419 std::string role_str;
1420 cmd_getval(g_ceph_context, cmdmap, "who", role_str);
1421 mds_role_t role;
1422 int r = parse_role(role_str, &role, ss);
1423 if (r < 0) {
1424 ss << "invalid role '" << role_str << "'";
1425 return -EINVAL;
1426 }
1427
1428 pending_fsmap.modify_filesystem(
1429 role.fscid,
1430 [role](std::shared_ptr<Filesystem> fs)
1431 {
1432 fs->mds_map.failed.erase(role.rank);
1433 });
1434
1435 ss << "removed failed mds." << role;
1436 return 0;
1437 } else if (prefix == "mds compat rm_compat") {
1438 int64_t f;
1439 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1440 ss << "error parsing feature value '"
1441 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1442 return -EINVAL;
1443 }
1444 if (pending_fsmap.compat.compat.contains(f)) {
1445 ss << "removing compat feature " << f;
1446 CompatSet modified = pending_fsmap.compat;
1447 modified.compat.remove(f);
1448 pending_fsmap.update_compat(modified);
1449 } else {
1450 ss << "compat feature " << f << " not present in " << pending_fsmap.compat;
1451 }
1452 r = 0;
1453 } else if (prefix == "mds compat rm_incompat") {
1454 int64_t f;
1455 if (!cmd_getval(g_ceph_context, cmdmap, "feature", f)) {
1456 ss << "error parsing feature value '"
1457 << cmd_vartype_stringify(cmdmap["feature"]) << "'";
1458 return -EINVAL;
1459 }
1460 if (pending_fsmap.compat.incompat.contains(f)) {
1461 ss << "removing incompat feature " << f;
1462 CompatSet modified = pending_fsmap.compat;
1463 modified.incompat.remove(f);
1464 pending_fsmap.update_compat(modified);
1465 } else {
1466 ss << "incompat feature " << f << " not present in " << pending_fsmap.compat;
1467 }
1468 r = 0;
1469 } else if (prefix == "mds repaired") {
1470 std::string role_str;
1471 cmd_getval(g_ceph_context, cmdmap, "rank", role_str);
1472 mds_role_t role;
1473 r = parse_role(role_str, &role, ss);
1474 if (r < 0) {
1475 return r;
1476 }
1477
1478 bool modified = pending_fsmap.undamaged(role.fscid, role.rank);
1479 if (modified) {
1480 dout(4) << "repaired: restoring rank " << role << dendl;
1481 } else {
1482 dout(4) << "repaired: no-op on rank " << role << dendl;
1483 }
1484
1485 r = 0;
1486 } else {
1487 return -ENOSYS;
1488 }
1489
1490 return r;
1491 }
1492
1493 /**
1494 * Helper to legacy_filesystem_command
1495 */
1496 void MDSMonitor::modify_legacy_filesystem(
1497 std::function<void(std::shared_ptr<Filesystem> )> fn)
1498 {
1499 pending_fsmap.modify_filesystem(
1500 pending_fsmap.legacy_client_fscid,
1501 fn
1502 );
1503 }
1504
1505
1506
1507 /**
1508 * Handle a command that affects the filesystem (i.e. a filesystem
1509 * must exist for the command to act upon).
1510 *
1511 * @retval 0 Command was successfully handled and has side effects
1512 * @retval -EAGAIN Messages has been requeued for retry
1513 * @retval -ENOSYS Unknown command
1514 * @retval < 0 An error has occurred; **ss** may have been set.
1515 */
1516 int MDSMonitor::legacy_filesystem_command(
1517 MonOpRequestRef op,
1518 std::string const &prefix,
1519 map<string, cmd_vartype> &cmdmap,
1520 std::stringstream &ss)
1521 {
1522 dout(4) << __func__ << " prefix='" << prefix << "'" << dendl;
1523 op->mark_mdsmon_event(__func__);
1524 int r = 0;
1525 string whostr;
1526 cmd_getval(g_ceph_context, cmdmap, "who", whostr);
1527
1528 assert (pending_fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE);
1529
1530 if (prefix == "mds set_max_mds") {
1531 // NOTE: deprecated by "fs set max_mds"
1532 int64_t maxmds;
1533 if (!cmd_getval(g_ceph_context, cmdmap, "maxmds", maxmds) || maxmds <= 0) {
1534 return -EINVAL;
1535 }
1536
1537 const MDSMap& mdsmap =
1538 pending_fsmap.filesystems.at(pending_fsmap.legacy_client_fscid)->mds_map;
1539
1540 if (!mdsmap.allows_multimds() &&
1541 maxmds > mdsmap.get_max_mds() &&
1542 maxmds > 1) {
1543 ss << "multi-MDS clusters are not enabled; set 'allow_multimds' to enable";
1544 return -EINVAL;
1545 }
1546
1547 if (maxmds > MAX_MDS) {
1548 ss << "may not have more than " << MAX_MDS << " MDS ranks";
1549 return -EINVAL;
1550 }
1551
1552 modify_legacy_filesystem(
1553 [maxmds](std::shared_ptr<Filesystem> fs)
1554 {
1555 fs->mds_map.set_max_mds(maxmds);
1556 });
1557
1558 r = 0;
1559 ss << "max_mds = " << maxmds;
1560 } else if (prefix == "mds cluster_down") {
1561 // NOTE: deprecated by "fs set cluster_down"
1562 modify_legacy_filesystem(
1563 [](std::shared_ptr<Filesystem> fs)
1564 {
1565 fs->mds_map.set_flag(CEPH_MDSMAP_DOWN);
1566 });
1567 ss << "marked fsmap DOWN";
1568 r = 0;
1569 } else if (prefix == "mds cluster_up") {
1570 // NOTE: deprecated by "fs set cluster_up"
1571 modify_legacy_filesystem(
1572 [](std::shared_ptr<Filesystem> fs)
1573 {
1574 fs->mds_map.clear_flag(CEPH_MDSMAP_DOWN);
1575 });
1576 ss << "unmarked fsmap DOWN";
1577 r = 0;
1578 } else {
1579 return -ENOSYS;
1580 }
1581
1582 return r;
1583 }
1584
1585
1586 void MDSMonitor::check_subs()
1587 {
1588 std::list<std::string> types;
1589
1590 // Subscriptions may be to "mdsmap" (MDS and legacy clients),
1591 // "mdsmap.<namespace>", or to "fsmap" for the full state of all
1592 // filesystems. Build a list of all the types we service
1593 // subscriptions for.
1594 types.push_back("fsmap");
1595 types.push_back("fsmap.user");
1596 types.push_back("mdsmap");
1597 for (const auto &i : fsmap.filesystems) {
1598 auto fscid = i.first;
1599 std::ostringstream oss;
1600 oss << "mdsmap." << fscid;
1601 types.push_back(oss.str());
1602 }
1603
1604 for (const auto &type : types) {
1605 if (mon->session_map.subs.count(type) == 0)
1606 continue;
1607 xlist<Subscription*>::iterator p = mon->session_map.subs[type]->begin();
1608 while (!p.end()) {
1609 Subscription *sub = *p;
1610 ++p;
1611 check_sub(sub);
1612 }
1613 }
1614 }
1615
1616
/**
 * Send one subscriber the map it is waiting for, if a new enough epoch
 * is available.
 *
 *  - "fsmap":       the whole FSMap, as an MFSMap.
 *  - "fsmap.user":  an FSMapUser (epoch, legacy_client_fscid and the id
 *                   and name of every filesystem), as an MFSMapUser.
 *  - "mdsmap[.N]":  one filesystem's MDSMap as an MMDSMap; an MDS that
 *                   belongs to no filesystem receives a stand-in map.
 *
 * After a send, a one-time subscription is removed; any other has its
 * `next` advanced past the epoch just sent.
 *
 * @param sub  subscription to service
 */
void MDSMonitor::check_sub(Subscription *sub)
{
  dout(20) << __func__ << ": " << sub->type << dendl;

  if (sub->type == "fsmap") {
    if (sub->next <= fsmap.get_epoch()) {
      sub->session->con->send_message(new MFSMap(mon->monmap->fsid, fsmap));
      if (sub->onetime) {
        mon->session_map.remove_sub(sub);
      } else {
        sub->next = fsmap.get_epoch() + 1;
      }
    }
  } else if (sub->type == "fsmap.user") {
    if (sub->next <= fsmap.get_epoch()) {
      // Build the reduced, user-facing view of the FSMap.
      FSMapUser fsmap_u;
      fsmap_u.epoch = fsmap.get_epoch();
      fsmap_u.legacy_client_fscid = fsmap.legacy_client_fscid;
      for (auto p = fsmap.filesystems.begin();
           p != fsmap.filesystems.end();
           ++p) {
        FSMapUser::fs_info_t& fs_info = fsmap_u.filesystems[p->first];
        fs_info.cid = p->first;
        fs_info.name= p->second->mds_map.fs_name;
      }
      sub->session->con->send_message(new MFSMapUser(mon->monmap->fsid, fsmap_u));
      if (sub->onetime) {
        mon->session_map.remove_sub(sub);
      } else {
        sub->next = fsmap.get_epoch() + 1;
      }
    }
  } else if (sub->type.compare(0, 6, "mdsmap") == 0) {
    // Matches both "mdsmap" and "mdsmap.<fscid>".
    if (sub->next > fsmap.get_epoch()) {
      return;
    }

    const bool is_mds = sub->session->inst.name.is_mds();
    mds_gid_t mds_gid = MDS_GID_NONE;
    fs_cluster_id_t fscid = FS_CLUSTER_ID_NONE;
    if (is_mds) {
      // What (if any) namespace are you assigned to?
      // The daemon is identified by its session address; the loop does
      // not stop at the first match, so the last matching entry wins.
      auto mds_info = fsmap.get_mds_info();
      for (const auto &i : mds_info) {
        if (i.second.addr == sub->session->inst.addr) {
          mds_gid = i.first;
          fscid = fsmap.mds_roles.at(mds_gid);
        }
      }
    } else {
      // You're a client.  Did you request a particular
      // namespace?
      if (sub->type.find("mdsmap.") == 0) {
        auto namespace_id_str = sub->type.substr(std::string("mdsmap.").size());
        dout(10) << __func__ << ": namespace_id " << namespace_id_str << dendl;
        std::string err;
        fscid = strict_strtoll(namespace_id_str.c_str(), 10, &err);
        if (!err.empty()) {
          // The namespace id is not a valid integer, send them nothing
          dout(1) << "Invalid client subscription '" << sub->type
                  << "'" << dendl;
          return;
        }
        if (fsmap.filesystems.count(fscid) == 0) {
          // Client asked for a non-existent namespace, send them nothing
          // TODO: something more graceful for when a client has a filesystem
          // mounted, and the filesystem is deleted.  Add a "shut down you fool"
          // flag to MMDSMap?
          dout(1) << "Client subscribed to non-existent namespace '" <<
                  fscid << "'" << dendl;
          return;
        }
      } else {
        // Unqualified request for "mdsmap": give it the one marked
        // for use by legacy clients.
        if (fsmap.legacy_client_fscid != FS_CLUSTER_ID_NONE) {
          fscid = fsmap.legacy_client_fscid;
        } else {
          dout(1) << "Client subscribed for legacy filesystem but "
                     "none is configured" << dendl;
          return;
        }
      }
    }
    dout(10) << __func__ << ": is_mds=" << is_mds << ", fscid= " << fscid << dendl;

    // Work out the effective latest epoch
    MDSMap *mds_map = nullptr;
    // Stand-in map for an MDS that is not part of any filesystem; it
    // carries only the FSMap-wide compat set plus what is filled in below.
    MDSMap null_map;
    null_map.compat = fsmap.compat;
    if (fscid == FS_CLUSTER_ID_NONE) {
      // For a client, we should have already dropped out
      assert(is_mds);

      if (fsmap.standby_daemons.count(mds_gid)) {
        // For an MDS, we need to feed it an MDSMap with its own state in
        null_map.mds_info[mds_gid] = fsmap.standby_daemons[mds_gid];
        null_map.epoch = fsmap.standby_epochs[mds_gid];
      } else {
        // Not a known standby either: send the bare map at the FSMap epoch.
        null_map.epoch = fsmap.epoch;
      }
      mds_map = &null_map;
    } else {
      // Check the effective epoch
      mds_map = &(fsmap.filesystems.at(fscid)->mds_map);
    }

    assert(mds_map != nullptr);
    dout(10) << __func__ << " selected MDS map epoch " <<
      mds_map->epoch << " for namespace " << fscid << " for subscriber "
      << sub->session->inst.name << " who wants epoch " << sub->next << dendl;

    // The selected map's epoch can be older than the FSMap epoch checked
    // above, so compare against it as well before sending.
    if (sub->next > mds_map->epoch) {
      return;
    }
    auto msg = new MMDSMap(mon->monmap->fsid, mds_map);

    sub->session->con->send_message(msg);
    if (sub->onetime) {
      mon->session_map.remove_sub(sub);
    } else {
      sub->next = mds_map->get_epoch() + 1;
    }
  }
}
1742
1743
1744 void MDSMonitor::update_metadata(mds_gid_t gid,
1745 const map<string, string>& metadata)
1746 {
1747 if (metadata.empty()) {
1748 return;
1749 }
1750 pending_metadata[gid] = metadata;
1751
1752 MonitorDBStore::TransactionRef t = paxos->get_pending_transaction();
1753 bufferlist bl;
1754 ::encode(pending_metadata, bl);
1755 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1756 paxos->trigger_propose();
1757 }
1758
1759 void MDSMonitor::remove_from_metadata(MonitorDBStore::TransactionRef t)
1760 {
1761 bool update = false;
1762 for (map<mds_gid_t, Metadata>::iterator i = pending_metadata.begin();
1763 i != pending_metadata.end(); ) {
1764 if (!pending_fsmap.gid_exists(i->first)) {
1765 pending_metadata.erase(i++);
1766 update = true;
1767 } else {
1768 ++i;
1769 }
1770 }
1771 if (!update)
1772 return;
1773 bufferlist bl;
1774 ::encode(pending_metadata, bl);
1775 t->put(MDS_METADATA_PREFIX, "last_metadata", bl);
1776 }
1777
1778 int MDSMonitor::load_metadata(map<mds_gid_t, Metadata>& m)
1779 {
1780 bufferlist bl;
1781 int r = mon->store->get(MDS_METADATA_PREFIX, "last_metadata", bl);
1782 if (r) {
1783 dout(1) << "Unable to load 'last_metadata'" << dendl;
1784 return r;
1785 }
1786
1787 bufferlist::iterator it = bl.begin();
1788 ::decode(m, it);
1789 return 0;
1790 }
1791
1792 void MDSMonitor::count_metadata(const string& field, Formatter *f)
1793 {
1794 map<string,int> by_val;
1795 map<mds_gid_t,Metadata> meta;
1796 load_metadata(meta);
1797 for (auto& p : meta) {
1798 auto q = p.second.find(field);
1799 if (q == p.second.end()) {
1800 by_val["unknown"]++;
1801 } else {
1802 by_val[q->second]++;
1803 }
1804 }
1805 f->open_object_section(field.c_str());
1806 for (auto& p : by_val) {
1807 f->dump_int(p.first.c_str(), p.second);
1808 }
1809 f->close_section();
1810 }
1811
1812 int MDSMonitor::dump_metadata(const std::string &who, Formatter *f, ostream& err)
1813 {
1814 assert(f);
1815
1816 mds_gid_t gid = gid_from_arg(who, err);
1817 if (gid == MDS_GID_NONE) {
1818 return -EINVAL;
1819 }
1820
1821 map<mds_gid_t, Metadata> metadata;
1822 if (int r = load_metadata(metadata)) {
1823 err << "Unable to load 'last_metadata'";
1824 return r;
1825 }
1826
1827 if (!metadata.count(gid)) {
1828 return -ENOENT;
1829 }
1830 const Metadata& m = metadata[gid];
1831 for (Metadata::const_iterator p = m.begin(); p != m.end(); ++p) {
1832 f->dump_string(p->first.c_str(), p->second);
1833 }
1834 return 0;
1835 }
1836
1837 int MDSMonitor::print_nodes(Formatter *f)
1838 {
1839 assert(f);
1840
1841 map<mds_gid_t, Metadata> metadata;
1842 if (int r = load_metadata(metadata)) {
1843 return r;
1844 }
1845
1846 map<string, list<int> > mdses; // hostname => rank
1847 for (map<mds_gid_t, Metadata>::iterator it = metadata.begin();
1848 it != metadata.end(); ++it) {
1849 const Metadata& m = it->second;
1850 Metadata::const_iterator hostname = m.find("hostname");
1851 if (hostname == m.end()) {
1852 // not likely though
1853 continue;
1854 }
1855 const mds_gid_t gid = it->first;
1856 if (!fsmap.gid_exists(gid)) {
1857 dout(5) << __func__ << ": GID " << gid << " not existent" << dendl;
1858 continue;
1859 }
1860 const MDSMap::mds_info_t& mds_info = fsmap.get_info_gid(gid);
1861 // FIXME: include filesystem name with rank here
1862 mdses[hostname->second].push_back(mds_info.rank);
1863 }
1864
1865 dump_services(f, mdses, "mds");
1866 return 0;
1867 }
1868
1869 /**
1870 * If a cluster is undersized (with respect to max_mds), then
1871 * attempt to find daemons to grow it.
1872 */
1873 bool MDSMonitor::maybe_expand_cluster(std::shared_ptr<Filesystem> fs)
1874 {
1875 bool do_propose = false;
1876
1877 if (fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
1878 return do_propose;
1879 }
1880
1881 while (fs->mds_map.get_num_in_mds() < size_t(fs->mds_map.get_max_mds()) &&
1882 !fs->mds_map.is_degraded()) {
1883 mds_rank_t mds = mds_rank_t(0);
1884 string name;
1885 while (fs->mds_map.is_in(mds)) {
1886 mds++;
1887 }
1888 mds_gid_t newgid = pending_fsmap.find_replacement_for({fs->fscid, mds},
1889 name, g_conf->mon_force_standby_active);
1890 if (newgid == MDS_GID_NONE) {
1891 break;
1892 }
1893
1894 dout(1) << "adding standby " << pending_fsmap.get_info_gid(newgid).addr
1895 << " as mds." << mds << dendl;
1896 pending_fsmap.promote(newgid, fs, mds);
1897 do_propose = true;
1898 }
1899
1900 return do_propose;
1901 }
1902
1903
/**
 * If a daemon is laggy, and a suitable replacement
 * is available, fail this daemon (remove from map) and pass its
 * role to another daemon.
 *
 * A laggy standby or standby-replay daemon is simply failed.  Otherwise,
 * a daemon not yet marked laggy gets its laggy_since stamp set.
 *
 * @param gid          daemon whose beacon is overdue
 * @param beacon       last beacon record for that daemon (logged only)
 * @param mds_propose  set to true when the pending FSMap was changed
 * @param osd_propose  or-ed with fail_mds_gid()'s result when a ranked
 *                     daemon is replaced
 */
void MDSMonitor::maybe_replace_gid(mds_gid_t gid,
                                   const beacon_info_t &beacon,
                                   bool *mds_propose, bool *osd_propose)
{
  assert(mds_propose != nullptr);
  assert(osd_propose != nullptr);

  // `info` is a copy, so it stays usable after fail_mds_gid() below.
  const MDSMap::mds_info_t info = pending_fsmap.get_info_gid(gid);
  const auto fscid = pending_fsmap.mds_roles.at(gid);

  dout(10) << "no beacon from " << gid << " " << info.addr << " mds."
    << info.rank << "." << info.inc
    << " " << ceph_mds_state_name(info.state)
    << " since " << beacon.stamp << dendl;

  // We will only take decisive action (replacing/removing a daemon)
  // if we have some indication that some other daemon(s) are successfully
  // getting beacons through recently.
  utime_t latest_beacon;
  for (const auto & i : last_beacon) {
    latest_beacon = MAX(i.second.stamp, latest_beacon);
  }
  // "Recently" means within max(mds_beacon_interval, mds_beacon_grace / 2).
  const bool may_replace = latest_beacon >
    (ceph_clock_now() -
     MAX(g_conf->mds_beacon_interval, g_conf->mds_beacon_grace * 0.5));

  // are we in?
  // and is there a non-laggy standby that can take over for us?
  // The conditions short-circuit in order: the filesystem lookup and the
  // replacement search only run for a ranked, non-standby daemon.
  mds_gid_t sgid;
  if (info.rank >= 0 &&
      info.state != MDSMap::STATE_STANDBY &&
      info.state != MDSMap::STATE_STANDBY_REPLAY &&
      may_replace &&
      !pending_fsmap.get_filesystem(fscid)->mds_map.test_flag(CEPH_MDSMAP_DOWN) &&
      (sgid = pending_fsmap.find_replacement_for({fscid, info.rank}, info.name,
                g_conf->mon_force_standby_active)) != MDS_GID_NONE)
  {

    MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
    dout(10) << " replacing " << gid << " " << info.addr << " mds."
      << info.rank << "." << info.inc
      << " " << ceph_mds_state_name(info.state)
      << " with " << sgid << "/" << si.name << " " << si.addr << dendl;

    mon->clog->warn() << "MDS daemon '" << info.name << "'"
                      << " is not responding, replacing it "
                      << "as rank " << info.rank
                      << " with standby '" << si.name << "'";

    // Remember what NS the old one was in
    // NOTE(review): this shadows the outer `fscid`, which was read from
    // the same mds_roles entry above.
    const fs_cluster_id_t fscid = pending_fsmap.mds_roles.at(gid);

    // Remove the old one
    *osd_propose |= fail_mds_gid(gid);

    // Promote the replacement
    auto fs = pending_fsmap.filesystems.at(fscid);
    pending_fsmap.promote(sgid, fs, info.rank);

    *mds_propose = true;
  } else if ((info.state == MDSMap::STATE_STANDBY_REPLAY ||
             info.state == MDSMap::STATE_STANDBY) && may_replace) {
    dout(10) << " failing and removing " << gid << " " << info.addr << " mds." << info.rank
      << "." << info.inc << " " << ceph_mds_state_name(info.state)
      << dendl;
    mon->clog->info() << "MDS standby '" << info.name
                      << "' is not responding, removing it from the set of "
                      << "standbys";
    // The return value (whether a blacklist entry was added) is ignored here.
    fail_mds_gid(gid);
    *mds_propose = true;
  } else if (!info.laggy()) {
    // Nothing decisive can be done: just record when the daemon went laggy.
    dout(10) << " marking " << gid << " " << info.addr << " mds." << info.rank << "." << info.inc
      << " " << ceph_mds_state_name(info.state)
      << " laggy" << dendl;
    pending_fsmap.modify_daemon(info.global_id, [](MDSMap::mds_info_t *info) {
        info->laggy_since = ceph_clock_now();
    });
    *mds_propose = true;
  }
}
1989
1990 bool MDSMonitor::maybe_promote_standby(std::shared_ptr<Filesystem> fs)
1991 {
1992 assert(!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN));
1993
1994 bool do_propose = false;
1995
1996 // have a standby take over?
1997 set<mds_rank_t> failed;
1998 fs->mds_map.get_failed_mds_set(failed);
1999 if (!failed.empty()) {
2000 set<mds_rank_t>::iterator p = failed.begin();
2001 while (p != failed.end()) {
2002 mds_rank_t f = *p++;
2003 mds_gid_t sgid = pending_fsmap.find_replacement_for({fs->fscid, f}, {},
2004 g_conf->mon_force_standby_active);
2005 if (sgid) {
2006 const MDSMap::mds_info_t si = pending_fsmap.get_info_gid(sgid);
2007 dout(0) << " taking over failed mds." << f << " with " << sgid
2008 << "/" << si.name << " " << si.addr << dendl;
2009 pending_fsmap.promote(sgid, fs, f);
2010 do_propose = true;
2011 }
2012 }
2013 } else {
2014 // There were no failures to replace, so try using any available standbys
2015 // as standby-replay daemons.
2016
2017 // Take a copy of the standby GIDs so that we can iterate over
2018 // them while perhaps-modifying standby_daemons during the loop
2019 // (if we promote anyone they are removed from standby_daemons)
2020 std::vector<mds_gid_t> standby_gids;
2021 for (const auto &j : pending_fsmap.standby_daemons) {
2022 standby_gids.push_back(j.first);
2023 }
2024
2025 for (const auto &gid : standby_gids) {
2026 const auto &info = pending_fsmap.standby_daemons.at(gid);
2027 assert(info.state == MDSMap::STATE_STANDBY);
2028
2029 if (!info.standby_replay) {
2030 continue;
2031 }
2032
2033 /*
2034 * This mds is standby but has no rank assigned.
2035 * See if we can find it somebody to shadow
2036 */
2037 dout(20) << "gid " << gid << " is standby and following nobody" << dendl;
2038
2039 // standby for someone specific?
2040 if (info.standby_for_rank >= 0) {
2041 // The mds_info_t may or may not tell us exactly which filesystem
2042 // the standby_for_rank refers to: lookup via legacy_client_fscid
2043 mds_role_t target_role = {
2044 info.standby_for_fscid == FS_CLUSTER_ID_NONE ?
2045 pending_fsmap.legacy_client_fscid : info.standby_for_fscid,
2046 info.standby_for_rank};
2047
2048 // It is possible that the map contains a standby_for_fscid
2049 // that doesn't correspond to an existing filesystem, especially
2050 // if we loaded from a version with a bug (#17466)
2051 if (info.standby_for_fscid != FS_CLUSTER_ID_NONE
2052 && !pending_fsmap.filesystem_exists(info.standby_for_fscid)) {
2053 derr << "gid " << gid << " has invalid standby_for_fscid "
2054 << info.standby_for_fscid << dendl;
2055 continue;
2056 }
2057
2058 // If we managed to resolve a full target role
2059 if (target_role.fscid != FS_CLUSTER_ID_NONE) {
2060 auto fs = pending_fsmap.get_filesystem(target_role.fscid);
2061 if (fs->mds_map.is_followable(target_role.rank)) {
2062 do_propose |= try_standby_replay(
2063 info,
2064 *fs,
2065 fs->mds_map.get_info(target_role.rank));
2066 }
2067 }
2068
2069 continue;
2070 }
2071
2072 // check everyone
2073 for (auto fs_i : pending_fsmap.filesystems) {
2074 const MDSMap &mds_map = fs_i.second->mds_map;
2075 for (auto mds_i : mds_map.mds_info) {
2076 MDSMap::mds_info_t &cand_info = mds_i.second;
2077 if (cand_info.rank >= 0 && mds_map.is_followable(cand_info.rank)) {
2078 if ((info.standby_for_name.length() && info.standby_for_name != cand_info.name) ||
2079 info.standby_for_rank != MDS_RANK_NONE) {
2080 continue; // we're supposed to follow someone else
2081 }
2082
2083 if (try_standby_replay(info, *(fs_i.second), cand_info)) {
2084 do_propose = true;
2085 break;
2086 }
2087 continue;
2088 }
2089 }
2090 }
2091 }
2092 }
2093
2094 return do_propose;
2095 }
2096
2097 void MDSMonitor::tick()
2098 {
2099 // make sure mds's are still alive
2100 // ...if i am an active leader
2101 if (!is_active()) return;
2102
2103 dout(10) << fsmap << dendl;
2104
2105 bool do_propose = false;
2106
2107 if (!mon->is_leader()) return;
2108
2109 do_propose |= pending_fsmap.check_health();
2110
2111 // expand mds cluster (add new nodes to @in)?
2112 for (auto i : pending_fsmap.filesystems) {
2113 do_propose |= maybe_expand_cluster(i.second);
2114 }
2115
2116 const auto now = ceph_clock_now();
2117 if (last_tick.is_zero()) {
2118 last_tick = now;
2119 }
2120
2121 if (now - last_tick > (g_conf->mds_beacon_grace - g_conf->mds_beacon_interval)) {
2122 // This case handles either local slowness (calls being delayed
2123 // for whatever reason) or cluster election slowness (a long gap
2124 // between calls while an election happened)
2125 dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
2126 "(slow election?) of " << now - last_tick << " seconds" << dendl;
2127 for (auto &i : last_beacon) {
2128 i.second.stamp = now;
2129 }
2130 }
2131
2132 last_tick = now;
2133
2134 // check beacon timestamps
2135 utime_t cutoff = now;
2136 cutoff -= g_conf->mds_beacon_grace;
2137
2138 // make sure last_beacon is fully populated
2139 for (const auto &p : pending_fsmap.mds_roles) {
2140 auto &gid = p.first;
2141 if (last_beacon.count(gid) == 0) {
2142 last_beacon[gid].stamp = now;
2143 last_beacon[gid].seq = 0;
2144 }
2145 }
2146
2147 // If the OSDMap is writeable, we can blacklist things, so we can
2148 // try failing any laggy MDS daemons. Consider each one for failure.
2149 if (mon->osdmon()->is_writeable()) {
2150 bool propose_osdmap = false;
2151
2152 map<mds_gid_t, beacon_info_t>::iterator p = last_beacon.begin();
2153 while (p != last_beacon.end()) {
2154 mds_gid_t gid = p->first;
2155 auto beacon_info = p->second;
2156 ++p;
2157
2158 if (!pending_fsmap.gid_exists(gid)) {
2159 // clean it out
2160 last_beacon.erase(gid);
2161 continue;
2162 }
2163
2164 if (beacon_info.stamp < cutoff) {
2165 maybe_replace_gid(gid, beacon_info, &do_propose, &propose_osdmap);
2166 }
2167 }
2168
2169 if (propose_osdmap) {
2170 request_proposal(mon->osdmon());
2171 }
2172 }
2173
2174 for (auto i : pending_fsmap.filesystems) {
2175 auto fs = i.second;
2176 if (!fs->mds_map.test_flag(CEPH_MDSMAP_DOWN)) {
2177 do_propose |= maybe_promote_standby(fs);
2178 }
2179 }
2180
2181 if (do_propose) {
2182 propose_pending();
2183 }
2184 }
2185
2186 /**
2187 * finfo: the would-be follower
2188 * leader_fs: the Filesystem containing the would-be leader
2189 * ainfo: the would-be leader
2190 */
2191 bool MDSMonitor::try_standby_replay(
2192 const MDSMap::mds_info_t& finfo,
2193 const Filesystem &leader_fs,
2194 const MDSMap::mds_info_t& ainfo)
2195 {
2196 // someone else already following?
2197 if (leader_fs.has_standby_replay(ainfo.global_id)) {
2198 dout(20) << " mds." << ainfo.rank << " already has a follower" << dendl;
2199 return false;
2200 } else {
2201 // Assign the new role to the standby
2202 dout(10) << " setting to follow mds rank " << ainfo.rank << dendl;
2203 pending_fsmap.assign_standby_replay(finfo.global_id, leader_fs.fscid, ainfo.rank);
2204 return true;
2205 }
2206 }
2207
2208 MDSMonitor::MDSMonitor(Monitor *mn, Paxos *p, string service_name)
2209 : PaxosService(mn, p, service_name)
2210 {
2211 handlers = FileSystemCommandHandler::load();
2212 }
2213
2214 void MDSMonitor::on_restart()
2215 {
2216 // Clear out the leader-specific state.
2217 last_tick = utime_t();
2218 last_beacon.clear();
2219 }
2220