]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MgrMonitor.cc
import 15.2.0 Octopus source
[ceph.git] / ceph / src / mon / MgrMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 John Spray <john.spray@redhat.com>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 */
13
14 #include <boost/tokenizer.hpp>
15
16 #include "messages/MMgrBeacon.h"
17 #include "messages/MMgrMap.h"
18 #include "messages/MMgrDigest.h"
19
20 #include "include/stringify.h"
21 #include "mgr/MgrContext.h"
22 #include "mgr/mgr_commands.h"
23 #include "OSDMonitor.h"
24 #include "ConfigMonitor.h"
25 #include "HealthMonitor.h"
26
27 #include "MgrMonitor.h"
28
29 #define MGR_METADATA_PREFIX "mgr_metadata"
30
31 #define dout_subsys ceph_subsys_mon
32 #undef dout_prefix
33 #define dout_prefix _prefix(_dout, mon, map)
34 using namespace TOPNSPC::common;
35 static ostream& _prefix(std::ostream *_dout, Monitor *mon,
36 const MgrMap& mgrmap) {
37 return *_dout << "mon." << mon->name << "@" << mon->rank
38 << "(" << mon->get_state_name()
39 << ").mgr e" << mgrmap.get_epoch() << " ";
40 }
41
42 // the system treats always_on_modules as if they provide built-in functionality
43 // by ensuring that they are always enabled.
44 const static std::map<uint32_t, std::set<std::string>> always_on_modules = {
45 {
46 CEPH_RELEASE_NAUTILUS, {
47 "crash",
48 "status",
49 "progress",
50 "balancer",
51 "devicehealth",
52 "orchestrator_cli",
53 "rbd_support",
54 "volumes",
55 }
56 },
57 {
58 CEPH_RELEASE_OCTOPUS, {
59 "crash",
60 "status",
61 "progress",
62 "balancer",
63 "devicehealth",
64 "orchestrator",
65 "rbd_support",
66 "osd_support",
67 "volumes",
68 "pg_autoscaler",
69 "telemetry",
70 }
71 }
72 };
73
// Mon store prefix under which the active mgr's command descriptions
// are persisted.
static const std::string command_descs_prefix{"mgr_command_descs"};
76
77 const Option *MgrMonitor::find_module_option(const string& name)
78 {
79 // we have two forms of names: "mgr/$module/$option" and
80 // localized "mgr/$module/$instance/$option". normalize to the
81 // former by stripping out $instance.
82 string real_name;
83 if (name.substr(0, 4) != "mgr/") {
84 return nullptr;
85 }
86 auto second_slash = name.find('/', 5);
87 if (second_slash == std::string::npos) {
88 return nullptr;
89 }
90 auto third_slash = name.find('/', second_slash + 1);
91 if (third_slash != std::string::npos) {
92 // drop the $instance part between the second and third slash
93 real_name = name.substr(0, second_slash) + name.substr(third_slash);
94 } else {
95 real_name = name;
96 }
97 auto p = mgr_module_options.find(real_name);
98 if (p != mgr_module_options.end()) {
99 return &p->second;
100 }
101 return nullptr;
102 }
103
104 version_t MgrMonitor::get_trim_to() const
105 {
106 int64_t max = g_conf().get_val<int64_t>("mon_max_mgrmap_epochs");
107 if (map.epoch > max) {
108 return map.epoch - max;
109 }
110 return 0;
111 }
112
113 void MgrMonitor::create_initial()
114 {
115 // Take a local copy of initial_modules for tokenizer to iterate over.
116 auto initial_modules = g_conf().get_val<std::string>("mgr_initial_modules");
117 boost::tokenizer<> tok(initial_modules);
118 for (auto& m : tok) {
119 pending_map.modules.insert(m);
120 }
121 pending_map.always_on_modules = always_on_modules;
122 pending_command_descs = mgr_commands;
123 dout(10) << __func__ << " initial modules " << pending_map.modules
124 << ", always on modules " << pending_map.get_always_on_modules()
125 << ", " << pending_command_descs.size() << " commands"
126 << dendl;
127 }
128
129 void MgrMonitor::get_store_prefixes(std::set<string>& s) const
130 {
131 s.insert(service_name);
132 s.insert(command_descs_prefix);
133 s.insert(MGR_METADATA_PREFIX);
134 }
135
136 void MgrMonitor::update_from_paxos(bool *need_bootstrap)
137 {
138 version_t version = get_last_committed();
139 if (version != map.epoch) {
140 dout(4) << "loading version " << version << dendl;
141
142 bufferlist bl;
143 int err = get_version(version, bl);
144 ceph_assert(err == 0);
145
146 bool old_available = map.get_available();
147 uint64_t old_gid = map.get_active_gid();
148
149 auto p = bl.cbegin();
150 map.decode(p);
151
152 dout(4) << "active server: " << map.active_addrs
153 << "(" << map.active_gid << ")" << dendl;
154
155 ever_had_active_mgr = get_value("ever_had_active_mgr");
156
157 load_health();
158
159 if (map.available) {
160 first_seen_inactive = utime_t();
161 } else {
162 first_seen_inactive = ceph_clock_now();
163 }
164
165 check_subs();
166
167 if (version == 1
168 || command_descs.empty()
169 || (map.get_available()
170 && (!old_available || old_gid != map.get_active_gid()))) {
171 dout(4) << "mkfs or daemon transitioned to available, loading commands"
172 << dendl;
173 bufferlist loaded_commands;
174 int r = mon->store->get(command_descs_prefix, "", loaded_commands);
175 if (r < 0) {
176 derr << "Failed to load mgr commands: " << cpp_strerror(r) << dendl;
177 } else {
178 auto p = loaded_commands.cbegin();
179 decode(command_descs, p);
180 }
181 }
182 }
183
184 // populate module options
185 mgr_module_options.clear();
186 misc_option_strings.clear();
187 for (auto& i : map.available_modules) {
188 for (auto& j : i.module_options) {
189 string name = string("mgr/") + i.name + "/" + j.second.name;
190 auto p = mgr_module_options.emplace(
191 name,
192 Option(name, static_cast<Option::type_t>(j.second.type),
193 static_cast<Option::level_t>(j.second.level)));
194 Option& opt = p.first->second;
195 opt.set_flags(static_cast<Option::flag_t>(j.second.flags));
196 opt.set_flag(Option::FLAG_MGR);
197 opt.set_description(j.second.desc.c_str());
198 opt.set_long_description(j.second.long_desc.c_str());
199 for (auto& k : j.second.tags) {
200 opt.add_tag(k.c_str());
201 }
202 for (auto& k : j.second.see_also) {
203 if (i.module_options.count(k)) {
204 // it's another module option
205 misc_option_strings.push_back(string("mgr/") + i.name + "/" + k);
206 opt.add_see_also(misc_option_strings.back().c_str());
207 } else {
208 // it's a native option
209 opt.add_see_also(k.c_str());
210 }
211 }
212 Option::value_t v, v2;
213 std::string err;
214 if (j.second.default_value.size() &&
215 !opt.parse_value(j.second.default_value, &v, &err)) {
216 opt.set_default(v);
217 }
218 if (j.second.min.size() &&
219 j.second.max.size() &&
220 !opt.parse_value(j.second.min, &v, &err) &&
221 !opt.parse_value(j.second.max, &v2, &err)) {
222 opt.set_min_max(v, v2);
223 }
224 std::vector<const char *> enum_allowed;
225 for (auto& k : j.second.enum_allowed) {
226 enum_allowed.push_back(k.c_str());
227 }
228 opt.set_enum_allowed(enum_allowed);
229 }
230 }
231 // force ConfigMonitor to refresh, since it uses const Option *
232 // pointers into our mgr_module_options (which we just rebuilt).
233 mon->configmon()->load_config();
234
235 if (!mon->is_init()) {
236 // feed our pet MgrClient, unless we are in Monitor::[pre]init()
237 prime_mgr_client();
238 }
239 }
240
241 void MgrMonitor::prime_mgr_client()
242 {
243 dout(10) << __func__ << dendl;
244 mon->mgr_client.ms_dispatch2(make_message<MMgrMap>(map));
245 }
246
247 void MgrMonitor::create_pending()
248 {
249 pending_map = map;
250 pending_map.epoch++;
251 }
252
253 health_status_t MgrMonitor::should_warn_about_mgr_down()
254 {
255 utime_t now = ceph_clock_now();
256 // we warn if we have osds AND we've exceeded the grace period
257 // which means a new mon cluster and be HEALTH_OK indefinitely as long as
258 // no OSDs are ever created.
259 if (mon->osdmon()->osdmap.get_num_osds() > 0 &&
260 now > mon->monmap->created + g_conf().get_val<int64_t>("mon_mgr_mkfs_grace")) {
261 health_status_t level = HEALTH_WARN;
262 if (first_seen_inactive != utime_t() &&
263 now - first_seen_inactive > g_conf().get_val<int64_t>("mon_mgr_inactive_grace")) {
264 level = HEALTH_ERR;
265 }
266 return level;
267 }
268 return HEALTH_OK;
269 }
270
271 void MgrMonitor::post_paxos_update()
272 {
273 // are we handling digest subscribers?
274 if (digest_event) {
275 bool send = false;
276 if (prev_health_checks.empty()) {
277 prev_health_checks.resize(mon->paxos_service.size());
278 send = true;
279 }
280 ceph_assert(prev_health_checks.size() == mon->paxos_service.size());
281 for (auto i = 0u; i < prev_health_checks.size(); i++) {
282 const auto& curr = mon->paxos_service[i]->get_health_checks();
283 if (!send && curr != prev_health_checks[i]) {
284 send = true;
285 }
286 prev_health_checks[i] = curr;
287 }
288 if (send) {
289 if (is_active()) {
290 send_digests();
291 } else {
292 cancel_timer();
293 wait_for_active_ctx(new C_MonContext{mon, [this](int) {
294 send_digests();
295 }});
296 }
297 }
298 }
299 }
300
301 void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t)
302 {
303 dout(10) << __func__ << " " << pending_map << dendl;
304 bufferlist bl;
305 pending_map.encode(bl, mon->get_quorum_con_features());
306 put_version(t, pending_map.epoch, bl);
307 put_last_committed(t, pending_map.epoch);
308
309 for (auto& p : pending_metadata) {
310 dout(10) << __func__ << " set metadata for " << p.first << dendl;
311 t->put(MGR_METADATA_PREFIX, p.first, p.second);
312 }
313 for (auto& name : pending_metadata_rm) {
314 dout(10) << __func__ << " rm metadata for " << name << dendl;
315 t->erase(MGR_METADATA_PREFIX, name);
316 }
317 pending_metadata.clear();
318 pending_metadata_rm.clear();
319
320 health_check_map_t next;
321 if (pending_map.active_gid == 0) {
322 auto level = should_warn_about_mgr_down();
323 if (level != HEALTH_OK) {
324 next.add("MGR_DOWN", level, "no active mgr", 0);
325 } else {
326 dout(10) << __func__ << " no health warning (never active and new cluster)"
327 << dendl;
328 }
329 } else {
330 put_value(t, "ever_had_active_mgr", 1);
331 }
332 encode_health(next, t);
333
334 if (pending_command_descs.size()) {
335 dout(4) << __func__ << " encoding " << pending_command_descs.size()
336 << " command_descs" << dendl;
337 for (auto& p : pending_command_descs) {
338 p.set_flag(MonCommand::FLAG_MGR);
339 }
340 bufferlist bl;
341 encode(pending_command_descs, bl);
342 t->put(command_descs_prefix, "", bl);
343 pending_command_descs.clear();
344 }
345 }
346
347 bool MgrMonitor::check_caps(MonOpRequestRef op, const uuid_d& fsid)
348 {
349 // check permissions
350 MonSession *session = op->get_session();
351 if (!session)
352 return false;
353 if (!session->is_capable("mgr", MON_CAP_X)) {
354 dout(1) << __func__ << " insufficient caps " << session->caps << dendl;
355 return false;
356 }
357 if (fsid != mon->monmap->fsid) {
358 dout(1) << __func__ << " op fsid " << fsid
359 << " != " << mon->monmap->fsid << dendl;
360 return false;
361 }
362 return true;
363 }
364
365 bool MgrMonitor::preprocess_query(MonOpRequestRef op)
366 {
367 auto m = op->get_req<PaxosServiceMessage>();
368 switch (m->get_type()) {
369 case MSG_MGR_BEACON:
370 return preprocess_beacon(op);
371 case MSG_MON_COMMAND:
372 try {
373 return preprocess_command(op);
374 } catch (const bad_cmd_get& e) {
375 bufferlist bl;
376 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
377 return true;
378 }
379
380 default:
381 mon->no_reply(op);
382 derr << "Unhandled message type " << m->get_type() << dendl;
383 return true;
384 }
385 }
386
387 bool MgrMonitor::prepare_update(MonOpRequestRef op)
388 {
389 auto m = op->get_req<PaxosServiceMessage>();
390 switch (m->get_type()) {
391 case MSG_MGR_BEACON:
392 return prepare_beacon(op);
393
394 case MSG_MON_COMMAND:
395 try {
396 return prepare_command(op);
397 } catch (const bad_cmd_get& e) {
398 bufferlist bl;
399 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
400 return true;
401 }
402
403 default:
404 mon->no_reply(op);
405 derr << "Unhandled message type " << m->get_type() << dendl;
406 return true;
407 }
408 }
409
410
411
412 class C_Updated : public Context {
413 MgrMonitor *mm;
414 MonOpRequestRef op;
415 public:
416 C_Updated(MgrMonitor *a, MonOpRequestRef c) :
417 mm(a), op(c) {}
418 void finish(int r) override {
419 if (r >= 0) {
420 // Success
421 } else if (r == -ECANCELED) {
422 mm->mon->no_reply(op);
423 } else {
424 mm->dispatch(op); // try again
425 }
426 }
427 };
428
429 bool MgrMonitor::preprocess_beacon(MonOpRequestRef op)
430 {
431 auto m = op->get_req<MMgrBeacon>();
432 mon->no_reply(op); // we never reply to beacons
433 dout(4) << "beacon from " << m->get_gid() << dendl;
434
435 if (!check_caps(op, m->get_fsid())) {
436 // drop it on the floor
437 return true;
438 }
439
440 // always send this to the leader's prepare_beacon()
441 return false;
442 }
443
444 bool MgrMonitor::prepare_beacon(MonOpRequestRef op)
445 {
446 auto m = op->get_req<MMgrBeacon>();
447 dout(4) << "beacon from " << m->get_gid() << dendl;
448
449 // See if we are seeing same name, new GID for the active daemon
450 if (m->get_name() == pending_map.active_name
451 && m->get_gid() != pending_map.active_gid)
452 {
453 dout(4) << "Active daemon restart (mgr." << m->get_name() << ")" << dendl;
454 mon->clog->info() << "Active manager daemon " << m->get_name()
455 << " restarted";
456 if (!mon->osdmon()->is_writeable()) {
457 dout(1) << __func__ << ": waiting for osdmon writeable to"
458 " blacklist old instance." << dendl;
459 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
460 return false;
461 }
462 drop_active();
463 }
464
465 // See if we are seeing same name, new GID for any standbys
466 for (const auto &i : pending_map.standbys) {
467 const MgrMap::StandbyInfo &s = i.second;
468 if (s.name == m->get_name() && s.gid != m->get_gid()) {
469 dout(4) << "Standby daemon restart (mgr." << m->get_name() << ")" << dendl;
470 mon->clog->debug() << "Standby manager daemon " << m->get_name()
471 << " restarted";
472 drop_standby(i.first);
473 break;
474 }
475 }
476
477 last_beacon[m->get_gid()] = ceph::coarse_mono_clock::now();
478
479 // Track whether we modified pending_map
480 bool updated = false;
481
482 if (pending_map.active_gid == m->get_gid()) {
483 if (pending_map.services != m->get_services()) {
484 dout(4) << "updated services from mgr." << m->get_name()
485 << ": " << m->get_services() << dendl;
486 pending_map.services = m->get_services();
487 updated = true;
488 }
489
490 // A beacon from the currently active daemon
491 if (pending_map.active_addrs != m->get_server_addrs()) {
492 dout(4) << "learned address " << m->get_server_addrs()
493 << " (was " << pending_map.active_addrs << ")" << dendl;
494 pending_map.active_addrs = m->get_server_addrs();
495 updated = true;
496 }
497
498 if (pending_map.get_available() != m->get_available()) {
499 dout(4) << "available " << m->get_gid() << dendl;
500 mon->clog->info() << "Manager daemon " << pending_map.active_name
501 << " is now available";
502
503 // This beacon should include command descriptions
504 pending_command_descs = m->get_command_descs();
505 if (pending_command_descs.empty()) {
506 // This should not happen, but it also isn't fatal: we just
507 // won't successfully update our list of commands.
508 dout(4) << "First available beacon from " << pending_map.active_name
509 << "(" << m->get_gid() << ") does not include command descs"
510 << dendl;
511 } else {
512 dout(4) << "First available beacon from " << pending_map.active_name
513 << "(" << m->get_gid() << ") includes "
514 << pending_command_descs.size() << " command descs" << dendl;
515 }
516
517 pending_map.available = m->get_available();
518 updated = true;
519 }
520 if (pending_map.available_modules != m->get_available_modules()) {
521 dout(4) << "available_modules " << m->get_available_modules()
522 << " (was " << pending_map.available_modules << ")" << dendl;
523 pending_map.available_modules = m->get_available_modules();
524 updated = true;
525 }
526 const auto& clients = m->get_clients();
527 if (pending_map.clients != clients) {
528 dout(4) << "active's RADOS clients " << clients
529 << " (was " << pending_map.clients << ")" << dendl;
530 pending_map.clients = clients;
531 updated = true;
532 }
533 } else if (pending_map.active_gid == 0) {
534 // There is no currently active daemon, select this one.
535 if (pending_map.standbys.count(m->get_gid())) {
536 drop_standby(m->get_gid(), false);
537 }
538 dout(4) << "selecting new active " << m->get_gid()
539 << " " << m->get_name()
540 << " (was " << pending_map.active_gid << " "
541 << pending_map.active_name << ")" << dendl;
542 pending_map.active_gid = m->get_gid();
543 pending_map.active_name = m->get_name();
544 pending_map.active_change = ceph_clock_now();
545 pending_map.active_mgr_features = m->get_mgr_features();
546 pending_map.available_modules = m->get_available_modules();
547 encode(m->get_metadata(), pending_metadata[m->get_name()]);
548 pending_metadata_rm.erase(m->get_name());
549
550 mon->clog->info() << "Activating manager daemon "
551 << pending_map.active_name;
552
553 updated = true;
554 } else {
555 if (pending_map.standbys.count(m->get_gid()) > 0) {
556 dout(10) << "from existing standby " << m->get_gid() << dendl;
557 if (pending_map.standbys[m->get_gid()].available_modules !=
558 m->get_available_modules()) {
559 dout(10) << "existing standby " << m->get_gid() << " available_modules "
560 << m->get_available_modules() << " (was "
561 << pending_map.standbys[m->get_gid()].available_modules << ")"
562 << dendl;
563 pending_map.standbys[m->get_gid()].available_modules =
564 m->get_available_modules();
565 updated = true;
566 }
567 } else {
568 dout(10) << "new standby " << m->get_gid() << dendl;
569 mon->clog->debug() << "Standby manager daemon " << m->get_name()
570 << " started";
571 pending_map.standbys[m->get_gid()] = {m->get_gid(), m->get_name(),
572 m->get_available_modules(),
573 m->get_mgr_features()};
574 encode(m->get_metadata(), pending_metadata[m->get_name()]);
575 pending_metadata_rm.erase(m->get_name());
576 updated = true;
577 }
578 }
579
580 if (updated) {
581 dout(4) << "updating map" << dendl;
582 wait_for_finished_proposal(op, new C_Updated(this, op));
583 } else {
584 dout(10) << "no change" << dendl;
585 }
586
587 return updated;
588 }
589
590 void MgrMonitor::check_subs()
591 {
592 const std::string type = "mgrmap";
593 if (mon->session_map.subs.count(type) == 0)
594 return;
595 for (auto sub : *(mon->session_map.subs[type])) {
596 check_sub(sub);
597 }
598 }
599
600 void MgrMonitor::check_sub(Subscription *sub)
601 {
602 if (sub->type == "mgrmap") {
603 if (sub->next <= map.get_epoch()) {
604 dout(20) << "Sending map to subscriber " << sub->session->con
605 << " " << sub->session->con->get_peer_addr() << dendl;
606 sub->session->con->send_message2(make_message<MMgrMap>(map));
607 if (sub->onetime) {
608 mon->session_map.remove_sub(sub);
609 } else {
610 sub->next = map.get_epoch() + 1;
611 }
612 }
613 } else {
614 ceph_assert(sub->type == "mgrdigest");
615 if (sub->next == 0) {
616 // new registration; cancel previous timer
617 cancel_timer();
618 }
619 if (digest_event == nullptr) {
620 send_digests();
621 }
622 }
623 }
624
625 /**
626 * Handle digest subscriptions separately (outside of check_sub) because
627 * they are going to be periodic rather than version-driven.
628 */
629 void MgrMonitor::send_digests()
630 {
631 cancel_timer();
632
633 const std::string type = "mgrdigest";
634 if (mon->session_map.subs.count(type) == 0) {
635 prev_health_checks.clear();
636 return;
637 }
638
639 if (!is_active()) {
640 // if paxos is currently not active, don't send a digest but reenable timer
641 goto timer;
642 }
643 dout(10) << __func__ << dendl;
644
645 for (auto sub : *(mon->session_map.subs[type])) {
646 dout(10) << __func__ << " sending digest to subscriber " << sub->session->con
647 << " " << sub->session->con->get_peer_addr() << dendl;
648 auto mdigest = make_message<MMgrDigest>();
649
650 JSONFormatter f;
651 mon->healthmon()->get_health_status(true, &f, nullptr, nullptr, nullptr);
652 f.flush(mdigest->health_json);
653 f.reset();
654
655 mon->get_mon_status(&f);
656 f.flush(mdigest->mon_status_json);
657 f.reset();
658
659 sub->session->con->send_message2(mdigest);
660 }
661
662 timer:
663 digest_event = mon->timer.add_event_after(
664 g_conf().get_val<int64_t>("mon_mgr_digest_period"),
665 new C_MonContext{mon, [this](int) {
666 send_digests();
667 }});
668 }
669
670 void MgrMonitor::cancel_timer()
671 {
672 if (digest_event) {
673 mon->timer.cancel_event(digest_event);
674 digest_event = nullptr;
675 }
676 }
677
678 void MgrMonitor::on_active()
679 {
680 if (!mon->is_leader()) {
681 return;
682 }
683 mon->clog->debug() << "mgrmap e" << map.epoch << ": " << map;
684 if (!HAVE_FEATURE(mon->get_quorum_con_features(), SERVER_NAUTILUS)) {
685 return;
686 }
687 if (pending_map.always_on_modules == always_on_modules) {
688 return;
689 }
690 dout(4) << "always on modules changed, pending "
691 << pending_map.always_on_modules << " != wanted "
692 << always_on_modules << dendl;
693 pending_map.always_on_modules = always_on_modules;
694 propose_pending();
695 }
696
697 void MgrMonitor::tick()
698 {
699 if (!is_active() || !mon->is_leader())
700 return;
701
702 const auto now = ceph::coarse_mono_clock::now();
703
704 const auto mgr_beacon_grace =
705 g_conf().get_val<std::chrono::seconds>("mon_mgr_beacon_grace");
706
707 // Note that this is the mgr daemon's tick period, not ours (the
708 // beacon is sent with this period).
709 const auto mgr_tick_period =
710 g_conf().get_val<std::chrono::seconds>("mgr_tick_period");
711
712 if (last_tick != ceph::coarse_mono_clock::time_point::min()
713 && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) {
714 // This case handles either local slowness (calls being delayed
715 // for whatever reason) or cluster election slowness (a long gap
716 // between calls while an election happened)
717 dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
718 "(slow election?) of " << now - last_tick << " seconds" << dendl;
719 for (auto &i : last_beacon) {
720 i.second = now;
721 }
722 }
723
724 last_tick = now;
725
726 // Populate any missing beacons (i.e. no beacon since MgrMonitor
727 // instantiation) with the current time, so that they will
728 // eventually look laggy if they fail to give us a beacon.
729 if (pending_map.active_gid != 0
730 && last_beacon.count(pending_map.active_gid) == 0) {
731 last_beacon[pending_map.active_gid] = now;
732 }
733 for (auto s : pending_map.standbys) {
734 if (last_beacon.count(s.first) == 0) {
735 last_beacon[s.first] = now;
736 }
737 }
738
739 // Cull standbys first so that any remaining standbys
740 // will be eligible to take over from the active if we cull him.
741 std::list<uint64_t> dead_standbys;
742 const auto cutoff = now - mgr_beacon_grace;
743 for (const auto &i : pending_map.standbys) {
744 auto last_beacon_time = last_beacon.at(i.first);
745 if (last_beacon_time < cutoff) {
746 dead_standbys.push_back(i.first);
747 }
748 }
749
750 bool propose = false;
751
752 for (auto i : dead_standbys) {
753 dout(4) << "Dropping laggy standby " << i << dendl;
754 drop_standby(i);
755 propose = true;
756 }
757
758 if (pending_map.active_gid != 0
759 && last_beacon.at(pending_map.active_gid) < cutoff
760 && mon->osdmon()->is_writeable()) {
761 const std::string old_active_name = pending_map.active_name;
762 drop_active();
763 propose = true;
764 dout(4) << "Dropping active" << pending_map.active_gid << dendl;
765 if (promote_standby()) {
766 dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
767 mon->clog->info() << "Manager daemon " << old_active_name
768 << " is unresponsive, replacing it with standby"
769 << " daemon " << pending_map.active_name;
770 } else {
771 dout(4) << "Active is laggy but have no standbys to replace it" << dendl;
772 mon->clog->info() << "Manager daemon " << old_active_name
773 << " is unresponsive. No standby daemons available.";
774 }
775 } else if (pending_map.active_gid == 0) {
776 if (promote_standby()) {
777 dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
778 mon->clog->info() << "Activating manager daemon "
779 << pending_map.active_name;
780 propose = true;
781 }
782 }
783
784 if (!pending_map.available &&
785 !ever_had_active_mgr &&
786 should_warn_about_mgr_down() != HEALTH_OK) {
787 dout(10) << " exceeded mon_mgr_mkfs_grace "
788 << g_conf().get_val<int64_t>("mon_mgr_mkfs_grace")
789 << " seconds" << dendl;
790 propose = true;
791 }
792
793 if (propose) {
794 propose_pending();
795 }
796 }
797
798 void MgrMonitor::on_restart()
799 {
800 // Clear out the leader-specific state.
801 last_beacon.clear();
802 last_tick = ceph::coarse_mono_clock::now();
803 }
804
805
806 bool MgrMonitor::promote_standby()
807 {
808 ceph_assert(pending_map.active_gid == 0);
809 if (pending_map.standbys.size()) {
810 // Promote a replacement (arbitrary choice of standby)
811 auto replacement_gid = pending_map.standbys.begin()->first;
812 pending_map.active_gid = replacement_gid;
813 pending_map.active_name = pending_map.standbys.at(replacement_gid).name;
814 pending_map.active_mgr_features =
815 pending_map.standbys.at(replacement_gid).mgr_features;
816 pending_map.available = false;
817 pending_map.active_addrs = entity_addrvec_t();
818 pending_map.active_change = ceph_clock_now();
819
820 drop_standby(replacement_gid, false);
821
822 return true;
823 } else {
824 return false;
825 }
826 }
827
828 void MgrMonitor::drop_active()
829 {
830 ceph_assert(mon->osdmon()->is_writeable());
831
832 if (last_beacon.count(pending_map.active_gid) > 0) {
833 last_beacon.erase(pending_map.active_gid);
834 }
835
836 ceph_assert(pending_map.active_gid > 0);
837 auto until = ceph_clock_now();
838 until += g_conf().get_val<double>("mon_mgr_blacklist_interval");
839 dout(5) << "blacklisting previous mgr." << pending_map.active_name << "."
840 << pending_map.active_gid << " ("
841 << pending_map.active_addrs << ")" << dendl;
842 auto blacklist_epoch = mon->osdmon()->blacklist(pending_map.active_addrs, until);
843
844 /* blacklist RADOS clients in use by the mgr */
845 for (const auto& a : pending_map.clients) {
846 mon->osdmon()->blacklist(a, until);
847 }
848 request_proposal(mon->osdmon());
849
850 pending_metadata_rm.insert(pending_map.active_name);
851 pending_metadata.erase(pending_map.active_name);
852 pending_map.active_name = "";
853 pending_map.active_gid = 0;
854 pending_map.active_change = ceph_clock_now();
855 pending_map.active_mgr_features = 0;
856 pending_map.available = false;
857 pending_map.active_addrs = entity_addrvec_t();
858 pending_map.services.clear();
859 pending_map.clients.clear();
860 pending_map.last_failure_osd_epoch = blacklist_epoch;
861
862 // So that when new active mgr subscribes to mgrdigest, it will
863 // get an immediate response instead of waiting for next timer
864 cancel_timer();
865 }
866
867 void MgrMonitor::drop_standby(uint64_t gid, bool drop_meta)
868 {
869 if (drop_meta) {
870 pending_metadata_rm.insert(pending_map.standbys[gid].name);
871 pending_metadata.erase(pending_map.standbys[gid].name);
872 }
873 pending_map.standbys.erase(gid);
874 if (last_beacon.count(gid) > 0) {
875 last_beacon.erase(gid);
876 }
877 }
878
879 bool MgrMonitor::preprocess_command(MonOpRequestRef op)
880 {
881 auto m = op->get_req<MMonCommand>();
882 std::stringstream ss;
883 bufferlist rdata;
884
885 cmdmap_t cmdmap;
886 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
887 string rs = ss.str();
888 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
889 return true;
890 }
891
892 MonSession *session = op->get_session();
893 if (!session) {
894 mon->reply_command(op, -EACCES, "access denied", rdata,
895 get_last_committed());
896 return true;
897 }
898
899 string format;
900 cmd_getval(cmdmap, "format", format);
901 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
902 "json-pretty"));
903
904 string prefix;
905 cmd_getval(cmdmap, "prefix", prefix);
906 int r = 0;
907
908 if (prefix == "mgr dump") {
909 int64_t epoch = 0;
910 cmd_getval(cmdmap, "epoch", epoch, (int64_t)map.get_epoch());
911 if (epoch == (int64_t)map.get_epoch()) {
912 f->dump_object("mgrmap", map);
913 } else {
914 bufferlist bl;
915 int err = get_version(epoch, bl);
916 if (err == -ENOENT) {
917 r = -ENOENT;
918 ss << "there is no map for epoch " << epoch;
919 goto reply;
920 }
921 MgrMap m;
922 auto p = bl.cbegin();
923 m.decode(p);
924 f->dump_object("mgrmap", m);
925 }
926 f->flush(rdata);
927 } else if (prefix == "mgr module ls") {
928 f->open_object_section("modules");
929 {
930 f->open_array_section("always_on_modules");
931 for (auto& p : map.get_always_on_modules()) {
932 f->dump_string("module", p);
933 }
934 f->close_section();
935 f->open_array_section("enabled_modules");
936 for (auto& p : map.modules) {
937 if (map.get_always_on_modules().count(p) > 0)
938 continue;
939 // We only show the name for enabled modules. The any errors
940 // etc will show up as a health checks.
941 f->dump_string("module", p);
942 }
943 f->close_section();
944 f->open_array_section("disabled_modules");
945 for (auto& p : map.available_modules) {
946 if (map.modules.count(p.name) == 0 &&
947 map.get_always_on_modules().count(p.name) == 0) {
948 // For disabled modules, we show the full info, to
949 // give a hint about whether enabling it will work
950 p.dump(f.get());
951 }
952 }
953 f->close_section();
954 }
955 f->close_section();
956 f->flush(rdata);
957 } else if (prefix == "mgr services") {
958 f->open_object_section("services");
959 for (const auto &i : map.services) {
960 f->dump_string(i.first.c_str(), i.second);
961 }
962 f->close_section();
963 f->flush(rdata);
964 } else if (prefix == "mgr metadata") {
965 string name;
966 cmd_getval(cmdmap, "who", name);
967 if (name.size() > 0 && !map.have_name(name)) {
968 ss << "mgr." << name << " does not exist";
969 r = -ENOENT;
970 goto reply;
971 }
972 string format;
973 cmd_getval(cmdmap, "format", format);
974 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
975 if (name.size()) {
976 f->open_object_section("mgr_metadata");
977 f->dump_string("name", name);
978 r = dump_metadata(name, f.get(), &ss);
979 if (r < 0)
980 goto reply;
981 f->close_section();
982 } else {
983 r = 0;
984 f->open_array_section("mgr_metadata");
985 for (auto& i : map.get_all_names()) {
986 f->open_object_section("mgr");
987 f->dump_string("name", i);
988 r = dump_metadata(i, f.get(), NULL);
989 if (r == -EINVAL || r == -ENOENT) {
990 // Drop error, continue to get other daemons' metadata
991 dout(4) << "No metadata for mgr." << i << dendl;
992 r = 0;
993 } else if (r < 0) {
994 // Unexpected error
995 goto reply;
996 }
997 f->close_section();
998 }
999 f->close_section();
1000 }
1001 f->flush(rdata);
1002 } else if (prefix == "mgr versions") {
1003 count_metadata("ceph_version", f.get());
1004 f->flush(rdata);
1005 r = 0;
1006 } else if (prefix == "mgr count-metadata") {
1007 string field;
1008 cmd_getval(cmdmap, "property", field);
1009 count_metadata(field, f.get());
1010 f->flush(rdata);
1011 r = 0;
1012 } else {
1013 return false;
1014 }
1015
1016 reply:
1017 string rs;
1018 getline(ss, rs);
1019 mon->reply_command(op, r, rs, rdata, get_last_committed());
1020 return true;
1021 }
1022
1023 bool MgrMonitor::prepare_command(MonOpRequestRef op)
1024 {
1025 auto m = op->get_req<MMonCommand>();
1026
1027 std::stringstream ss;
1028 bufferlist rdata;
1029
1030 cmdmap_t cmdmap;
1031 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1032 string rs = ss.str();
1033 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1034 return true;
1035 }
1036
1037 MonSession *session = op->get_session();
1038 if (!session) {
1039 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1040 return true;
1041 }
1042
1043 string format;
1044 cmd_getval(cmdmap, "format", format, string("plain"));
1045 boost::scoped_ptr<Formatter> f(Formatter::create(format));
1046
1047 string prefix;
1048 cmd_getval(cmdmap, "prefix", prefix);
1049
1050 int r = 0;
1051
1052 if (prefix == "mgr fail") {
1053 string who;
1054 if (!cmd_getval(cmdmap, "who", who)) {
1055 if (!map.active_gid) {
1056 ss << "Currently no active mgr";
1057 goto out;
1058 }
1059 who = map.active_name;
1060 }
1061
1062 std::string err;
1063 uint64_t gid = strict_strtol(who.c_str(), 10, &err);
1064 bool changed = false;
1065 if (!err.empty()) {
1066 // Does not parse as a gid, treat it as a name
1067 if (pending_map.active_name == who) {
1068 if (!mon->osdmon()->is_writeable()) {
1069 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1070 return false;
1071 }
1072 drop_active();
1073 changed = true;
1074 } else {
1075 gid = 0;
1076 for (const auto &i : pending_map.standbys) {
1077 if (i.second.name == who) {
1078 gid = i.first;
1079 break;
1080 }
1081 }
1082 if (gid != 0) {
1083 drop_standby(gid);
1084 changed = true;
1085 } else {
1086 ss << "Daemon not found '" << who << "', already failed?";
1087 }
1088 }
1089 } else {
1090 if (pending_map.active_gid == gid) {
1091 if (!mon->osdmon()->is_writeable()) {
1092 mon->osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1093 return false;
1094 }
1095 drop_active();
1096 changed = true;
1097 } else if (pending_map.standbys.count(gid) > 0) {
1098 drop_standby(gid);
1099 changed = true;
1100 } else {
1101 ss << "Daemon not found '" << gid << "', already failed?";
1102 }
1103 }
1104
1105 if (changed && pending_map.active_gid == 0) {
1106 promote_standby();
1107 }
1108 } else if (prefix == "mgr module enable") {
1109 string module;
1110 cmd_getval(cmdmap, "module", module);
1111 if (module.empty()) {
1112 r = -EINVAL;
1113 goto out;
1114 }
1115 if (pending_map.get_always_on_modules().count(module) > 0) {
1116 ss << "module '" << module << "' is already enabled (always-on)";
1117 goto out;
1118 }
1119 string force;
1120 cmd_getval(cmdmap, "force", force);
1121 if (!pending_map.all_support_module(module) &&
1122 force != "--force") {
1123 ss << "all mgr daemons do not support module '" << module << "', pass "
1124 << "--force to force enablement";
1125 r = -ENOENT;
1126 goto out;
1127 }
1128
1129 std::string can_run_error;
1130 if (force != "--force" && !pending_map.can_run_module(module, &can_run_error)) {
1131 ss << "module '" << module << "' reports that it cannot run on the active "
1132 "manager daemon: " << can_run_error << " (pass --force to force "
1133 "enablement)";
1134 r = -ENOENT;
1135 goto out;
1136 }
1137
1138 if (pending_map.module_enabled(module)) {
1139 ss << "module '" << module << "' is already enabled";
1140 r = 0;
1141 goto out;
1142 }
1143 pending_map.modules.insert(module);
1144 } else if (prefix == "mgr module disable") {
1145 string module;
1146 cmd_getval(cmdmap, "module", module);
1147 if (module.empty()) {
1148 r = -EINVAL;
1149 goto out;
1150 }
1151 if (pending_map.get_always_on_modules().count(module) > 0) {
1152 ss << "module '" << module << "' cannot be disabled (always-on)";
1153 r = -EINVAL;
1154 goto out;
1155 }
1156 if (!pending_map.module_enabled(module)) {
1157 ss << "module '" << module << "' is already disabled";
1158 r = 0;
1159 goto out;
1160 }
1161 if (!pending_map.any_supports_module(module)) {
1162 ss << "module '" << module << "' does not exist";
1163 }
1164 pending_map.modules.erase(module);
1165 } else {
1166 ss << "Command '" << prefix << "' not implemented!";
1167 r = -ENOSYS;
1168 }
1169
1170 out:
1171 dout(4) << __func__ << " done, r=" << r << dendl;
1172 /* Compose response */
1173 string rs;
1174 getline(ss, rs);
1175
1176 if (r >= 0) {
1177 // success.. delay reply
1178 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1179 get_last_committed() + 1));
1180 return true;
1181 } else {
1182 // reply immediately
1183 mon->reply_command(op, r, rs, rdata, get_last_committed());
1184 return false;
1185 }
1186 }
1187
1188 void MgrMonitor::init()
1189 {
1190 if (digest_event == nullptr) {
1191 send_digests(); // To get it to schedule its own event
1192 }
1193 }
1194
1195 void MgrMonitor::on_shutdown()
1196 {
1197 cancel_timer();
1198 }
1199
1200 int MgrMonitor::load_metadata(const string& name, std::map<string, string>& m,
1201 ostream *err) const
1202 {
1203 bufferlist bl;
1204 int r = mon->store->get(MGR_METADATA_PREFIX, name, bl);
1205 if (r < 0)
1206 return r;
1207 try {
1208 auto p = bl.cbegin();
1209 decode(m, p);
1210 }
1211 catch (buffer::error& e) {
1212 if (err)
1213 *err << "mgr." << name << " metadata is corrupt";
1214 return -EIO;
1215 }
1216 return 0;
1217 }
1218
1219 void MgrMonitor::count_metadata(const string& field, std::map<string,int> *out)
1220 {
1221 std::set<string> ls = map.get_all_names();
1222 for (auto& name : ls) {
1223 std::map<string,string> meta;
1224 load_metadata(name, meta, nullptr);
1225 auto p = meta.find(field);
1226 if (p == meta.end()) {
1227 (*out)["unknown"]++;
1228 } else {
1229 (*out)[p->second]++;
1230 }
1231 }
1232 }
1233
1234 void MgrMonitor::count_metadata(const string& field, Formatter *f)
1235 {
1236 std::map<string,int> by_val;
1237 count_metadata(field, &by_val);
1238 f->open_object_section(field.c_str());
1239 for (auto& p : by_val) {
1240 f->dump_int(p.first.c_str(), p.second);
1241 }
1242 f->close_section();
1243 }
1244
1245 int MgrMonitor::dump_metadata(const string& name, Formatter *f, ostream *err)
1246 {
1247 std::map<string,string> m;
1248 if (int r = load_metadata(name, m, err))
1249 return r;
1250 for (auto& p : m) {
1251 f->dump_string(p.first.c_str(), p.second);
1252 }
1253 return 0;
1254 }
1255
1256 void MgrMonitor::print_nodes(Formatter *f) const
1257 {
1258 ceph_assert(f);
1259
1260 std::map<string, list<string> > mgrs; // hostname => mgr
1261 auto ls = map.get_all_names();
1262 for (auto& name : ls) {
1263 std::map<string,string> meta;
1264 if (load_metadata(name, meta, nullptr)) {
1265 continue;
1266 }
1267 auto hostname = meta.find("hostname");
1268 if (hostname == meta.end()) {
1269 // not likely though
1270 continue;
1271 }
1272 mgrs[hostname->second].push_back(name);
1273 }
1274
1275 dump_services(f, mgrs, "mgr");
1276 }
1277
1278 const std::vector<MonCommand> &MgrMonitor::get_command_descs() const
1279 {
1280 if (command_descs.empty()) {
1281 // must have just upgraded; fallback to static commands
1282 return mgr_commands;
1283 } else {
1284 return command_descs;
1285 }
1286 }