]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MgrMonitor.cc
import quincy beta 17.1.0
[ceph.git] / ceph / src / mon / MgrMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 John Spray <john.spray@redhat.com>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 */
13
14 #include <boost/tokenizer.hpp>
15
16 #include "messages/MMgrBeacon.h"
17 #include "messages/MMgrMap.h"
18 #include "messages/MMgrDigest.h"
19
20 #include "include/stringify.h"
21 #include "mgr/MgrContext.h"
22 #include "mgr/mgr_commands.h"
23 #include "OSDMonitor.h"
24 #include "ConfigMonitor.h"
25 #include "HealthMonitor.h"
26
27 #include "common/TextTable.h"
28 #include "include/stringify.h"
29
30 #include "MgrMonitor.h"
31
32 #define MGR_METADATA_PREFIX "mgr_metadata"
33
34 #define dout_subsys ceph_subsys_mon
35 #undef dout_prefix
36 #define dout_prefix _prefix(_dout, mon, map)
37 using namespace TOPNSPC::common;
38
39 using std::dec;
40 using std::hex;
41 using std::list;
42 using std::map;
43 using std::make_pair;
44 using std::ostream;
45 using std::ostringstream;
46 using std::pair;
47 using std::set;
48 using std::string;
49 using std::stringstream;
50 using std::to_string;
51 using std::vector;
52
53 using ceph::bufferlist;
54 using ceph::decode;
55 using ceph::encode;
56 using ceph::ErasureCodeInterfaceRef;
57 using ceph::ErasureCodeProfile;
58 using ceph::Formatter;
59 using ceph::JSONFormatter;
60 using ceph::make_message;
61 using ceph::mono_clock;
62 using ceph::mono_time;
63
64 static ostream& _prefix(std::ostream *_dout, Monitor &mon,
65 const MgrMap& mgrmap) {
66 return *_dout << "mon." << mon.name << "@" << mon.rank
67 << "(" << mon.get_state_name()
68 << ").mgr e" << mgrmap.get_epoch() << " ";
69 }
70
71 // the system treats always_on_modules as if they provide built-in functionality
72 // by ensuring that they are always enabled.
73 const static std::map<uint32_t, std::set<std::string>> always_on_modules = {
74 {
75 CEPH_RELEASE_OCTOPUS, {
76 "crash",
77 "status",
78 "progress",
79 "balancer",
80 "devicehealth",
81 "orchestrator",
82 "rbd_support",
83 "volumes",
84 "pg_autoscaler",
85 "telemetry",
86 }
87 },
88 {
89 CEPH_RELEASE_PACIFIC, {
90 "crash",
91 "status",
92 "progress",
93 "balancer",
94 "devicehealth",
95 "orchestrator",
96 "rbd_support",
97 "volumes",
98 "pg_autoscaler",
99 "telemetry",
100 }
101 },
102 {
103 CEPH_RELEASE_QUINCY, {
104 "crash",
105 "status",
106 "progress",
107 "balancer",
108 "devicehealth",
109 "orchestrator",
110 "rbd_support",
111 "volumes",
112 "pg_autoscaler",
113 "telemetry",
114 }
115 },
116 };
117
// Mon store prefix under which the active mgr's command descriptions
// are persisted.
const static std::string command_descs_prefix{"mgr_command_descs"};
120
121 const Option *MgrMonitor::find_module_option(const string& name)
122 {
123 // we have two forms of names: "mgr/$module/$option" and
124 // localized "mgr/$module/$instance/$option". normalize to the
125 // former by stripping out $instance.
126 string real_name;
127 if (name.substr(0, 4) != "mgr/") {
128 return nullptr;
129 }
130 auto second_slash = name.find('/', 5);
131 if (second_slash == std::string::npos) {
132 return nullptr;
133 }
134 auto third_slash = name.find('/', second_slash + 1);
135 if (third_slash != std::string::npos) {
136 // drop the $instance part between the second and third slash
137 real_name = name.substr(0, second_slash) + name.substr(third_slash);
138 } else {
139 real_name = name;
140 }
141 auto p = mgr_module_options.find(real_name);
142 if (p != mgr_module_options.end()) {
143 return &p->second;
144 }
145 return nullptr;
146 }
147
148 version_t MgrMonitor::get_trim_to() const
149 {
150 int64_t max = g_conf().get_val<int64_t>("mon_max_mgrmap_epochs");
151 if (map.epoch > max) {
152 return map.epoch - max;
153 }
154 return 0;
155 }
156
157 void MgrMonitor::create_initial()
158 {
159 // Take a local copy of initial_modules for tokenizer to iterate over.
160 auto initial_modules = g_conf().get_val<std::string>("mgr_initial_modules");
161 boost::tokenizer<> tok(initial_modules);
162 for (auto& m : tok) {
163 pending_map.modules.insert(m);
164 }
165 pending_map.always_on_modules = always_on_modules;
166 pending_command_descs = mgr_commands;
167 dout(10) << __func__ << " initial modules " << pending_map.modules
168 << ", always on modules " << pending_map.get_always_on_modules()
169 << ", " << pending_command_descs.size() << " commands"
170 << dendl;
171 }
172
173 void MgrMonitor::get_store_prefixes(std::set<string>& s) const
174 {
175 s.insert(service_name);
176 s.insert(command_descs_prefix);
177 s.insert(MGR_METADATA_PREFIX);
178 }
179
180 void MgrMonitor::update_from_paxos(bool *need_bootstrap)
181 {
182 version_t version = get_last_committed();
183 if (version != map.epoch) {
184 dout(4) << "loading version " << version << dendl;
185
186 bufferlist bl;
187 int err = get_version(version, bl);
188 ceph_assert(err == 0);
189
190 bool old_available = map.get_available();
191 uint64_t old_gid = map.get_active_gid();
192
193 auto p = bl.cbegin();
194 map.decode(p);
195
196 dout(4) << "active server: " << map.active_addrs
197 << "(" << map.active_gid << ")" << dendl;
198
199 ever_had_active_mgr = get_value("ever_had_active_mgr");
200
201 load_health();
202
203 if (map.available) {
204 first_seen_inactive = utime_t();
205 } else {
206 first_seen_inactive = ceph_clock_now();
207 }
208
209 check_subs();
210
211 if (version == 1
212 || command_descs.empty()
213 || (map.get_available()
214 && (!old_available || old_gid != map.get_active_gid()))) {
215 dout(4) << "mkfs or daemon transitioned to available, loading commands"
216 << dendl;
217 bufferlist loaded_commands;
218 int r = mon.store->get(command_descs_prefix, "", loaded_commands);
219 if (r < 0) {
220 derr << "Failed to load mgr commands: " << cpp_strerror(r) << dendl;
221 } else {
222 auto p = loaded_commands.cbegin();
223 decode(command_descs, p);
224 }
225 }
226 }
227
228 // populate module options
229 mgr_module_options.clear();
230 misc_option_strings.clear();
231 for (auto& i : map.available_modules) {
232 for (auto& j : i.module_options) {
233 string name = string("mgr/") + i.name + "/" + j.second.name;
234 auto p = mgr_module_options.emplace(
235 name,
236 Option(name, static_cast<Option::type_t>(j.second.type),
237 static_cast<Option::level_t>(j.second.level)));
238 Option& opt = p.first->second;
239 opt.set_flags(static_cast<Option::flag_t>(j.second.flags));
240 opt.set_flag(Option::FLAG_MGR);
241 opt.set_description(j.second.desc.c_str());
242 opt.set_long_description(j.second.long_desc.c_str());
243 for (auto& k : j.second.tags) {
244 opt.add_tag(k.c_str());
245 }
246 for (auto& k : j.second.see_also) {
247 if (i.module_options.count(k)) {
248 // it's another module option
249 misc_option_strings.push_back(string("mgr/") + i.name + "/" + k);
250 opt.add_see_also(misc_option_strings.back().c_str());
251 } else {
252 // it's a native option
253 opt.add_see_also(k.c_str());
254 }
255 }
256 Option::value_t v, v2;
257 std::string err;
258 if (j.second.default_value.size() &&
259 !opt.parse_value(j.second.default_value, &v, &err)) {
260 opt.set_default(v);
261 }
262 if (j.second.min.size() &&
263 j.second.max.size() &&
264 !opt.parse_value(j.second.min, &v, &err) &&
265 !opt.parse_value(j.second.max, &v2, &err)) {
266 opt.set_min_max(v, v2);
267 }
268 std::vector<const char *> enum_allowed;
269 for (auto& k : j.second.enum_allowed) {
270 enum_allowed.push_back(k.c_str());
271 }
272 opt.set_enum_allowed(enum_allowed);
273 }
274 }
275 // force ConfigMonitor to refresh, since it uses const Option *
276 // pointers into our mgr_module_options (which we just rebuilt).
277 mon.configmon()->load_config();
278
279 if (!mon.is_init()) {
280 // feed our pet MgrClient, unless we are in Monitor::[pre]init()
281 prime_mgr_client();
282 }
283 }
284
285 void MgrMonitor::prime_mgr_client()
286 {
287 dout(10) << __func__ << dendl;
288 mon.mgr_client.ms_dispatch2(make_message<MMgrMap>(map));
289 }
290
291 void MgrMonitor::create_pending()
292 {
293 pending_map = map;
294 pending_map.epoch++;
295 }
296
297 health_status_t MgrMonitor::should_warn_about_mgr_down()
298 {
299 utime_t now = ceph_clock_now();
300 // we warn if we have osds AND we've exceeded the grace period
301 // which means a new mon cluster and be HEALTH_OK indefinitely as long as
302 // no OSDs are ever created.
303 if (mon.osdmon()->osdmap.get_num_osds() > 0 &&
304 now > mon.monmap->created + g_conf().get_val<int64_t>("mon_mgr_mkfs_grace")) {
305 health_status_t level = HEALTH_WARN;
306 if (first_seen_inactive != utime_t() &&
307 now - first_seen_inactive > g_conf().get_val<int64_t>("mon_mgr_inactive_grace")) {
308 level = HEALTH_ERR;
309 }
310 return level;
311 }
312 return HEALTH_OK;
313 }
314
315 void MgrMonitor::post_paxos_update()
316 {
317 // are we handling digest subscribers?
318 if (digest_event) {
319 bool send = false;
320 if (prev_health_checks.empty()) {
321 prev_health_checks.resize(mon.paxos_service.size());
322 send = true;
323 }
324 ceph_assert(prev_health_checks.size() == mon.paxos_service.size());
325 for (auto i = 0u; i < prev_health_checks.size(); i++) {
326 const auto& curr = mon.paxos_service[i]->get_health_checks();
327 if (!send && curr != prev_health_checks[i]) {
328 send = true;
329 }
330 prev_health_checks[i] = curr;
331 }
332 if (send) {
333 if (is_active()) {
334 send_digests();
335 } else {
336 cancel_timer();
337 wait_for_active_ctx(new C_MonContext{&mon, [this](int) {
338 send_digests();
339 }});
340 }
341 }
342 }
343 }
344
345 void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t)
346 {
347 dout(10) << __func__ << " " << pending_map << dendl;
348 bufferlist bl;
349 pending_map.encode(bl, mon.get_quorum_con_features());
350 put_version(t, pending_map.epoch, bl);
351 put_last_committed(t, pending_map.epoch);
352
353 for (auto& p : pending_metadata) {
354 dout(10) << __func__ << " set metadata for " << p.first << dendl;
355 t->put(MGR_METADATA_PREFIX, p.first, p.second);
356 }
357 for (auto& name : pending_metadata_rm) {
358 dout(10) << __func__ << " rm metadata for " << name << dendl;
359 t->erase(MGR_METADATA_PREFIX, name);
360 }
361 pending_metadata.clear();
362 pending_metadata_rm.clear();
363
364 health_check_map_t next;
365 if (pending_map.active_gid == 0) {
366 auto level = should_warn_about_mgr_down();
367 if (level != HEALTH_OK) {
368 next.add("MGR_DOWN", level, "no active mgr", 0);
369 } else {
370 dout(10) << __func__ << " no health warning (never active and new cluster)"
371 << dendl;
372 }
373 } else {
374 put_value(t, "ever_had_active_mgr", 1);
375 }
376 encode_health(next, t);
377
378 if (pending_command_descs.size()) {
379 dout(4) << __func__ << " encoding " << pending_command_descs.size()
380 << " command_descs" << dendl;
381 for (auto& p : pending_command_descs) {
382 p.set_flag(MonCommand::FLAG_MGR);
383 }
384 bufferlist bl;
385 encode(pending_command_descs, bl);
386 t->put(command_descs_prefix, "", bl);
387 pending_command_descs.clear();
388 }
389 }
390
391 bool MgrMonitor::check_caps(MonOpRequestRef op, const uuid_d& fsid)
392 {
393 // check permissions
394 MonSession *session = op->get_session();
395 if (!session)
396 return false;
397 if (!session->is_capable("mgr", MON_CAP_X)) {
398 dout(1) << __func__ << " insufficient caps " << session->caps << dendl;
399 return false;
400 }
401 if (fsid != mon.monmap->fsid) {
402 dout(1) << __func__ << " op fsid " << fsid
403 << " != " << mon.monmap->fsid << dendl;
404 return false;
405 }
406 return true;
407 }
408
409 bool MgrMonitor::preprocess_query(MonOpRequestRef op)
410 {
411 auto m = op->get_req<PaxosServiceMessage>();
412 switch (m->get_type()) {
413 case MSG_MGR_BEACON:
414 return preprocess_beacon(op);
415 case MSG_MON_COMMAND:
416 try {
417 return preprocess_command(op);
418 } catch (const bad_cmd_get& e) {
419 bufferlist bl;
420 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
421 return true;
422 }
423
424 default:
425 mon.no_reply(op);
426 derr << "Unhandled message type " << m->get_type() << dendl;
427 return true;
428 }
429 }
430
431 bool MgrMonitor::prepare_update(MonOpRequestRef op)
432 {
433 auto m = op->get_req<PaxosServiceMessage>();
434 switch (m->get_type()) {
435 case MSG_MGR_BEACON:
436 return prepare_beacon(op);
437
438 case MSG_MON_COMMAND:
439 try {
440 return prepare_command(op);
441 } catch (const bad_cmd_get& e) {
442 bufferlist bl;
443 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
444 return true;
445 }
446
447 default:
448 mon.no_reply(op);
449 derr << "Unhandled message type " << m->get_type() << dendl;
450 return true;
451 }
452 }
453
454
455
456 class C_Updated : public Context {
457 MgrMonitor *mm;
458 MonOpRequestRef op;
459 public:
460 C_Updated(MgrMonitor *a, MonOpRequestRef c) :
461 mm(a), op(c) {}
462 void finish(int r) override {
463 if (r >= 0) {
464 // Success
465 } else if (r == -ECANCELED) {
466 mm->mon.no_reply(op);
467 } else {
468 mm->dispatch(op); // try again
469 }
470 }
471 };
472
473 bool MgrMonitor::preprocess_beacon(MonOpRequestRef op)
474 {
475 auto m = op->get_req<MMgrBeacon>();
476 mon.no_reply(op); // we never reply to beacons
477 dout(4) << "beacon from " << m->get_gid() << dendl;
478
479 if (!check_caps(op, m->get_fsid())) {
480 // drop it on the floor
481 return true;
482 }
483
484 // always send this to the leader's prepare_beacon()
485 return false;
486 }
487
488 bool MgrMonitor::prepare_beacon(MonOpRequestRef op)
489 {
490 auto m = op->get_req<MMgrBeacon>();
491 dout(4) << "beacon from " << m->get_gid() << dendl;
492
493 // See if we are seeing same name, new GID for the active daemon
494 if (m->get_name() == pending_map.active_name
495 && m->get_gid() != pending_map.active_gid)
496 {
497 dout(4) << "Active daemon restart (mgr." << m->get_name() << ")" << dendl;
498 mon.clog->info() << "Active manager daemon " << m->get_name()
499 << " restarted";
500 if (!mon.osdmon()->is_writeable()) {
501 dout(1) << __func__ << ": waiting for osdmon writeable to"
502 " blocklist old instance." << dendl;
503 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
504 return false;
505 }
506 drop_active();
507 }
508
509 // See if we are seeing same name, new GID for any standbys
510 for (const auto &i : pending_map.standbys) {
511 const MgrMap::StandbyInfo &s = i.second;
512 if (s.name == m->get_name() && s.gid != m->get_gid()) {
513 dout(4) << "Standby daemon restart (mgr." << m->get_name() << ")" << dendl;
514 mon.clog->debug() << "Standby manager daemon " << m->get_name()
515 << " restarted";
516 drop_standby(i.first);
517 break;
518 }
519 }
520
521 last_beacon[m->get_gid()] = ceph::coarse_mono_clock::now();
522
523 // Track whether we modified pending_map
524 bool updated = false;
525
526 if (pending_map.active_gid == m->get_gid()) {
527 if (pending_map.services != m->get_services()) {
528 dout(4) << "updated services from mgr." << m->get_name()
529 << ": " << m->get_services() << dendl;
530 pending_map.services = m->get_services();
531 updated = true;
532 }
533
534 // A beacon from the currently active daemon
535 if (pending_map.active_addrs != m->get_server_addrs()) {
536 dout(4) << "learned address " << m->get_server_addrs()
537 << " (was " << pending_map.active_addrs << ")" << dendl;
538 pending_map.active_addrs = m->get_server_addrs();
539 updated = true;
540 }
541
542 if (pending_map.get_available() != m->get_available()) {
543 dout(4) << "available " << m->get_gid() << dendl;
544 mon.clog->info() << "Manager daemon " << pending_map.active_name
545 << " is now available";
546
547 // This beacon should include command descriptions
548 pending_command_descs = m->get_command_descs();
549 if (pending_command_descs.empty()) {
550 // This should not happen, but it also isn't fatal: we just
551 // won't successfully update our list of commands.
552 dout(4) << "First available beacon from " << pending_map.active_name
553 << "(" << m->get_gid() << ") does not include command descs"
554 << dendl;
555 } else {
556 dout(4) << "First available beacon from " << pending_map.active_name
557 << "(" << m->get_gid() << ") includes "
558 << pending_command_descs.size() << " command descs" << dendl;
559 }
560
561 pending_map.available = m->get_available();
562 updated = true;
563 }
564 if (pending_map.available_modules != m->get_available_modules()) {
565 dout(4) << "available_modules " << m->get_available_modules()
566 << " (was " << pending_map.available_modules << ")" << dendl;
567 pending_map.available_modules = m->get_available_modules();
568 updated = true;
569 }
570 const auto& clients = m->get_clients();
571 if (pending_map.clients != clients) {
572 dout(4) << "active's RADOS clients " << clients
573 << " (was " << pending_map.clients << ")" << dendl;
574 pending_map.clients = clients;
575 updated = true;
576 }
577 } else if (pending_map.active_gid == 0) {
578 // There is no currently active daemon, select this one.
579 if (pending_map.standbys.count(m->get_gid())) {
580 drop_standby(m->get_gid(), false);
581 }
582 dout(4) << "selecting new active " << m->get_gid()
583 << " " << m->get_name()
584 << " (was " << pending_map.active_gid << " "
585 << pending_map.active_name << ")" << dendl;
586 pending_map.active_gid = m->get_gid();
587 pending_map.active_name = m->get_name();
588 pending_map.active_change = ceph_clock_now();
589 pending_map.active_mgr_features = m->get_mgr_features();
590 pending_map.available_modules = m->get_available_modules();
591 encode(m->get_metadata(), pending_metadata[m->get_name()]);
592 pending_metadata_rm.erase(m->get_name());
593
594 mon.clog->info() << "Activating manager daemon "
595 << pending_map.active_name;
596
597 updated = true;
598 } else {
599 if (pending_map.standbys.count(m->get_gid()) > 0) {
600 dout(10) << "from existing standby " << m->get_gid() << dendl;
601 if (pending_map.standbys[m->get_gid()].available_modules !=
602 m->get_available_modules()) {
603 dout(10) << "existing standby " << m->get_gid() << " available_modules "
604 << m->get_available_modules() << " (was "
605 << pending_map.standbys[m->get_gid()].available_modules << ")"
606 << dendl;
607 pending_map.standbys[m->get_gid()].available_modules =
608 m->get_available_modules();
609 updated = true;
610 }
611 } else {
612 dout(10) << "new standby " << m->get_gid() << dendl;
613 mon.clog->debug() << "Standby manager daemon " << m->get_name()
614 << " started";
615 pending_map.standbys[m->get_gid()] = {m->get_gid(), m->get_name(),
616 m->get_available_modules(),
617 m->get_mgr_features()};
618 encode(m->get_metadata(), pending_metadata[m->get_name()]);
619 pending_metadata_rm.erase(m->get_name());
620 updated = true;
621 }
622 }
623
624 if (updated) {
625 dout(4) << "updating map" << dendl;
626 wait_for_finished_proposal(op, new C_Updated(this, op));
627 } else {
628 dout(10) << "no change" << dendl;
629 }
630
631 return updated;
632 }
633
634 void MgrMonitor::check_subs()
635 {
636 const std::string type = "mgrmap";
637 if (mon.session_map.subs.count(type) == 0)
638 return;
639 for (auto sub : *(mon.session_map.subs[type])) {
640 check_sub(sub);
641 }
642 }
643
644 void MgrMonitor::check_sub(Subscription *sub)
645 {
646 if (sub->type == "mgrmap") {
647 if (sub->next <= map.get_epoch()) {
648 dout(20) << "Sending map to subscriber " << sub->session->con
649 << " " << sub->session->con->get_peer_addr() << dendl;
650 sub->session->con->send_message2(make_message<MMgrMap>(map));
651 if (sub->onetime) {
652 mon.session_map.remove_sub(sub);
653 } else {
654 sub->next = map.get_epoch() + 1;
655 }
656 }
657 } else {
658 ceph_assert(sub->type == "mgrdigest");
659 if (sub->next == 0) {
660 // new registration; cancel previous timer
661 cancel_timer();
662 }
663 if (digest_event == nullptr) {
664 send_digests();
665 }
666 }
667 }
668
669 /**
670 * Handle digest subscriptions separately (outside of check_sub) because
671 * they are going to be periodic rather than version-driven.
672 */
673 void MgrMonitor::send_digests()
674 {
675 cancel_timer();
676
677 const std::string type = "mgrdigest";
678 if (mon.session_map.subs.count(type) == 0) {
679 prev_health_checks.clear();
680 return;
681 }
682
683 if (!is_active()) {
684 // if paxos is currently not active, don't send a digest but reenable timer
685 goto timer;
686 }
687 dout(10) << __func__ << dendl;
688
689 for (auto sub : *(mon.session_map.subs[type])) {
690 dout(10) << __func__ << " sending digest to subscriber " << sub->session->con
691 << " " << sub->session->con->get_peer_addr() << dendl;
692 auto mdigest = make_message<MMgrDigest>();
693
694 JSONFormatter f;
695 mon.healthmon()->get_health_status(true, &f, nullptr, nullptr, nullptr);
696 f.flush(mdigest->health_json);
697 f.reset();
698
699 mon.get_mon_status(&f);
700 f.flush(mdigest->mon_status_json);
701 f.reset();
702
703 sub->session->con->send_message2(mdigest);
704 }
705
706 timer:
707 digest_event = mon.timer.add_event_after(
708 g_conf().get_val<int64_t>("mon_mgr_digest_period"),
709 new C_MonContext{&mon, [this](int) {
710 send_digests();
711 }});
712 }
713
714 void MgrMonitor::cancel_timer()
715 {
716 if (digest_event) {
717 mon.timer.cancel_event(digest_event);
718 digest_event = nullptr;
719 }
720 }
721
722 void MgrMonitor::on_active()
723 {
724 if (!mon.is_leader()) {
725 return;
726 }
727 mon.clog->debug() << "mgrmap e" << map.epoch << ": " << map;
728 assert(HAVE_FEATURE(mon.get_quorum_con_features(), SERVER_NAUTILUS));
729 if (pending_map.always_on_modules == always_on_modules) {
730 return;
731 }
732 dout(4) << "always on modules changed, pending "
733 << pending_map.always_on_modules << " != wanted "
734 << always_on_modules << dendl;
735 pending_map.always_on_modules = always_on_modules;
736 propose_pending();
737 }
738
739 void MgrMonitor::tick()
740 {
741 if (!is_active() || !mon.is_leader())
742 return;
743
744 const auto now = ceph::coarse_mono_clock::now();
745
746 const auto mgr_beacon_grace =
747 g_conf().get_val<std::chrono::seconds>("mon_mgr_beacon_grace");
748
749 // Note that this is the mgr daemon's tick period, not ours (the
750 // beacon is sent with this period).
751 const auto mgr_tick_period =
752 g_conf().get_val<std::chrono::seconds>("mgr_tick_period");
753
754 if (last_tick != ceph::coarse_mono_clock::time_point::min()
755 && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) {
756 // This case handles either local slowness (calls being delayed
757 // for whatever reason) or cluster election slowness (a long gap
758 // between calls while an election happened)
759 dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
760 "(slow election?) of " << now - last_tick << " seconds" << dendl;
761 for (auto &i : last_beacon) {
762 i.second = now;
763 }
764 }
765
766 last_tick = now;
767
768 // Populate any missing beacons (i.e. no beacon since MgrMonitor
769 // instantiation) with the current time, so that they will
770 // eventually look laggy if they fail to give us a beacon.
771 if (pending_map.active_gid != 0
772 && last_beacon.count(pending_map.active_gid) == 0) {
773 last_beacon[pending_map.active_gid] = now;
774 }
775 for (auto s : pending_map.standbys) {
776 if (last_beacon.count(s.first) == 0) {
777 last_beacon[s.first] = now;
778 }
779 }
780
781 // Cull standbys first so that any remaining standbys
782 // will be eligible to take over from the active if we cull him.
783 std::list<uint64_t> dead_standbys;
784 const auto cutoff = now - mgr_beacon_grace;
785 for (const auto &i : pending_map.standbys) {
786 auto last_beacon_time = last_beacon.at(i.first);
787 if (last_beacon_time < cutoff) {
788 dead_standbys.push_back(i.first);
789 }
790 }
791
792 bool propose = false;
793
794 for (auto i : dead_standbys) {
795 dout(4) << "Dropping laggy standby " << i << dendl;
796 drop_standby(i);
797 propose = true;
798 }
799
800 if (pending_map.active_gid != 0
801 && last_beacon.at(pending_map.active_gid) < cutoff
802 && mon.osdmon()->is_writeable()) {
803 const std::string old_active_name = pending_map.active_name;
804 drop_active();
805 propose = true;
806 dout(4) << "Dropping active" << pending_map.active_gid << dendl;
807 if (promote_standby()) {
808 dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
809 mon.clog->info() << "Manager daemon " << old_active_name
810 << " is unresponsive, replacing it with standby"
811 << " daemon " << pending_map.active_name;
812 } else {
813 dout(4) << "Active is laggy but have no standbys to replace it" << dendl;
814 mon.clog->info() << "Manager daemon " << old_active_name
815 << " is unresponsive. No standby daemons available.";
816 }
817 } else if (pending_map.active_gid == 0) {
818 if (promote_standby()) {
819 dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
820 mon.clog->info() << "Activating manager daemon "
821 << pending_map.active_name;
822 propose = true;
823 }
824 }
825
826 if (!pending_map.available &&
827 !ever_had_active_mgr &&
828 should_warn_about_mgr_down() != HEALTH_OK) {
829 dout(10) << " exceeded mon_mgr_mkfs_grace "
830 << g_conf().get_val<int64_t>("mon_mgr_mkfs_grace")
831 << " seconds" << dendl;
832 propose = true;
833 }
834
835 // obsolete modules?
836 if (mon.monmap->min_mon_release >= ceph_release_t::octopus &&
837 pending_map.module_enabled("orchestrator_cli")) {
838 dout(10) << " disabling obsolete/renamed 'orchestrator_cli'" << dendl;
839 // we don't need to enable 'orchestrator' because it's now always-on
840 pending_map.modules.erase("orchestrator_cli");
841 propose = true;
842 }
843
844 if (propose) {
845 propose_pending();
846 }
847 }
848
849 void MgrMonitor::on_restart()
850 {
851 // Clear out the leader-specific state.
852 last_beacon.clear();
853 last_tick = ceph::coarse_mono_clock::now();
854 }
855
856
857 bool MgrMonitor::promote_standby()
858 {
859 ceph_assert(pending_map.active_gid == 0);
860 if (pending_map.standbys.size()) {
861 // Promote a replacement (arbitrary choice of standby)
862 auto replacement_gid = pending_map.standbys.begin()->first;
863 pending_map.active_gid = replacement_gid;
864 pending_map.active_name = pending_map.standbys.at(replacement_gid).name;
865 pending_map.available_modules =
866 pending_map.standbys.at(replacement_gid).available_modules;
867 pending_map.active_mgr_features =
868 pending_map.standbys.at(replacement_gid).mgr_features;
869 pending_map.available = false;
870 pending_map.active_addrs = entity_addrvec_t();
871 pending_map.active_change = ceph_clock_now();
872
873 drop_standby(replacement_gid, false);
874
875 return true;
876 } else {
877 return false;
878 }
879 }
880
881 void MgrMonitor::drop_active()
882 {
883 ceph_assert(mon.osdmon()->is_writeable());
884
885 if (last_beacon.count(pending_map.active_gid) > 0) {
886 last_beacon.erase(pending_map.active_gid);
887 }
888
889 ceph_assert(pending_map.active_gid > 0);
890 auto until = ceph_clock_now();
891 until += g_conf().get_val<double>("mon_mgr_blocklist_interval");
892 dout(5) << "blocklisting previous mgr." << pending_map.active_name << "."
893 << pending_map.active_gid << " ("
894 << pending_map.active_addrs << ")" << dendl;
895 auto blocklist_epoch = mon.osdmon()->blocklist(pending_map.active_addrs, until);
896
897 /* blocklist RADOS clients in use by the mgr */
898 for (const auto& a : pending_map.clients) {
899 mon.osdmon()->blocklist(a, until);
900 }
901 request_proposal(mon.osdmon());
902
903 pending_metadata_rm.insert(pending_map.active_name);
904 pending_metadata.erase(pending_map.active_name);
905 pending_map.active_name = "";
906 pending_map.active_gid = 0;
907 pending_map.active_change = ceph_clock_now();
908 pending_map.active_mgr_features = 0;
909 pending_map.available = false;
910 pending_map.active_addrs = entity_addrvec_t();
911 pending_map.services.clear();
912 pending_map.clients.clear();
913 pending_map.last_failure_osd_epoch = blocklist_epoch;
914
915 // So that when new active mgr subscribes to mgrdigest, it will
916 // get an immediate response instead of waiting for next timer
917 cancel_timer();
918 }
919
920 void MgrMonitor::drop_standby(uint64_t gid, bool drop_meta)
921 {
922 if (drop_meta) {
923 pending_metadata_rm.insert(pending_map.standbys[gid].name);
924 pending_metadata.erase(pending_map.standbys[gid].name);
925 }
926 pending_map.standbys.erase(gid);
927 if (last_beacon.count(gid) > 0) {
928 last_beacon.erase(gid);
929 }
930 }
931
932 bool MgrMonitor::preprocess_command(MonOpRequestRef op)
933 {
934 auto m = op->get_req<MMonCommand>();
935 std::stringstream ss;
936 bufferlist rdata;
937
938 cmdmap_t cmdmap;
939 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
940 string rs = ss.str();
941 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
942 return true;
943 }
944
945 MonSession *session = op->get_session();
946 if (!session) {
947 mon.reply_command(op, -EACCES, "access denied", rdata,
948 get_last_committed());
949 return true;
950 }
951
952 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
953 boost::scoped_ptr<Formatter> f(Formatter::create(format));
954
955 string prefix;
956 cmd_getval(cmdmap, "prefix", prefix);
957 int r = 0;
958
959 if (prefix == "mgr stat") {
960 if (!f) {
961 f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
962 }
963 f->open_object_section("stat");
964 f->dump_unsigned("epoch", map.get_epoch());
965 f->dump_bool("available", map.get_available());
966 f->dump_string("active_name", map.get_active_name());
967 f->dump_unsigned("num_standby", map.get_num_standby());
968 f->close_section();
969 f->flush(rdata);
970 } else if (prefix == "mgr dump") {
971 if (!f) {
972 f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
973 }
974 int64_t epoch = cmd_getval_or<int64_t>(cmdmap, "epoch", map.get_epoch());
975 if (epoch == (int64_t)map.get_epoch()) {
976 f->dump_object("mgrmap", map);
977 } else {
978 bufferlist bl;
979 int err = get_version(epoch, bl);
980 if (err == -ENOENT) {
981 r = -ENOENT;
982 ss << "there is no map for epoch " << epoch;
983 goto reply;
984 }
985 MgrMap m;
986 auto p = bl.cbegin();
987 m.decode(p);
988 f->dump_object("mgrmap", m);
989 }
990 f->flush(rdata);
991 } else if (prefix == "mgr module ls") {
992 if (f) {
993 f->open_object_section("modules");
994 {
995 f->open_array_section("always_on_modules");
996 for (auto& p : map.get_always_on_modules()) {
997 f->dump_string("module", p);
998 }
999 f->close_section();
1000 f->open_array_section("enabled_modules");
1001 for (auto& p : map.modules) {
1002 if (map.get_always_on_modules().count(p) > 0)
1003 continue;
1004 // We only show the name for enabled modules. The any errors
1005 // etc will show up as a health checks.
1006 f->dump_string("module", p);
1007 }
1008 f->close_section();
1009 f->open_array_section("disabled_modules");
1010 for (auto& p : map.available_modules) {
1011 if (map.modules.count(p.name) == 0 &&
1012 map.get_always_on_modules().count(p.name) == 0) {
1013 // For disabled modules, we show the full info if the detail
1014 // parameter is enabled, to give a hint about whether enabling it will work
1015 p.dump(f.get());
1016 }
1017 }
1018 f->close_section();
1019 }
1020 f->close_section();
1021 f->flush(rdata);
1022 } else {
1023 TextTable tbl;
1024 tbl.define_column("MODULE", TextTable::LEFT, TextTable::LEFT);
1025 tbl.define_column(" ", TextTable::LEFT, TextTable::LEFT);
1026
1027 for (auto& p : map.get_always_on_modules()) {
1028 tbl << p;
1029 tbl << "on (always on)";
1030 tbl << TextTable::endrow;
1031 }
1032 for (auto& p : map.modules) {
1033 if (map.get_always_on_modules().count(p) > 0)
1034 continue;
1035 tbl << p;
1036 tbl << "on";
1037 tbl << TextTable::endrow;
1038 }
1039 for (auto& p : map.available_modules) {
1040 if (map.modules.count(p.name) == 0 &&
1041 map.get_always_on_modules().count(p.name) == 0) {
1042 tbl << p.name;
1043 tbl << "-";
1044 tbl << TextTable::endrow;
1045 }
1046 }
1047 rdata.append(stringify(tbl));
1048 }
1049 } else if (prefix == "mgr services") {
1050 if (!f) {
1051 f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
1052 }
1053 f->open_object_section("services");
1054 for (const auto &i : map.services) {
1055 f->dump_string(i.first.c_str(), i.second);
1056 }
1057 f->close_section();
1058 f->flush(rdata);
1059 } else if (prefix == "mgr metadata") {
1060 if (!f) {
1061 f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
1062 }
1063 string name;
1064 cmd_getval(cmdmap, "who", name);
1065 if (name.size() > 0 && !map.have_name(name)) {
1066 ss << "mgr." << name << " does not exist";
1067 r = -ENOENT;
1068 goto reply;
1069 }
1070 if (name.size()) {
1071 f->open_object_section("mgr_metadata");
1072 f->dump_string("name", name);
1073 r = dump_metadata(name, f.get(), &ss);
1074 if (r < 0)
1075 goto reply;
1076 f->close_section();
1077 } else {
1078 r = 0;
1079 f->open_array_section("mgr_metadata");
1080 for (auto& i : map.get_all_names()) {
1081 f->open_object_section("mgr");
1082 f->dump_string("name", i);
1083 r = dump_metadata(i, f.get(), NULL);
1084 if (r == -EINVAL || r == -ENOENT) {
1085 // Drop error, continue to get other daemons' metadata
1086 dout(4) << "No metadata for mgr." << i << dendl;
1087 r = 0;
1088 } else if (r < 0) {
1089 // Unexpected error
1090 goto reply;
1091 }
1092 f->close_section();
1093 }
1094 f->close_section();
1095 }
1096 f->flush(rdata);
1097 } else if (prefix == "mgr versions") {
1098 if (!f) {
1099 f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
1100 }
1101 count_metadata("ceph_version", f.get());
1102 f->flush(rdata);
1103 r = 0;
1104 } else if (prefix == "mgr count-metadata") {
1105 if (!f) {
1106 f.reset(Formatter::create(format, "json-pretty", "json-pretty"));
1107 }
1108 string field;
1109 cmd_getval(cmdmap, "property", field);
1110 count_metadata(field, f.get());
1111 f->flush(rdata);
1112 r = 0;
1113 } else {
1114 return false;
1115 }
1116
1117 reply:
1118 string rs;
1119 getline(ss, rs);
1120 mon.reply_command(op, r, rs, rdata, get_last_committed());
1121 return true;
1122 }
1123
1124 bool MgrMonitor::prepare_command(MonOpRequestRef op)
1125 {
1126 auto m = op->get_req<MMonCommand>();
1127
1128 std::stringstream ss;
1129 bufferlist rdata;
1130
1131 cmdmap_t cmdmap;
1132 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
1133 string rs = ss.str();
1134 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
1135 return true;
1136 }
1137
1138 MonSession *session = op->get_session();
1139 if (!session) {
1140 mon.reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
1141 return true;
1142 }
1143
1144 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
1145 boost::scoped_ptr<Formatter> f(Formatter::create(format));
1146
1147 string prefix;
1148 cmd_getval(cmdmap, "prefix", prefix);
1149
1150 int r = 0;
1151
1152 if (prefix == "mgr fail") {
1153 string who;
1154 if (!cmd_getval(cmdmap, "who", who)) {
1155 if (!map.active_gid) {
1156 ss << "Currently no active mgr";
1157 goto out;
1158 }
1159 who = map.active_name;
1160 }
1161
1162 std::string err;
1163 uint64_t gid = strict_strtol(who.c_str(), 10, &err);
1164 bool changed = false;
1165 if (!err.empty()) {
1166 // Does not parse as a gid, treat it as a name
1167 if (pending_map.active_name == who) {
1168 if (!mon.osdmon()->is_writeable()) {
1169 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1170 return false;
1171 }
1172 drop_active();
1173 changed = true;
1174 } else {
1175 gid = 0;
1176 for (const auto &i : pending_map.standbys) {
1177 if (i.second.name == who) {
1178 gid = i.first;
1179 break;
1180 }
1181 }
1182 if (gid != 0) {
1183 drop_standby(gid);
1184 changed = true;
1185 } else {
1186 ss << "Daemon not found '" << who << "', already failed?";
1187 }
1188 }
1189 } else {
1190 if (pending_map.active_gid == gid) {
1191 if (!mon.osdmon()->is_writeable()) {
1192 mon.osdmon()->wait_for_writeable(op, new C_RetryMessage(this, op));
1193 return false;
1194 }
1195 drop_active();
1196 changed = true;
1197 } else if (pending_map.standbys.count(gid) > 0) {
1198 drop_standby(gid);
1199 changed = true;
1200 } else {
1201 ss << "Daemon not found '" << gid << "', already failed?";
1202 }
1203 }
1204
1205 if (changed && pending_map.active_gid == 0) {
1206 promote_standby();
1207 }
1208 } else if (prefix == "mgr module enable") {
1209 string module;
1210 cmd_getval(cmdmap, "module", module);
1211 if (module.empty()) {
1212 r = -EINVAL;
1213 goto out;
1214 }
1215 if (pending_map.get_always_on_modules().count(module) > 0) {
1216 ss << "module '" << module << "' is already enabled (always-on)";
1217 goto out;
1218 }
1219 bool force = false;
1220 cmd_getval_compat_cephbool(cmdmap, "force", force);
1221 if (!pending_map.all_support_module(module) &&
1222 !force) {
1223 ss << "all mgr daemons do not support module '" << module << "', pass "
1224 << "--force to force enablement";
1225 r = -ENOENT;
1226 goto out;
1227 }
1228
1229 std::string can_run_error;
1230 if (!force && !pending_map.can_run_module(module, &can_run_error)) {
1231 ss << "module '" << module << "' reports that it cannot run on the active "
1232 "manager daemon: " << can_run_error << " (pass --force to force "
1233 "enablement)";
1234 r = -ENOENT;
1235 goto out;
1236 }
1237
1238 if (pending_map.module_enabled(module)) {
1239 ss << "module '" << module << "' is already enabled";
1240 r = 0;
1241 goto out;
1242 }
1243 pending_map.modules.insert(module);
1244 } else if (prefix == "mgr module disable") {
1245 string module;
1246 cmd_getval(cmdmap, "module", module);
1247 if (module.empty()) {
1248 r = -EINVAL;
1249 goto out;
1250 }
1251 if (pending_map.get_always_on_modules().count(module) > 0) {
1252 ss << "module '" << module << "' cannot be disabled (always-on)";
1253 r = -EINVAL;
1254 goto out;
1255 }
1256 if (!pending_map.module_enabled(module)) {
1257 ss << "module '" << module << "' is already disabled";
1258 r = 0;
1259 goto out;
1260 }
1261 if (!pending_map.modules.count(module)) {
1262 ss << "module '" << module << "' is not enabled";
1263 }
1264 pending_map.modules.erase(module);
1265 } else {
1266 ss << "Command '" << prefix << "' not implemented!";
1267 r = -ENOSYS;
1268 }
1269
1270 out:
1271 dout(4) << __func__ << " done, r=" << r << dendl;
1272 /* Compose response */
1273 string rs;
1274 getline(ss, rs);
1275
1276 if (r >= 0) {
1277 // success.. delay reply
1278 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1279 get_last_committed() + 1));
1280 return true;
1281 } else {
1282 // reply immediately
1283 mon.reply_command(op, r, rs, rdata, get_last_committed());
1284 return false;
1285 }
1286 }
1287
1288 void MgrMonitor::init()
1289 {
1290 if (digest_event == nullptr) {
1291 send_digests(); // To get it to schedule its own event
1292 }
1293 }
1294
1295 void MgrMonitor::on_shutdown()
1296 {
1297 cancel_timer();
1298 }
1299
1300 int MgrMonitor::load_metadata(const string& name, std::map<string, string>& m,
1301 ostream *err) const
1302 {
1303 bufferlist bl;
1304 int r = mon.store->get(MGR_METADATA_PREFIX, name, bl);
1305 if (r < 0)
1306 return r;
1307 try {
1308 auto p = bl.cbegin();
1309 decode(m, p);
1310 }
1311 catch (ceph::buffer::error& e) {
1312 if (err)
1313 *err << "mgr." << name << " metadata is corrupt";
1314 return -EIO;
1315 }
1316 return 0;
1317 }
1318
1319 void MgrMonitor::count_metadata(const string& field, std::map<string,int> *out)
1320 {
1321 std::set<string> ls = map.get_all_names();
1322 for (auto& name : ls) {
1323 std::map<string,string> meta;
1324 load_metadata(name, meta, nullptr);
1325 auto p = meta.find(field);
1326 if (p == meta.end()) {
1327 (*out)["unknown"]++;
1328 } else {
1329 (*out)[p->second]++;
1330 }
1331 }
1332 }
1333
1334 void MgrMonitor::count_metadata(const string& field, Formatter *f)
1335 {
1336 std::map<string,int> by_val;
1337 count_metadata(field, &by_val);
1338 f->open_object_section(field.c_str());
1339 for (auto& p : by_val) {
1340 f->dump_int(p.first.c_str(), p.second);
1341 }
1342 f->close_section();
1343 }
1344
1345 void MgrMonitor::get_versions(std::map<string, list<string> > &versions)
1346 {
1347 std::set<string> ls = map.get_all_names();
1348 for (auto& name : ls) {
1349 std::map<string,string> meta;
1350 load_metadata(name, meta, nullptr);
1351 auto p = meta.find("ceph_version_short");
1352 if (p == meta.end()) continue;
1353 versions[p->second].push_back(string("mgr.") + name);
1354 }
1355 }
1356
1357 int MgrMonitor::dump_metadata(const string& name, Formatter *f, ostream *err)
1358 {
1359 std::map<string,string> m;
1360 if (int r = load_metadata(name, m, err))
1361 return r;
1362 for (auto& p : m) {
1363 f->dump_string(p.first.c_str(), p.second);
1364 }
1365 return 0;
1366 }
1367
1368 void MgrMonitor::print_nodes(Formatter *f) const
1369 {
1370 ceph_assert(f);
1371
1372 std::map<string, list<string> > mgrs; // hostname => mgr
1373 auto ls = map.get_all_names();
1374 for (auto& name : ls) {
1375 std::map<string,string> meta;
1376 if (load_metadata(name, meta, nullptr)) {
1377 continue;
1378 }
1379 auto hostname = meta.find("hostname");
1380 if (hostname == meta.end()) {
1381 // not likely though
1382 continue;
1383 }
1384 mgrs[hostname->second].push_back(name);
1385 }
1386
1387 dump_services(f, mgrs, "mgr");
1388 }
1389
1390 const std::vector<MonCommand> &MgrMonitor::get_command_descs() const
1391 {
1392 if (command_descs.empty()) {
1393 // must have just upgraded; fallback to static commands
1394 return mgr_commands;
1395 } else {
1396 return command_descs;
1397 }
1398 }