]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/MgrMonitor.cc
import new upstream nautilus stable release 14.2.8
[ceph.git] / ceph / src / mon / MgrMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 John Spray <john.spray@redhat.com>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 */
13
224ce89b
WB
14#include <boost/tokenizer.hpp>
15
7c673cae
FG
16#include "messages/MMgrBeacon.h"
17#include "messages/MMgrMap.h"
18#include "messages/MMgrDigest.h"
19
7c673cae
FG
20#include "include/stringify.h"
21#include "mgr/MgrContext.h"
c07f9fc5 22#include "mgr/mgr_commands.h"
7c673cae 23#include "OSDMonitor.h"
11fdf7f2 24#include "ConfigMonitor.h"
7c673cae
FG
25
26#include "MgrMonitor.h"
27
c07f9fc5
FG
28#define MGR_METADATA_PREFIX "mgr_metadata"
29
7c673cae
FG
30#define dout_subsys ceph_subsys_mon
31#undef dout_prefix
32#define dout_prefix _prefix(_dout, mon, map)
33static ostream& _prefix(std::ostream *_dout, Monitor *mon,
34 const MgrMap& mgrmap) {
35 return *_dout << "mon." << mon->name << "@" << mon->rank
36 << "(" << mon->get_state_name()
37 << ").mgr e" << mgrmap.get_epoch() << " ";
38}
39
11fdf7f2
TL
40// the system treats always_on_modules as if they provide built-in functionality
41// by ensuring that they are always enabled.
42const static std::map<uint32_t, std::set<std::string>> always_on_modules = {
43 {
44 CEPH_RELEASE_NAUTILUS, {
45 "crash",
46 "status",
47 "progress",
48 "balancer",
49 "devicehealth",
50 "orchestrator_cli",
494da23a 51 "rbd_support",
11fdf7f2
TL
52 "volumes",
53 }
54 }
55};
56
c07f9fc5
FG
// Mon store prefix under which the active mgr's command descriptions
// are kept.
static const std::string command_descs_prefix("mgr_command_descs");
59
11fdf7f2
TL
60const Option *MgrMonitor::find_module_option(const string& name)
61{
62 // we have two forms of names: "mgr/$module/$option" and
63 // localized "mgr/$module/$instance/$option". normalize to the
64 // former by stripping out $instance.
65 string real_name;
66 if (name.substr(0, 4) != "mgr/") {
67 return nullptr;
68 }
69 auto second_slash = name.find('/', 5);
70 if (second_slash == std::string::npos) {
71 return nullptr;
72 }
73 auto third_slash = name.find('/', second_slash + 1);
74 if (third_slash != std::string::npos) {
75 // drop the $instance part between the second and third slash
76 real_name = name.substr(0, second_slash) + name.substr(third_slash);
77 } else {
78 real_name = name;
79 }
80 auto p = mgr_module_options.find(real_name);
81 if (p != mgr_module_options.end()) {
82 return &p->second;
83 }
84 return nullptr;
85}
31f18b77 86
11fdf7f2 87version_t MgrMonitor::get_trim_to() const
b32b8144 88{
11fdf7f2 89 int64_t max = g_conf().get_val<int64_t>("mon_max_mgrmap_epochs");
b32b8144
FG
90 if (map.epoch > max) {
91 return map.epoch - max;
92 }
93 return 0;
94}
95
7c673cae
FG
96void MgrMonitor::create_initial()
97{
3efd9988 98 // Take a local copy of initial_modules for tokenizer to iterate over.
11fdf7f2 99 auto initial_modules = g_conf().get_val<std::string>("mgr_initial_modules");
3efd9988 100 boost::tokenizer<> tok(initial_modules);
224ce89b
WB
101 for (auto& m : tok) {
102 pending_map.modules.insert(m);
103 }
11fdf7f2 104 pending_map.always_on_modules = always_on_modules;
c07f9fc5
FG
105 pending_command_descs = mgr_commands;
106 dout(10) << __func__ << " initial modules " << pending_map.modules
11fdf7f2
TL
107 << ", always on modules " << pending_map.get_always_on_modules()
108 << ", " << pending_command_descs.size() << " commands"
c07f9fc5 109 << dendl;
7c673cae
FG
110}
111
11fdf7f2 112void MgrMonitor::get_store_prefixes(std::set<string>& s) const
3efd9988
FG
113{
114 s.insert(service_name);
115 s.insert(command_descs_prefix);
116 s.insert(MGR_METADATA_PREFIX);
117}
118
7c673cae
FG
119void MgrMonitor::update_from_paxos(bool *need_bootstrap)
120{
121 version_t version = get_last_committed();
122 if (version != map.epoch) {
123 dout(4) << "loading version " << version << dendl;
124
125 bufferlist bl;
126 int err = get_version(version, bl);
11fdf7f2 127 ceph_assert(err == 0);
7c673cae 128
c07f9fc5
FG
129 bool old_available = map.get_available();
130 uint64_t old_gid = map.get_active_gid();
131
11fdf7f2 132 auto p = bl.cbegin();
7c673cae
FG
133 map.decode(p);
134
11fdf7f2 135 dout(4) << "active server: " << map.active_addrs
7c673cae
FG
136 << "(" << map.active_gid << ")" << dendl;
137
224ce89b
WB
138 ever_had_active_mgr = get_value("ever_had_active_mgr");
139
140 load_health();
141
7c673cae
FG
142 if (map.available) {
143 first_seen_inactive = utime_t();
144 } else {
145 first_seen_inactive = ceph_clock_now();
146 }
147
148 check_subs();
c07f9fc5
FG
149
150 if (version == 1
3efd9988
FG
151 || command_descs.empty()
152 || (map.get_available()
153 && (!old_available || old_gid != map.get_active_gid()))) {
c07f9fc5
FG
154 dout(4) << "mkfs or daemon transitioned to available, loading commands"
155 << dendl;
156 bufferlist loaded_commands;
157 int r = mon->store->get(command_descs_prefix, "", loaded_commands);
158 if (r < 0) {
159 derr << "Failed to load mgr commands: " << cpp_strerror(r) << dendl;
160 } else {
11fdf7f2
TL
161 auto p = loaded_commands.cbegin();
162 decode(command_descs, p);
163 }
164 }
165 }
166
167 // populate module options
168 mgr_module_options.clear();
169 misc_option_strings.clear();
170 for (auto& i : map.available_modules) {
171 for (auto& j : i.module_options) {
172 string name = string("mgr/") + i.name + "/" + j.second.name;
173 auto p = mgr_module_options.emplace(
174 name,
175 Option(name, static_cast<Option::type_t>(j.second.type),
176 static_cast<Option::level_t>(j.second.level)));
177 Option& opt = p.first->second;
178 opt.set_flags(static_cast<Option::flag_t>(j.second.flags));
179 opt.set_flag(Option::FLAG_MGR);
180 opt.set_description(j.second.desc.c_str());
181 opt.set_long_description(j.second.long_desc.c_str());
182 for (auto& k : j.second.tags) {
183 opt.add_tag(k.c_str());
184 }
185 for (auto& k : j.second.see_also) {
186 if (i.module_options.count(k)) {
187 // it's another module option
188 misc_option_strings.push_back(string("mgr/") + i.name + "/" + k);
189 opt.add_see_also(misc_option_strings.back().c_str());
190 } else {
191 // it's a native option
192 opt.add_see_also(k.c_str());
193 }
194 }
195 Option::value_t v, v2;
196 std::string err;
197 if (j.second.default_value.size() &&
198 !opt.parse_value(j.second.default_value, &v, &err)) {
199 opt.set_default(v);
200 }
201 if (j.second.min.size() &&
202 j.second.max.size() &&
203 !opt.parse_value(j.second.min, &v, &err) &&
204 !opt.parse_value(j.second.max, &v2, &err)) {
205 opt.set_min_max(v, v2);
c07f9fc5 206 }
11fdf7f2
TL
207 std::vector<const char *> enum_allowed;
208 for (auto& k : j.second.enum_allowed) {
209 enum_allowed.push_back(k.c_str());
210 }
211 opt.set_enum_allowed(enum_allowed);
c07f9fc5 212 }
7c673cae 213 }
11fdf7f2
TL
214 // force ConfigMonitor to refresh, since it uses const Option *
215 // pointers into our mgr_module_options (which we just rebuilt).
216 mon->configmon()->load_config();
7c673cae 217
11fdf7f2
TL
218 if (!mon->is_init()) {
219 // feed our pet MgrClient, unless we are in Monitor::[pre]init()
220 prime_mgr_client();
221 }
222}
223
224void MgrMonitor::prime_mgr_client()
225{
226 dout(10) << __func__ << dendl;
7c673cae
FG
227 mon->mgr_client.ms_dispatch(new MMgrMap(map));
228}
229
230void MgrMonitor::create_pending()
231{
232 pending_map = map;
233 pending_map.epoch++;
234}
235
224ce89b
WB
236health_status_t MgrMonitor::should_warn_about_mgr_down()
237{
238 utime_t now = ceph_clock_now();
92f5a8d4 239 // we warn if we have osds AND we've exceeded the grace period
224ce89b
WB
240 // which means a new mon cluster and be HEALTH_OK indefinitely as long as
241 // no OSDs are ever created.
92f5a8d4
TL
242 if (mon->osdmon()->osdmap.get_num_osds() > 0 &&
243 now > mon->monmap->created + g_conf().get_val<int64_t>("mon_mgr_mkfs_grace")) {
224ce89b
WB
244 health_status_t level = HEALTH_WARN;
245 if (first_seen_inactive != utime_t() &&
11fdf7f2 246 now - first_seen_inactive > g_conf().get_val<int64_t>("mon_mgr_inactive_grace")) {
224ce89b
WB
247 level = HEALTH_ERR;
248 }
249 return level;
250 }
251 return HEALTH_OK;
252}
253
11fdf7f2
TL
254void MgrMonitor::post_paxos_update()
255{
256 // are we handling digest subscribers?
257 if (digest_event) {
258 bool send = false;
259 if (prev_health_checks.empty()) {
260 prev_health_checks.resize(mon->paxos_service.size());
261 send = true;
262 }
263 ceph_assert(prev_health_checks.size() == mon->paxos_service.size());
264 for (auto i = 0u; i < prev_health_checks.size(); i++) {
265 const auto& curr = mon->paxos_service[i]->get_health_checks();
266 if (!send && curr != prev_health_checks[i]) {
267 send = true;
268 }
269 prev_health_checks[i] = curr;
270 }
271 if (send) {
272 if (is_active()) {
273 send_digests();
274 } else {
275 cancel_timer();
276 wait_for_active_ctx(new C_MonContext(mon, [this](int) {
277 send_digests();
278 }));
279 }
280 }
281 }
282}
283
7c673cae
FG
284void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t)
285{
286 dout(10) << __func__ << " " << pending_map << dendl;
287 bufferlist bl;
288 pending_map.encode(bl, mon->get_quorum_con_features());
289 put_version(t, pending_map.epoch, bl);
290 put_last_committed(t, pending_map.epoch);
224ce89b 291
c07f9fc5
FG
292 for (auto& p : pending_metadata) {
293 dout(10) << __func__ << " set metadata for " << p.first << dendl;
294 t->put(MGR_METADATA_PREFIX, p.first, p.second);
295 }
296 for (auto& name : pending_metadata_rm) {
297 dout(10) << __func__ << " rm metadata for " << name << dendl;
298 t->erase(MGR_METADATA_PREFIX, name);
299 }
300 pending_metadata.clear();
301 pending_metadata_rm.clear();
302
224ce89b
WB
303 health_check_map_t next;
304 if (pending_map.active_gid == 0) {
305 auto level = should_warn_about_mgr_down();
306 if (level != HEALTH_OK) {
307 next.add("MGR_DOWN", level, "no active mgr");
308 } else {
309 dout(10) << __func__ << " no health warning (never active and new cluster)"
310 << dendl;
311 }
312 } else {
313 put_value(t, "ever_had_active_mgr", 1);
314 }
315 encode_health(next, t);
c07f9fc5
FG
316
317 if (pending_command_descs.size()) {
318 dout(4) << __func__ << " encoding " << pending_command_descs.size()
319 << " command_descs" << dendl;
320 for (auto& p : pending_command_descs) {
321 p.set_flag(MonCommand::FLAG_MGR);
322 }
323 bufferlist bl;
11fdf7f2 324 encode(pending_command_descs, bl);
c07f9fc5
FG
325 t->put(command_descs_prefix, "", bl);
326 pending_command_descs.clear();
327 }
7c673cae
FG
328}
329
330bool MgrMonitor::check_caps(MonOpRequestRef op, const uuid_d& fsid)
331{
332 // check permissions
333 MonSession *session = op->get_session();
334 if (!session)
335 return false;
336 if (!session->is_capable("mgr", MON_CAP_X)) {
337 dout(1) << __func__ << " insufficient caps " << session->caps << dendl;
338 return false;
339 }
340 if (fsid != mon->monmap->fsid) {
341 dout(1) << __func__ << " op fsid " << fsid
342 << " != " << mon->monmap->fsid << dendl;
343 return false;
344 }
345 return true;
346}
347
348bool MgrMonitor::preprocess_query(MonOpRequestRef op)
349{
350 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
351 switch (m->get_type()) {
352 case MSG_MGR_BEACON:
353 return preprocess_beacon(op);
354 case MSG_MON_COMMAND:
f64942e4
AA
355 try {
356 return preprocess_command(op);
11fdf7f2 357 } catch (const bad_cmd_get& e) {
f64942e4
AA
358 bufferlist bl;
359 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
360 return true;
361 }
362
7c673cae
FG
363 default:
364 mon->no_reply(op);
365 derr << "Unhandled message type " << m->get_type() << dendl;
366 return true;
367 }
368}
369
370bool MgrMonitor::prepare_update(MonOpRequestRef op)
371{
372 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
373 switch (m->get_type()) {
374 case MSG_MGR_BEACON:
375 return prepare_beacon(op);
376
377 case MSG_MON_COMMAND:
f64942e4
AA
378 try {
379 return prepare_command(op);
11fdf7f2 380 } catch (const bad_cmd_get& e) {
f64942e4
AA
381 bufferlist bl;
382 mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
383 return true;
384 }
7c673cae
FG
385
386 default:
387 mon->no_reply(op);
388 derr << "Unhandled message type " << m->get_type() << dendl;
389 return true;
390 }
391}
392
393
394
395class C_Updated : public Context {
396 MgrMonitor *mm;
397 MonOpRequestRef op;
398public:
399 C_Updated(MgrMonitor *a, MonOpRequestRef c) :
400 mm(a), op(c) {}
401 void finish(int r) override {
402 if (r >= 0) {
403 // Success
404 } else if (r == -ECANCELED) {
405 mm->mon->no_reply(op);
406 } else {
407 mm->dispatch(op); // try again
408 }
409 }
410};
411
412bool MgrMonitor::preprocess_beacon(MonOpRequestRef op)
413{
414 MMgrBeacon *m = static_cast<MMgrBeacon*>(op->get_req());
94b18763 415 mon->no_reply(op); // we never reply to beacons
7c673cae
FG
416 dout(4) << "beacon from " << m->get_gid() << dendl;
417
418 if (!check_caps(op, m->get_fsid())) {
419 // drop it on the floor
420 return true;
421 }
422
423 // always send this to the leader's prepare_beacon()
424 return false;
425}
426
427bool MgrMonitor::prepare_beacon(MonOpRequestRef op)
428{
429 MMgrBeacon *m = static_cast<MMgrBeacon*>(op->get_req());
430 dout(4) << "beacon from " << m->get_gid() << dendl;
431
432 // See if we are seeing same name, new GID for the active daemon
433 if (m->get_name() == pending_map.active_name
434 && m->get_gid() != pending_map.active_gid)
435 {
436 dout(4) << "Active daemon restart (mgr." << m->get_name() << ")" << dendl;
224ce89b
WB
437 mon->clog->info() << "Active manager daemon " << m->get_name()
438 << " restarted";
7c673cae
FG
439 drop_active();
440 }
441
442 // See if we are seeing same name, new GID for any standbys
443 for (const auto &i : pending_map.standbys) {
11fdf7f2 444 const MgrMap::StandbyInfo &s = i.second;
7c673cae
FG
445 if (s.name == m->get_name() && s.gid != m->get_gid()) {
446 dout(4) << "Standby daemon restart (mgr." << m->get_name() << ")" << dendl;
224ce89b
WB
447 mon->clog->debug() << "Standby manager daemon " << m->get_name()
448 << " restarted";
7c673cae
FG
449 drop_standby(i.first);
450 break;
451 }
452 }
453
31f18b77 454 last_beacon[m->get_gid()] = ceph::coarse_mono_clock::now();
7c673cae
FG
455
456 // Track whether we modified pending_map
457 bool updated = false;
458
459 if (pending_map.active_gid == m->get_gid()) {
3efd9988
FG
460 if (pending_map.services != m->get_services()) {
461 dout(4) << "updated services from mgr." << m->get_name()
462 << ": " << m->get_services() << dendl;
463 pending_map.services = m->get_services();
464 updated = true;
465 }
466
7c673cae 467 // A beacon from the currently active daemon
11fdf7f2
TL
468 if (pending_map.active_addrs != m->get_server_addrs()) {
469 dout(4) << "learned address " << m->get_server_addrs()
470 << " (was " << pending_map.active_addrs << ")" << dendl;
471 pending_map.active_addrs = m->get_server_addrs();
7c673cae
FG
472 updated = true;
473 }
474
475 if (pending_map.get_available() != m->get_available()) {
476 dout(4) << "available " << m->get_gid() << dendl;
224ce89b
WB
477 mon->clog->info() << "Manager daemon " << pending_map.active_name
478 << " is now available";
c07f9fc5
FG
479
480 // This beacon should include command descriptions
481 pending_command_descs = m->get_command_descs();
482 if (pending_command_descs.empty()) {
483 // This should not happen, but it also isn't fatal: we just
484 // won't successfully update our list of commands.
485 dout(4) << "First available beacon from " << pending_map.active_name
486 << "(" << m->get_gid() << ") does not include command descs"
487 << dendl;
488 } else {
489 dout(4) << "First available beacon from " << pending_map.active_name
490 << "(" << m->get_gid() << ") includes "
491 << pending_command_descs.size() << " command descs" << dendl;
492 }
493
7c673cae
FG
494 pending_map.available = m->get_available();
495 updated = true;
496 }
224ce89b
WB
497 if (pending_map.available_modules != m->get_available_modules()) {
498 dout(4) << "available_modules " << m->get_available_modules()
499 << " (was " << pending_map.available_modules << ")" << dendl;
500 pending_map.available_modules = m->get_available_modules();
501 updated = true;
502 }
7c673cae
FG
503 } else if (pending_map.active_gid == 0) {
504 // There is no currently active daemon, select this one.
505 if (pending_map.standbys.count(m->get_gid())) {
181888fb 506 drop_standby(m->get_gid(), false);
7c673cae
FG
507 }
508 dout(4) << "selecting new active " << m->get_gid()
509 << " " << m->get_name()
510 << " (was " << pending_map.active_gid << " "
511 << pending_map.active_name << ")" << dendl;
512 pending_map.active_gid = m->get_gid();
513 pending_map.active_name = m->get_name();
11fdf7f2 514 pending_map.active_change = ceph_clock_now();
224ce89b 515 pending_map.available_modules = m->get_available_modules();
11fdf7f2 516 encode(m->get_metadata(), pending_metadata[m->get_name()]);
c07f9fc5 517 pending_metadata_rm.erase(m->get_name());
224ce89b
WB
518
519 mon->clog->info() << "Activating manager daemon "
520 << pending_map.active_name;
7c673cae
FG
521
522 updated = true;
523 } else {
524 if (pending_map.standbys.count(m->get_gid()) > 0) {
525 dout(10) << "from existing standby " << m->get_gid() << dendl;
224ce89b
WB
526 if (pending_map.standbys[m->get_gid()].available_modules !=
527 m->get_available_modules()) {
528 dout(10) << "existing standby " << m->get_gid() << " available_modules "
529 << m->get_available_modules() << " (was "
530 << pending_map.standbys[m->get_gid()].available_modules << ")"
531 << dendl;
532 pending_map.standbys[m->get_gid()].available_modules =
533 m->get_available_modules();
534 updated = true;
535 }
7c673cae
FG
536 } else {
537 dout(10) << "new standby " << m->get_gid() << dendl;
224ce89b
WB
538 mon->clog->debug() << "Standby manager daemon " << m->get_name()
539 << " started";
c07f9fc5
FG
540 pending_map.standbys[m->get_gid()] = {m->get_gid(), m->get_name(),
541 m->get_available_modules()};
11fdf7f2 542 encode(m->get_metadata(), pending_metadata[m->get_name()]);
c07f9fc5 543 pending_metadata_rm.erase(m->get_name());
7c673cae
FG
544 updated = true;
545 }
546 }
547
548 if (updated) {
549 dout(4) << "updating map" << dendl;
550 wait_for_finished_proposal(op, new C_Updated(this, op));
551 } else {
552 dout(10) << "no change" << dendl;
553 }
554
555 return updated;
556}
557
558void MgrMonitor::check_subs()
559{
560 const std::string type = "mgrmap";
561 if (mon->session_map.subs.count(type) == 0)
562 return;
563 for (auto sub : *(mon->session_map.subs[type])) {
564 check_sub(sub);
565 }
566}
567
568void MgrMonitor::check_sub(Subscription *sub)
569{
570 if (sub->type == "mgrmap") {
571 if (sub->next <= map.get_epoch()) {
224ce89b
WB
572 dout(20) << "Sending map to subscriber " << sub->session->con
573 << " " << sub->session->con->get_peer_addr() << dendl;
7c673cae
FG
574 sub->session->con->send_message(new MMgrMap(map));
575 if (sub->onetime) {
576 mon->session_map.remove_sub(sub);
577 } else {
578 sub->next = map.get_epoch() + 1;
579 }
580 }
581 } else {
11fdf7f2 582 ceph_assert(sub->type == "mgrdigest");
c07f9fc5
FG
583 if (sub->next == 0) {
584 // new registration; cancel previous timer
585 cancel_timer();
586 }
31f18b77 587 if (digest_event == nullptr) {
7c673cae
FG
588 send_digests();
589 }
590 }
591}
592
593/**
594 * Handle digest subscriptions separately (outside of check_sub) because
595 * they are going to be periodic rather than version-driven.
596 */
597void MgrMonitor::send_digests()
598{
31f18b77
FG
599 cancel_timer();
600
7c673cae 601 const std::string type = "mgrdigest";
11fdf7f2
TL
602 if (mon->session_map.subs.count(type) == 0) {
603 prev_health_checks.clear();
7c673cae 604 return;
11fdf7f2 605 }
7c673cae 606
b32b8144
FG
607 if (!is_active()) {
608 // if paxos is currently not active, don't send a digest but reenable timer
609 goto timer;
610 }
611 dout(10) << __func__ << dendl;
612
7c673cae 613 for (auto sub : *(mon->session_map.subs[type])) {
224ce89b
WB
614 dout(10) << __func__ << " sending digest to subscriber " << sub->session->con
615 << " " << sub->session->con->get_peer_addr() << dendl;
7c673cae
FG
616 MMgrDigest *mdigest = new MMgrDigest;
617
618 JSONFormatter f;
224ce89b 619 mon->get_health_status(true, &f, nullptr, nullptr, nullptr);
7c673cae
FG
620 f.flush(mdigest->health_json);
621 f.reset();
622
623 std::ostringstream ss;
624 mon->get_mon_status(&f, ss);
625 f.flush(mdigest->mon_status_json);
626 f.reset();
627
628 sub->session->con->send_message(mdigest);
629 }
630
b32b8144 631timer:
3efd9988 632 digest_event = mon->timer.add_event_after(
11fdf7f2 633 g_conf().get_val<int64_t>("mon_mgr_digest_period"),
3efd9988 634 new C_MonContext(mon, [this](int) {
7c673cae 635 send_digests();
3efd9988 636 }));
31f18b77
FG
637}
638
639void MgrMonitor::cancel_timer()
640{
641 if (digest_event) {
642 mon->timer.cancel_event(digest_event);
643 digest_event = nullptr;
644 }
7c673cae
FG
645}
646
647void MgrMonitor::on_active()
648{
224ce89b
WB
649 if (mon->is_leader()) {
650 mon->clog->debug() << "mgrmap e" << map.epoch << ": " << map;
7c673cae 651
11fdf7f2
TL
652 if (HAVE_FEATURE(mon->get_quorum_con_features(), SERVER_NAUTILUS) &&
653 pending_map.always_on_modules != always_on_modules) {
654 pending_map.always_on_modules = always_on_modules;
655 dout(4) << "always on modules changed, pending "
656 << pending_map.get_always_on_modules()
657 << " != wanted " << always_on_modules << dendl;
658 propose_pending();
7c673cae 659 }
7c673cae
FG
660 }
661}
662
663void MgrMonitor::tick()
664{
665 if (!is_active() || !mon->is_leader())
666 return;
667
31f18b77 668 const auto now = ceph::coarse_mono_clock::now();
3efd9988 669
11fdf7f2
TL
670 const auto mgr_beacon_grace =
671 g_conf().get_val<std::chrono::seconds>("mon_mgr_beacon_grace");
3efd9988
FG
672
673 // Note that this is the mgr daemon's tick period, not ours (the
674 // beacon is sent with this period).
11fdf7f2
TL
675 const auto mgr_tick_period =
676 g_conf().get_val<std::chrono::seconds>("mgr_tick_period");
3efd9988
FG
677
678 if (last_tick != ceph::coarse_mono_clock::time_point::min()
679 && (now - last_tick > (mgr_beacon_grace - mgr_tick_period))) {
680 // This case handles either local slowness (calls being delayed
681 // for whatever reason) or cluster election slowness (a long gap
682 // between calls while an election happened)
683 dout(4) << __func__ << ": resetting beacon timeouts due to mon delay "
684 "(slow election?) of " << now - last_tick << " seconds" << dendl;
685 for (auto &i : last_beacon) {
686 i.second = now;
687 }
688 }
689
690 last_tick = now;
7c673cae
FG
691
692 // Populate any missing beacons (i.e. no beacon since MgrMonitor
693 // instantiation) with the current time, so that they will
694 // eventually look laggy if they fail to give us a beacon.
695 if (pending_map.active_gid != 0
696 && last_beacon.count(pending_map.active_gid) == 0) {
697 last_beacon[pending_map.active_gid] = now;
698 }
699 for (auto s : pending_map.standbys) {
700 if (last_beacon.count(s.first) == 0) {
701 last_beacon[s.first] = now;
702 }
703 }
704
705 // Cull standbys first so that any remaining standbys
706 // will be eligible to take over from the active if we cull him.
707 std::list<uint64_t> dead_standbys;
3efd9988 708 const auto cutoff = now - mgr_beacon_grace;
7c673cae
FG
709 for (const auto &i : pending_map.standbys) {
710 auto last_beacon_time = last_beacon.at(i.first);
711 if (last_beacon_time < cutoff) {
712 dead_standbys.push_back(i.first);
713 }
714 }
715
716 bool propose = false;
717
718 for (auto i : dead_standbys) {
719 dout(4) << "Dropping laggy standby " << i << dendl;
720 drop_standby(i);
721 propose = true;
722 }
723
724 if (pending_map.active_gid != 0
725 && last_beacon.at(pending_map.active_gid) < cutoff) {
224ce89b 726 const std::string old_active_name = pending_map.active_name;
7c673cae
FG
727 drop_active();
728 propose = true;
729 dout(4) << "Dropping active" << pending_map.active_gid << dendl;
730 if (promote_standby()) {
731 dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
224ce89b
WB
732 mon->clog->info() << "Manager daemon " << old_active_name
733 << " is unresponsive, replacing it with standby"
734 << " daemon " << pending_map.active_name;
7c673cae
FG
735 } else {
736 dout(4) << "Active is laggy but have no standbys to replace it" << dendl;
28e407b8 737 mon->clog->info() << "Manager daemon " << old_active_name
224ce89b 738 << " is unresponsive. No standby daemons available.";
7c673cae
FG
739 }
740 } else if (pending_map.active_gid == 0) {
741 if (promote_standby()) {
742 dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
224ce89b 743 mon->clog->info() << "Activating manager daemon "
3efd9988 744 << pending_map.active_name;
7c673cae
FG
745 propose = true;
746 }
747 }
748
224ce89b 749 if (!pending_map.available &&
c07f9fc5 750 !ever_had_active_mgr &&
224ce89b 751 should_warn_about_mgr_down() != HEALTH_OK) {
3efd9988 752 dout(10) << " exceeded mon_mgr_mkfs_grace "
11fdf7f2 753 << g_conf().get_val<int64_t>("mon_mgr_mkfs_grace")
3efd9988 754 << " seconds" << dendl;
224ce89b
WB
755 propose = true;
756 }
757
7c673cae
FG
758 if (propose) {
759 propose_pending();
760 }
761}
762
224ce89b
WB
763void MgrMonitor::on_restart()
764{
765 // Clear out the leader-specific state.
766 last_beacon.clear();
3efd9988 767 last_tick = ceph::coarse_mono_clock::now();
224ce89b
WB
768}
769
770
7c673cae
FG
771bool MgrMonitor::promote_standby()
772{
11fdf7f2 773 ceph_assert(pending_map.active_gid == 0);
7c673cae
FG
774 if (pending_map.standbys.size()) {
775 // Promote a replacement (arbitrary choice of standby)
776 auto replacement_gid = pending_map.standbys.begin()->first;
777 pending_map.active_gid = replacement_gid;
778 pending_map.active_name = pending_map.standbys.at(replacement_gid).name;
779 pending_map.available = false;
11fdf7f2
TL
780 pending_map.active_addrs = entity_addrvec_t();
781 pending_map.active_change = ceph_clock_now();
7c673cae 782
181888fb
FG
783 drop_standby(replacement_gid, false);
784
7c673cae
FG
785 return true;
786 } else {
787 return false;
788 }
789}
790
791void MgrMonitor::drop_active()
792{
793 if (last_beacon.count(pending_map.active_gid) > 0) {
794 last_beacon.erase(pending_map.active_gid);
795 }
796
c07f9fc5
FG
797 pending_metadata_rm.insert(pending_map.active_name);
798 pending_metadata.erase(pending_map.active_name);
7c673cae
FG
799 pending_map.active_name = "";
800 pending_map.active_gid = 0;
11fdf7f2 801 pending_map.active_change = ceph_clock_now();
7c673cae 802 pending_map.available = false;
11fdf7f2 803 pending_map.active_addrs = entity_addrvec_t();
3efd9988 804 pending_map.services.clear();
224ce89b
WB
805
806 // So that when new active mgr subscribes to mgrdigest, it will
807 // get an immediate response instead of waiting for next timer
808 cancel_timer();
7c673cae
FG
809}
810
181888fb 811void MgrMonitor::drop_standby(uint64_t gid, bool drop_meta)
7c673cae 812{
181888fb
FG
813 if (drop_meta) {
814 pending_metadata_rm.insert(pending_map.standbys[gid].name);
815 pending_metadata.erase(pending_map.standbys[gid].name);
816 }
7c673cae
FG
817 pending_map.standbys.erase(gid);
818 if (last_beacon.count(gid) > 0) {
819 last_beacon.erase(gid);
820 }
7c673cae
FG
821}
822
823bool MgrMonitor::preprocess_command(MonOpRequestRef op)
824{
31f18b77
FG
825 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
826 std::stringstream ss;
827 bufferlist rdata;
828
11fdf7f2 829 cmdmap_t cmdmap;
31f18b77
FG
830 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
831 string rs = ss.str();
832 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
833 return true;
834 }
7c673cae 835
11fdf7f2 836 MonSession *session = op->get_session();
31f18b77
FG
837 if (!session) {
838 mon->reply_command(op, -EACCES, "access denied", rdata,
839 get_last_committed());
840 return true;
841 }
842
843 string format;
494da23a
TL
844 cmd_getval(g_ceph_context, cmdmap, "format", format);
845 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty",
846 "json-pretty"));
31f18b77
FG
847
848 string prefix;
11fdf7f2 849 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
31f18b77
FG
850 int r = 0;
851
852 if (prefix == "mgr dump") {
853 int64_t epoch = 0;
11fdf7f2 854 cmd_getval(g_ceph_context, cmdmap, "epoch", epoch, (int64_t)map.get_epoch());
31f18b77
FG
855 if (epoch == (int64_t)map.get_epoch()) {
856 f->dump_object("mgrmap", map);
857 } else {
858 bufferlist bl;
859 int err = get_version(epoch, bl);
860 if (err == -ENOENT) {
861 r = -ENOENT;
862 ss << "there is no map for epoch " << epoch;
863 goto reply;
864 }
865 MgrMap m;
11fdf7f2 866 auto p = bl.cbegin();
31f18b77
FG
867 m.decode(p);
868 f->dump_object("mgrmap", m);
869 }
870 f->flush(rdata);
224ce89b 871 } else if (prefix == "mgr module ls") {
3efd9988
FG
872 f->open_object_section("modules");
873 {
92f5a8d4
TL
874 f->open_array_section("always_on_modules");
875 for (auto& p : map.get_always_on_modules()) {
876 f->dump_string("module", p);
877 }
878 f->close_section();
3efd9988
FG
879 f->open_array_section("enabled_modules");
880 for (auto& p : map.modules) {
11fdf7f2
TL
881 if (map.get_always_on_modules().count(p) > 0)
882 continue;
883 // We only show the name for enabled modules. The any errors
884 // etc will show up as a health checks.
3efd9988
FG
885 f->dump_string("module", p);
886 }
887 f->close_section();
888 f->open_array_section("disabled_modules");
889 for (auto& p : map.available_modules) {
11fdf7f2
TL
890 if (map.modules.count(p.name) == 0 &&
891 map.get_always_on_modules().count(p.name) == 0) {
892 // For disabled modules, we show the full info, to
893 // give a hint about whether enabling it will work
894 p.dump(f.get());
3efd9988
FG
895 }
896 }
897 f->close_section();
898 }
899 f->close_section();
900 f->flush(rdata);
901 } else if (prefix == "mgr services") {
902 f->open_object_section("services");
903 for (const auto &i : map.services) {
904 f->dump_string(i.first.c_str(), i.second);
224ce89b
WB
905 }
906 f->close_section();
907 f->flush(rdata);
c07f9fc5
FG
908 } else if (prefix == "mgr metadata") {
909 string name;
11fdf7f2 910 cmd_getval(g_ceph_context, cmdmap, "who", name);
c07f9fc5
FG
911 if (name.size() > 0 && !map.have_name(name)) {
912 ss << "mgr." << name << " does not exist";
913 r = -ENOENT;
914 goto reply;
915 }
916 string format;
11fdf7f2 917 cmd_getval(g_ceph_context, cmdmap, "format", format);
c07f9fc5
FG
918 boost::scoped_ptr<Formatter> f(Formatter::create(format, "json-pretty", "json-pretty"));
919 if (name.size()) {
920 f->open_object_section("mgr_metadata");
11fdf7f2 921 f->dump_string("name", name);
c07f9fc5
FG
922 r = dump_metadata(name, f.get(), &ss);
923 if (r < 0)
924 goto reply;
925 f->close_section();
926 } else {
927 r = 0;
928 f->open_array_section("mgr_metadata");
929 for (auto& i : map.get_all_names()) {
930 f->open_object_section("mgr");
11fdf7f2 931 f->dump_string("name", i);
c07f9fc5
FG
932 r = dump_metadata(i, f.get(), NULL);
933 if (r == -EINVAL || r == -ENOENT) {
934 // Drop error, continue to get other daemons' metadata
935 dout(4) << "No metadata for mgr." << i << dendl;
936 r = 0;
937 } else if (r < 0) {
938 // Unexpected error
939 goto reply;
940 }
941 f->close_section();
942 }
943 f->close_section();
944 }
945 f->flush(rdata);
946 } else if (prefix == "mgr versions") {
c07f9fc5
FG
947 count_metadata("ceph_version", f.get());
948 f->flush(rdata);
949 r = 0;
950 } else if (prefix == "mgr count-metadata") {
c07f9fc5 951 string field;
11fdf7f2 952 cmd_getval(g_ceph_context, cmdmap, "property", field);
c07f9fc5
FG
953 count_metadata(field, f.get());
954 f->flush(rdata);
955 r = 0;
31f18b77
FG
956 } else {
957 return false;
958 }
959
960reply:
961 string rs;
962 getline(ss, rs);
963 mon->reply_command(op, r, rs, rdata, get_last_committed());
964 return true;
7c673cae
FG
965}
966
967bool MgrMonitor::prepare_command(MonOpRequestRef op)
968{
969 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
970
971 std::stringstream ss;
972 bufferlist rdata;
973
11fdf7f2 974 cmdmap_t cmdmap;
7c673cae
FG
975 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
976 string rs = ss.str();
977 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
978 return true;
979 }
980
11fdf7f2 981 MonSession *session = op->get_session();
7c673cae
FG
982 if (!session) {
983 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
984 return true;
985 }
986
224ce89b 987 string format;
11fdf7f2 988 cmd_getval(g_ceph_context, cmdmap, "format", format, string("plain"));
224ce89b
WB
989 boost::scoped_ptr<Formatter> f(Formatter::create(format));
990
7c673cae 991 string prefix;
11fdf7f2 992 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
7c673cae
FG
993
994 int r = 0;
995
996 if (prefix == "mgr fail") {
997 string who;
11fdf7f2 998 cmd_getval(g_ceph_context, cmdmap, "who", who);
7c673cae
FG
999
1000 std::string err;
1001 uint64_t gid = strict_strtol(who.c_str(), 10, &err);
1002 bool changed = false;
1003 if (!err.empty()) {
1004 // Does not parse as a gid, treat it as a name
1005 if (pending_map.active_name == who) {
1006 drop_active();
1007 changed = true;
1008 } else {
1009 gid = 0;
1010 for (const auto &i : pending_map.standbys) {
1011 if (i.second.name == who) {
1012 gid = i.first;
1013 break;
1014 }
1015 }
1016 if (gid != 0) {
1017 drop_standby(gid);
1018 changed = true;
1019 } else {
1020 ss << "Daemon not found '" << who << "', already failed?";
1021 }
1022 }
1023 } else {
1024 if (pending_map.active_gid == gid) {
1025 drop_active();
1026 changed = true;
1027 } else if (pending_map.standbys.count(gid) > 0) {
1028 drop_standby(gid);
1029 changed = true;
1030 } else {
1031 ss << "Daemon not found '" << gid << "', already failed?";
1032 }
1033 }
1034
1035 if (changed && pending_map.active_gid == 0) {
1036 promote_standby();
1037 }
224ce89b
WB
1038 } else if (prefix == "mgr module enable") {
1039 string module;
11fdf7f2 1040 cmd_getval(g_ceph_context, cmdmap, "module", module);
224ce89b
WB
1041 if (module.empty()) {
1042 r = -EINVAL;
1043 goto out;
1044 }
11fdf7f2
TL
1045 if (pending_map.get_always_on_modules().count(module) > 0) {
1046 ss << "module '" << module << "' is already enabled (always-on)";
1047 goto out;
1048 }
224ce89b 1049 string force;
11fdf7f2 1050 cmd_getval(g_ceph_context, cmdmap, "force", force);
224ce89b
WB
1051 if (!pending_map.all_support_module(module) &&
1052 force != "--force") {
1053 ss << "all mgr daemons do not support module '" << module << "', pass "
1054 << "--force to force enablement";
1055 r = -ENOENT;
1056 goto out;
1057 }
11fdf7f2
TL
1058
1059 std::string can_run_error;
1060 if (force != "--force" && !pending_map.can_run_module(module, &can_run_error)) {
1061 ss << "module '" << module << "' reports that it cannot run on the active "
1062 "manager daemon: " << can_run_error << " (pass --force to force "
1063 "enablement)";
1064 r = -ENOENT;
1065 goto out;
1066 }
1067
1068 if (pending_map.module_enabled(module)) {
1069 ss << "module '" << module << "' is already enabled";
1070 r = 0;
1071 goto out;
1072 }
224ce89b
WB
1073 pending_map.modules.insert(module);
1074 } else if (prefix == "mgr module disable") {
1075 string module;
11fdf7f2 1076 cmd_getval(g_ceph_context, cmdmap, "module", module);
224ce89b
WB
1077 if (module.empty()) {
1078 r = -EINVAL;
1079 goto out;
1080 }
11fdf7f2
TL
1081 if (pending_map.get_always_on_modules().count(module) > 0) {
1082 ss << "module '" << module << "' cannot be disabled (always-on)";
1083 r = -EINVAL;
1084 goto out;
1085 }
1086 if (!pending_map.module_enabled(module)) {
1087 ss << "module '" << module << "' is already disabled";
1088 r = 0;
1089 goto out;
1090 }
1091 if (!pending_map.any_supports_module(module)) {
1092 ss << "module '" << module << "' does not exist";
1093 }
224ce89b 1094 pending_map.modules.erase(module);
7c673cae 1095 } else {
224ce89b 1096 ss << "Command '" << prefix << "' not implemented!";
7c673cae
FG
1097 r = -ENOSYS;
1098 }
1099
224ce89b 1100out:
7c673cae
FG
1101 dout(4) << __func__ << " done, r=" << r << dendl;
1102 /* Compose response */
1103 string rs;
1104 getline(ss, rs);
1105
1106 if (r >= 0) {
1107 // success.. delay reply
1108 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
1109 get_last_committed() + 1));
1110 return true;
1111 } else {
1112 // reply immediately
1113 mon->reply_command(op, r, rs, rdata, get_last_committed());
1114 return false;
1115 }
1116}
1117
1118void MgrMonitor::init()
1119{
31f18b77 1120 if (digest_event == nullptr) {
7c673cae
FG
1121 send_digests(); // To get it to schedule its own event
1122 }
1123}
1124
1125void MgrMonitor::on_shutdown()
1126{
31f18b77 1127 cancel_timer();
7c673cae
FG
1128}
1129
c07f9fc5 1130int MgrMonitor::load_metadata(const string& name, std::map<string, string>& m,
11fdf7f2 1131 ostream *err) const
c07f9fc5
FG
1132{
1133 bufferlist bl;
1134 int r = mon->store->get(MGR_METADATA_PREFIX, name, bl);
1135 if (r < 0)
1136 return r;
1137 try {
11fdf7f2
TL
1138 auto p = bl.cbegin();
1139 decode(m, p);
c07f9fc5
FG
1140 }
1141 catch (buffer::error& e) {
1142 if (err)
1143 *err << "mgr." << name << " metadata is corrupt";
1144 return -EIO;
1145 }
1146 return 0;
1147}
1148
1149void MgrMonitor::count_metadata(const string& field, std::map<string,int> *out)
1150{
1151 std::set<string> ls = map.get_all_names();
1152 for (auto& name : ls) {
1153 std::map<string,string> meta;
1154 load_metadata(name, meta, nullptr);
1155 auto p = meta.find(field);
1156 if (p == meta.end()) {
1157 (*out)["unknown"]++;
1158 } else {
1159 (*out)[p->second]++;
1160 }
1161 }
1162}
1163
1164void MgrMonitor::count_metadata(const string& field, Formatter *f)
1165{
1166 std::map<string,int> by_val;
1167 count_metadata(field, &by_val);
1168 f->open_object_section(field.c_str());
1169 for (auto& p : by_val) {
1170 f->dump_int(p.first.c_str(), p.second);
1171 }
1172 f->close_section();
1173}
1174
1175int MgrMonitor::dump_metadata(const string& name, Formatter *f, ostream *err)
1176{
1177 std::map<string,string> m;
1178 if (int r = load_metadata(name, m, err))
1179 return r;
1180 for (auto& p : m) {
1181 f->dump_string(p.first.c_str(), p.second);
1182 }
1183 return 0;
1184}
31f18b77 1185
11fdf7f2
TL
1186void MgrMonitor::print_nodes(Formatter *f) const
1187{
1188 ceph_assert(f);
1189
1190 std::map<string, list<string> > mgrs; // hostname => mgr
1191 auto ls = map.get_all_names();
1192 for (auto& name : ls) {
1193 std::map<string,string> meta;
1194 if (load_metadata(name, meta, nullptr)) {
1195 continue;
1196 }
1197 auto hostname = meta.find("hostname");
1198 if (hostname == meta.end()) {
1199 // not likely though
1200 continue;
1201 }
1202 mgrs[hostname->second].push_back(name);
1203 }
1204
1205 dump_services(f, mgrs, "mgr");
1206}
1207
d2e6a577
FG
1208const std::vector<MonCommand> &MgrMonitor::get_command_descs() const
1209{
1210 if (command_descs.empty()) {
1211 // must have just upgraded; fallback to static commands
1212 return mgr_commands;
1213 } else {
1214 return command_descs;
1215 }
1216}