]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MgrMonitor.cc
863672afe7f5d7ccb29ec14a5a53bd6aa2bf32a5
[ceph.git] / ceph / src / mon / MgrMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2016 John Spray <john.spray@redhat.com>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 */
13
14 #include "messages/MMgrBeacon.h"
15 #include "messages/MMgrMap.h"
16 #include "messages/MMgrDigest.h"
17
18 #include "PGStatService.h"
19 #include "include/stringify.h"
20 #include "mgr/MgrContext.h"
21 #include "OSDMonitor.h"
22
23 #include "MgrMonitor.h"
24
25 #define dout_subsys ceph_subsys_mon
26 #undef dout_prefix
27 #define dout_prefix _prefix(_dout, mon, map)
28 static ostream& _prefix(std::ostream *_dout, Monitor *mon,
29 const MgrMap& mgrmap) {
30 return *_dout << "mon." << mon->name << "@" << mon->rank
31 << "(" << mon->get_state_name()
32 << ").mgr e" << mgrmap.get_epoch() << " ";
33 }
34
35
36 void MgrMonitor::create_initial()
37 {
38 }
39
40 void MgrMonitor::update_from_paxos(bool *need_bootstrap)
41 {
42 version_t version = get_last_committed();
43 if (version != map.epoch) {
44 dout(4) << "loading version " << version << dendl;
45
46 bufferlist bl;
47 int err = get_version(version, bl);
48 assert(err == 0);
49
50 bufferlist::iterator p = bl.begin();
51 map.decode(p);
52
53 dout(4) << "active server: " << map.active_addr
54 << "(" << map.active_gid << ")" << dendl;
55
56 if (map.available) {
57 first_seen_inactive = utime_t();
58 } else {
59 first_seen_inactive = ceph_clock_now();
60 }
61
62 check_subs();
63 }
64
65 // feed our pet MgrClient
66 mon->mgr_client.ms_dispatch(new MMgrMap(map));
67 }
68
69 void MgrMonitor::create_pending()
70 {
71 pending_map = map;
72 pending_map.epoch++;
73 }
74
75 void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t)
76 {
77 dout(10) << __func__ << " " << pending_map << dendl;
78 bufferlist bl;
79 pending_map.encode(bl, mon->get_quorum_con_features());
80 put_version(t, pending_map.epoch, bl);
81 put_last_committed(t, pending_map.epoch);
82 }
83
84 bool MgrMonitor::check_caps(MonOpRequestRef op, const uuid_d& fsid)
85 {
86 // check permissions
87 MonSession *session = op->get_session();
88 if (!session)
89 return false;
90 if (!session->is_capable("mgr", MON_CAP_X)) {
91 dout(1) << __func__ << " insufficient caps " << session->caps << dendl;
92 return false;
93 }
94 if (fsid != mon->monmap->fsid) {
95 dout(1) << __func__ << " op fsid " << fsid
96 << " != " << mon->monmap->fsid << dendl;
97 return false;
98 }
99 return true;
100 }
101
102 bool MgrMonitor::preprocess_query(MonOpRequestRef op)
103 {
104 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
105 switch (m->get_type()) {
106 case MSG_MGR_BEACON:
107 return preprocess_beacon(op);
108 case MSG_MON_COMMAND:
109 return preprocess_command(op);
110 default:
111 mon->no_reply(op);
112 derr << "Unhandled message type " << m->get_type() << dendl;
113 return true;
114 }
115 }
116
117 bool MgrMonitor::prepare_update(MonOpRequestRef op)
118 {
119 PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req());
120 switch (m->get_type()) {
121 case MSG_MGR_BEACON:
122 return prepare_beacon(op);
123
124 case MSG_MON_COMMAND:
125 return prepare_command(op);
126
127 default:
128 mon->no_reply(op);
129 derr << "Unhandled message type " << m->get_type() << dendl;
130 return true;
131 }
132 }
133
134
135
136 class C_Updated : public Context {
137 MgrMonitor *mm;
138 MonOpRequestRef op;
139 public:
140 C_Updated(MgrMonitor *a, MonOpRequestRef c) :
141 mm(a), op(c) {}
142 void finish(int r) override {
143 if (r >= 0) {
144 // Success
145 } else if (r == -ECANCELED) {
146 mm->mon->no_reply(op);
147 } else {
148 mm->dispatch(op); // try again
149 }
150 }
151 };
152
153 bool MgrMonitor::preprocess_beacon(MonOpRequestRef op)
154 {
155 MMgrBeacon *m = static_cast<MMgrBeacon*>(op->get_req());
156 dout(4) << "beacon from " << m->get_gid() << dendl;
157
158 if (!check_caps(op, m->get_fsid())) {
159 // drop it on the floor
160 return true;
161 }
162
163 // always send this to the leader's prepare_beacon()
164 return false;
165 }
166
167 bool MgrMonitor::prepare_beacon(MonOpRequestRef op)
168 {
169 MMgrBeacon *m = static_cast<MMgrBeacon*>(op->get_req());
170 dout(4) << "beacon from " << m->get_gid() << dendl;
171
172 // See if we are seeing same name, new GID for the active daemon
173 if (m->get_name() == pending_map.active_name
174 && m->get_gid() != pending_map.active_gid)
175 {
176 dout(4) << "Active daemon restart (mgr." << m->get_name() << ")" << dendl;
177 drop_active();
178 }
179
180 // See if we are seeing same name, new GID for any standbys
181 for (const auto &i : pending_map.standbys) {
182 const StandbyInfo &s = i.second;
183 if (s.name == m->get_name() && s.gid != m->get_gid()) {
184 dout(4) << "Standby daemon restart (mgr." << m->get_name() << ")" << dendl;
185 drop_standby(i.first);
186 break;
187 }
188 }
189
190 last_beacon[m->get_gid()] = ceph::coarse_mono_clock::now();
191
192 // Track whether we modified pending_map
193 bool updated = false;
194
195 if (pending_map.active_gid == m->get_gid()) {
196 // A beacon from the currently active daemon
197 if (pending_map.active_addr != m->get_server_addr()) {
198 dout(4) << "learned address " << m->get_server_addr()
199 << " (was " << pending_map.active_addr << ")" << dendl;
200 pending_map.active_addr = m->get_server_addr();
201 updated = true;
202 }
203
204 if (pending_map.get_available() != m->get_available()) {
205 dout(4) << "available " << m->get_gid() << dendl;
206 pending_map.available = m->get_available();
207 updated = true;
208 }
209 } else if (pending_map.active_gid == 0) {
210 // There is no currently active daemon, select this one.
211 if (pending_map.standbys.count(m->get_gid())) {
212 drop_standby(m->get_gid());
213 }
214 dout(4) << "selecting new active " << m->get_gid()
215 << " " << m->get_name()
216 << " (was " << pending_map.active_gid << " "
217 << pending_map.active_name << ")" << dendl;
218 pending_map.active_gid = m->get_gid();
219 pending_map.active_name = m->get_name();
220
221 updated = true;
222 } else {
223 if (pending_map.standbys.count(m->get_gid()) > 0) {
224 dout(10) << "from existing standby " << m->get_gid() << dendl;
225 } else {
226 dout(10) << "new standby " << m->get_gid() << dendl;
227 pending_map.standbys[m->get_gid()] = {m->get_gid(), m->get_name()};
228 updated = true;
229 }
230 }
231
232 if (updated) {
233 dout(4) << "updating map" << dendl;
234 wait_for_finished_proposal(op, new C_Updated(this, op));
235 } else {
236 dout(10) << "no change" << dendl;
237 }
238
239 return updated;
240 }
241
242 void MgrMonitor::check_subs()
243 {
244 const std::string type = "mgrmap";
245 if (mon->session_map.subs.count(type) == 0)
246 return;
247 for (auto sub : *(mon->session_map.subs[type])) {
248 check_sub(sub);
249 }
250 }
251
252 void MgrMonitor::check_sub(Subscription *sub)
253 {
254 if (sub->type == "mgrmap") {
255 if (sub->next <= map.get_epoch()) {
256 dout(20) << "Sending map to subscriber " << sub->session->con << dendl;
257 sub->session->con->send_message(new MMgrMap(map));
258 if (sub->onetime) {
259 mon->session_map.remove_sub(sub);
260 } else {
261 sub->next = map.get_epoch() + 1;
262 }
263 }
264 } else {
265 assert(sub->type == "mgrdigest");
266 if (digest_event == nullptr) {
267 send_digests();
268 }
269 }
270 }
271
272 /**
273 * Handle digest subscriptions separately (outside of check_sub) because
274 * they are going to be periodic rather than version-driven.
275 */
276 void MgrMonitor::send_digests()
277 {
278 cancel_timer();
279
280 if (!is_active()) {
281 return;
282 }
283
284 const std::string type = "mgrdigest";
285 if (mon->session_map.subs.count(type) == 0)
286 return;
287
288 for (auto sub : *(mon->session_map.subs[type])) {
289 MMgrDigest *mdigest = new MMgrDigest;
290
291 JSONFormatter f;
292 std::list<std::string> health_strs;
293 mon->get_health(health_strs, nullptr, &f);
294 f.flush(mdigest->health_json);
295 f.reset();
296
297 std::ostringstream ss;
298 mon->get_mon_status(&f, ss);
299 f.flush(mdigest->mon_status_json);
300 f.reset();
301
302 sub->session->con->send_message(mdigest);
303 }
304
305 digest_event = new C_MonContext(mon, [this](int){
306 send_digests();
307 });
308 mon->timer.add_event_after(g_conf->mon_mgr_digest_period, digest_event);
309 }
310
311 void MgrMonitor::cancel_timer()
312 {
313 if (digest_event) {
314 mon->timer.cancel_event(digest_event);
315 digest_event = nullptr;
316 }
317 }
318
319 void MgrMonitor::on_active()
320 {
321 if (mon->is_leader())
322 mon->clog->info() << "mgrmap e" << map.epoch << ": " << map;
323 }
324
325 void MgrMonitor::get_health(
326 list<pair<health_status_t,string> >& summary,
327 list<pair<health_status_t,string> > *detail,
328 CephContext *cct) const
329 {
330 // start mgr warnings as soon as the mons and osds are all upgraded,
331 // but before the require_luminous osdmap flag is set. this way the
332 // user gets some warning before the osd flag is set and mgr is
333 // actually *required*.
334 if (!mon->monmap->get_required_features().contains_all(
335 ceph::features::mon::FEATURE_LUMINOUS) ||
336 !HAVE_FEATURE(mon->osdmon()->osdmap.get_up_osd_features(),
337 SERVER_LUMINOUS)) {
338 return;
339 }
340
341 if (!map.available) {
342 auto level = HEALTH_WARN;
343 // do not escalate to ERR if they are still upgrading to jewel.
344 if (mon->osdmon()->osdmap.require_osd_release >= CEPH_RELEASE_LUMINOUS) {
345 utime_t now = ceph_clock_now();
346 if (first_seen_inactive != utime_t() &&
347 now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) {
348 level = HEALTH_ERR;
349 }
350 }
351 summary.push_back(make_pair(level, "no active mgr"));
352 }
353 }
354
355 void MgrMonitor::tick()
356 {
357 if (!is_active() || !mon->is_leader())
358 return;
359
360 const auto now = ceph::coarse_mono_clock::now();
361 const auto cutoff = now - std::chrono::seconds(g_conf->mon_mgr_beacon_grace);
362
363 // Populate any missing beacons (i.e. no beacon since MgrMonitor
364 // instantiation) with the current time, so that they will
365 // eventually look laggy if they fail to give us a beacon.
366 if (pending_map.active_gid != 0
367 && last_beacon.count(pending_map.active_gid) == 0) {
368 last_beacon[pending_map.active_gid] = now;
369 }
370 for (auto s : pending_map.standbys) {
371 if (last_beacon.count(s.first) == 0) {
372 last_beacon[s.first] = now;
373 }
374 }
375
376 // Cull standbys first so that any remaining standbys
377 // will be eligible to take over from the active if we cull him.
378 std::list<uint64_t> dead_standbys;
379 for (const auto &i : pending_map.standbys) {
380 auto last_beacon_time = last_beacon.at(i.first);
381 if (last_beacon_time < cutoff) {
382 dead_standbys.push_back(i.first);
383 }
384 }
385
386 bool propose = false;
387
388 for (auto i : dead_standbys) {
389 dout(4) << "Dropping laggy standby " << i << dendl;
390 drop_standby(i);
391 propose = true;
392 }
393
394 if (pending_map.active_gid != 0
395 && last_beacon.at(pending_map.active_gid) < cutoff) {
396
397 drop_active();
398 propose = true;
399 dout(4) << "Dropping active" << pending_map.active_gid << dendl;
400 if (promote_standby()) {
401 dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
402 } else {
403 dout(4) << "Active is laggy but have no standbys to replace it" << dendl;
404 }
405 } else if (pending_map.active_gid == 0) {
406 if (promote_standby()) {
407 dout(4) << "Promoted standby " << pending_map.active_gid << dendl;
408 propose = true;
409 }
410 }
411
412 if (propose) {
413 propose_pending();
414 }
415 }
416
417 bool MgrMonitor::promote_standby()
418 {
419 assert(pending_map.active_gid == 0);
420 if (pending_map.standbys.size()) {
421 // Promote a replacement (arbitrary choice of standby)
422 auto replacement_gid = pending_map.standbys.begin()->first;
423 pending_map.active_gid = replacement_gid;
424 pending_map.active_name = pending_map.standbys.at(replacement_gid).name;
425 pending_map.available = false;
426 pending_map.active_addr = entity_addr_t();
427
428 drop_standby(replacement_gid);
429 return true;
430 } else {
431 return false;
432 }
433 }
434
435 void MgrMonitor::drop_active()
436 {
437 if (last_beacon.count(pending_map.active_gid) > 0) {
438 last_beacon.erase(pending_map.active_gid);
439 }
440
441 pending_map.active_name = "";
442 pending_map.active_gid = 0;
443 pending_map.available = false;
444 pending_map.active_addr = entity_addr_t();
445 }
446
447 void MgrMonitor::drop_standby(uint64_t gid)
448 {
449 pending_map.standbys.erase(gid);
450 if (last_beacon.count(gid) > 0) {
451 last_beacon.erase(gid);
452 }
453
454 }
455
456 bool MgrMonitor::preprocess_command(MonOpRequestRef op)
457 {
458 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
459 std::stringstream ss;
460 bufferlist rdata;
461
462 std::map<std::string, cmd_vartype> cmdmap;
463 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
464 string rs = ss.str();
465 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
466 return true;
467 }
468
469 MonSession *session = m->get_session();
470 if (!session) {
471 mon->reply_command(op, -EACCES, "access denied", rdata,
472 get_last_committed());
473 return true;
474 }
475
476 string format;
477 cmd_getval(g_ceph_context, cmdmap, "format", format, string("json-pretty"));
478 boost::scoped_ptr<Formatter> f(Formatter::create(format));
479
480 string prefix;
481 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
482 int r = 0;
483
484 if (prefix == "mgr dump") {
485 int64_t epoch = 0;
486 cmd_getval(g_ceph_context, cmdmap, "epoch", epoch, (int64_t)map.get_epoch());
487 if (epoch == (int64_t)map.get_epoch()) {
488 f->dump_object("mgrmap", map);
489 } else {
490 bufferlist bl;
491 int err = get_version(epoch, bl);
492 if (err == -ENOENT) {
493 r = -ENOENT;
494 ss << "there is no map for epoch " << epoch;
495 goto reply;
496 }
497 MgrMap m;
498 auto p = bl.begin();
499 m.decode(p);
500 f->dump_object("mgrmap", m);
501 }
502 f->flush(rdata);
503 } else {
504 return false;
505 }
506
507 reply:
508 string rs;
509 getline(ss, rs);
510 mon->reply_command(op, r, rs, rdata, get_last_committed());
511 return true;
512 }
513
514 bool MgrMonitor::prepare_command(MonOpRequestRef op)
515 {
516 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
517
518 std::stringstream ss;
519 bufferlist rdata;
520
521 std::map<std::string, cmd_vartype> cmdmap;
522 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
523 string rs = ss.str();
524 mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed());
525 return true;
526 }
527
528 MonSession *session = m->get_session();
529 if (!session) {
530 mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed());
531 return true;
532 }
533
534 string prefix;
535 cmd_getval(g_ceph_context, cmdmap, "prefix", prefix);
536
537 int r = 0;
538
539 if (prefix == "mgr fail") {
540 string who;
541 cmd_getval(g_ceph_context, cmdmap, "who", who);
542
543 std::string err;
544 uint64_t gid = strict_strtol(who.c_str(), 10, &err);
545 bool changed = false;
546 if (!err.empty()) {
547 // Does not parse as a gid, treat it as a name
548 if (pending_map.active_name == who) {
549 drop_active();
550 changed = true;
551 } else {
552 gid = 0;
553 for (const auto &i : pending_map.standbys) {
554 if (i.second.name == who) {
555 gid = i.first;
556 break;
557 }
558 }
559 if (gid != 0) {
560 drop_standby(gid);
561 changed = true;
562 } else {
563 ss << "Daemon not found '" << who << "', already failed?";
564 }
565 }
566 } else {
567 if (pending_map.active_gid == gid) {
568 drop_active();
569 changed = true;
570 } else if (pending_map.standbys.count(gid) > 0) {
571 drop_standby(gid);
572 changed = true;
573 } else {
574 ss << "Daemon not found '" << gid << "', already failed?";
575 }
576 }
577
578 if (changed && pending_map.active_gid == 0) {
579 promote_standby();
580 }
581 } else {
582 r = -ENOSYS;
583 }
584
585 dout(4) << __func__ << " done, r=" << r << dendl;
586 /* Compose response */
587 string rs;
588 getline(ss, rs);
589
590 if (r >= 0) {
591 // success.. delay reply
592 wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs,
593 get_last_committed() + 1));
594 return true;
595 } else {
596 // reply immediately
597 mon->reply_command(op, r, rs, rdata, get_last_committed());
598 return false;
599 }
600 }
601
602 void MgrMonitor::init()
603 {
604 if (digest_event == nullptr) {
605 send_digests(); // To get it to schedule its own event
606 }
607 }
608
609 void MgrMonitor::on_shutdown()
610 {
611 cancel_timer();
612 }
613
614