]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2016 John Spray <john.spray@redhat.com> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | */ | |
13 | ||
14 | #include "messages/MMgrBeacon.h" | |
15 | #include "messages/MMgrMap.h" | |
16 | #include "messages/MMgrDigest.h" | |
17 | ||
18 | #include "PGMap.h" | |
19 | #include "PGMonitor.h" | |
20 | #include "include/stringify.h" | |
21 | #include "mgr/MgrContext.h" | |
22 | #include "OSDMonitor.h" | |
23 | ||
24 | #include "MgrMonitor.h" | |
25 | ||
26 | #define dout_subsys ceph_subsys_mon | |
27 | #undef dout_prefix | |
28 | #define dout_prefix _prefix(_dout, mon, map) | |
29 | static ostream& _prefix(std::ostream *_dout, Monitor *mon, | |
30 | const MgrMap& mgrmap) { | |
31 | return *_dout << "mon." << mon->name << "@" << mon->rank | |
32 | << "(" << mon->get_state_name() | |
33 | << ").mgr e" << mgrmap.get_epoch() << " "; | |
34 | } | |
35 | ||
36 | void MgrMonitor::create_initial() | |
37 | { | |
38 | } | |
39 | ||
40 | void MgrMonitor::update_from_paxos(bool *need_bootstrap) | |
41 | { | |
42 | version_t version = get_last_committed(); | |
43 | if (version != map.epoch) { | |
44 | dout(4) << "loading version " << version << dendl; | |
45 | ||
46 | bufferlist bl; | |
47 | int err = get_version(version, bl); | |
48 | assert(err == 0); | |
49 | ||
50 | bufferlist::iterator p = bl.begin(); | |
51 | map.decode(p); | |
52 | ||
53 | dout(4) << "active server: " << map.active_addr | |
54 | << "(" << map.active_gid << ")" << dendl; | |
55 | ||
56 | if (map.available) { | |
57 | first_seen_inactive = utime_t(); | |
58 | } else { | |
59 | first_seen_inactive = ceph_clock_now(); | |
60 | } | |
61 | ||
62 | check_subs(); | |
63 | } | |
64 | ||
65 | // feed our pet MgrClient | |
66 | mon->mgr_client.ms_dispatch(new MMgrMap(map)); | |
67 | } | |
68 | ||
69 | void MgrMonitor::create_pending() | |
70 | { | |
71 | pending_map = map; | |
72 | pending_map.epoch++; | |
73 | } | |
74 | ||
75 | void MgrMonitor::encode_pending(MonitorDBStore::TransactionRef t) | |
76 | { | |
77 | dout(10) << __func__ << " " << pending_map << dendl; | |
78 | bufferlist bl; | |
79 | pending_map.encode(bl, mon->get_quorum_con_features()); | |
80 | put_version(t, pending_map.epoch, bl); | |
81 | put_last_committed(t, pending_map.epoch); | |
82 | } | |
83 | ||
84 | bool MgrMonitor::check_caps(MonOpRequestRef op, const uuid_d& fsid) | |
85 | { | |
86 | // check permissions | |
87 | MonSession *session = op->get_session(); | |
88 | if (!session) | |
89 | return false; | |
90 | if (!session->is_capable("mgr", MON_CAP_X)) { | |
91 | dout(1) << __func__ << " insufficient caps " << session->caps << dendl; | |
92 | return false; | |
93 | } | |
94 | if (fsid != mon->monmap->fsid) { | |
95 | dout(1) << __func__ << " op fsid " << fsid | |
96 | << " != " << mon->monmap->fsid << dendl; | |
97 | return false; | |
98 | } | |
99 | return true; | |
100 | } | |
101 | ||
102 | bool MgrMonitor::preprocess_query(MonOpRequestRef op) | |
103 | { | |
104 | PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req()); | |
105 | switch (m->get_type()) { | |
106 | case MSG_MGR_BEACON: | |
107 | return preprocess_beacon(op); | |
108 | case MSG_MON_COMMAND: | |
109 | return preprocess_command(op); | |
110 | default: | |
111 | mon->no_reply(op); | |
112 | derr << "Unhandled message type " << m->get_type() << dendl; | |
113 | return true; | |
114 | } | |
115 | } | |
116 | ||
117 | bool MgrMonitor::prepare_update(MonOpRequestRef op) | |
118 | { | |
119 | PaxosServiceMessage *m = static_cast<PaxosServiceMessage*>(op->get_req()); | |
120 | switch (m->get_type()) { | |
121 | case MSG_MGR_BEACON: | |
122 | return prepare_beacon(op); | |
123 | ||
124 | case MSG_MON_COMMAND: | |
125 | return prepare_command(op); | |
126 | ||
127 | default: | |
128 | mon->no_reply(op); | |
129 | derr << "Unhandled message type " << m->get_type() << dendl; | |
130 | return true; | |
131 | } | |
132 | } | |
133 | ||
134 | ||
135 | ||
136 | class C_Updated : public Context { | |
137 | MgrMonitor *mm; | |
138 | MonOpRequestRef op; | |
139 | public: | |
140 | C_Updated(MgrMonitor *a, MonOpRequestRef c) : | |
141 | mm(a), op(c) {} | |
142 | void finish(int r) override { | |
143 | if (r >= 0) { | |
144 | // Success | |
145 | } else if (r == -ECANCELED) { | |
146 | mm->mon->no_reply(op); | |
147 | } else { | |
148 | mm->dispatch(op); // try again | |
149 | } | |
150 | } | |
151 | }; | |
152 | ||
153 | bool MgrMonitor::preprocess_beacon(MonOpRequestRef op) | |
154 | { | |
155 | MMgrBeacon *m = static_cast<MMgrBeacon*>(op->get_req()); | |
156 | dout(4) << "beacon from " << m->get_gid() << dendl; | |
157 | ||
158 | if (!check_caps(op, m->get_fsid())) { | |
159 | // drop it on the floor | |
160 | return true; | |
161 | } | |
162 | ||
163 | // always send this to the leader's prepare_beacon() | |
164 | return false; | |
165 | } | |
166 | ||
167 | bool MgrMonitor::prepare_beacon(MonOpRequestRef op) | |
168 | { | |
169 | MMgrBeacon *m = static_cast<MMgrBeacon*>(op->get_req()); | |
170 | dout(4) << "beacon from " << m->get_gid() << dendl; | |
171 | ||
172 | // See if we are seeing same name, new GID for the active daemon | |
173 | if (m->get_name() == pending_map.active_name | |
174 | && m->get_gid() != pending_map.active_gid) | |
175 | { | |
176 | dout(4) << "Active daemon restart (mgr." << m->get_name() << ")" << dendl; | |
177 | drop_active(); | |
178 | } | |
179 | ||
180 | // See if we are seeing same name, new GID for any standbys | |
181 | for (const auto &i : pending_map.standbys) { | |
182 | const StandbyInfo &s = i.second; | |
183 | if (s.name == m->get_name() && s.gid != m->get_gid()) { | |
184 | dout(4) << "Standby daemon restart (mgr." << m->get_name() << ")" << dendl; | |
185 | drop_standby(i.first); | |
186 | break; | |
187 | } | |
188 | } | |
189 | ||
190 | last_beacon[m->get_gid()] = ceph_clock_now(); | |
191 | ||
192 | // Track whether we modified pending_map | |
193 | bool updated = false; | |
194 | ||
195 | if (pending_map.active_gid == m->get_gid()) { | |
196 | // A beacon from the currently active daemon | |
197 | if (pending_map.active_addr != m->get_server_addr()) { | |
198 | dout(4) << "learned address " << m->get_server_addr() | |
199 | << " (was " << pending_map.active_addr << ")" << dendl; | |
200 | pending_map.active_addr = m->get_server_addr(); | |
201 | updated = true; | |
202 | } | |
203 | ||
204 | if (pending_map.get_available() != m->get_available()) { | |
205 | dout(4) << "available " << m->get_gid() << dendl; | |
206 | pending_map.available = m->get_available(); | |
207 | updated = true; | |
208 | } | |
209 | } else if (pending_map.active_gid == 0) { | |
210 | // There is no currently active daemon, select this one. | |
211 | if (pending_map.standbys.count(m->get_gid())) { | |
212 | drop_standby(m->get_gid()); | |
213 | } | |
214 | dout(4) << "selecting new active " << m->get_gid() | |
215 | << " " << m->get_name() | |
216 | << " (was " << pending_map.active_gid << " " | |
217 | << pending_map.active_name << ")" << dendl; | |
218 | pending_map.active_gid = m->get_gid(); | |
219 | pending_map.active_name = m->get_name(); | |
220 | ||
221 | updated = true; | |
222 | } else { | |
223 | if (pending_map.standbys.count(m->get_gid()) > 0) { | |
224 | dout(10) << "from existing standby " << m->get_gid() << dendl; | |
225 | } else { | |
226 | dout(10) << "new standby " << m->get_gid() << dendl; | |
227 | pending_map.standbys[m->get_gid()] = {m->get_gid(), m->get_name()}; | |
228 | updated = true; | |
229 | } | |
230 | } | |
231 | ||
232 | if (updated) { | |
233 | dout(4) << "updating map" << dendl; | |
234 | wait_for_finished_proposal(op, new C_Updated(this, op)); | |
235 | } else { | |
236 | dout(10) << "no change" << dendl; | |
237 | } | |
238 | ||
239 | return updated; | |
240 | } | |
241 | ||
242 | void MgrMonitor::check_subs() | |
243 | { | |
244 | const std::string type = "mgrmap"; | |
245 | if (mon->session_map.subs.count(type) == 0) | |
246 | return; | |
247 | for (auto sub : *(mon->session_map.subs[type])) { | |
248 | check_sub(sub); | |
249 | } | |
250 | } | |
251 | ||
252 | void MgrMonitor::check_sub(Subscription *sub) | |
253 | { | |
254 | if (sub->type == "mgrmap") { | |
255 | if (sub->next <= map.get_epoch()) { | |
256 | dout(20) << "Sending map to subscriber " << sub->session->con << dendl; | |
257 | sub->session->con->send_message(new MMgrMap(map)); | |
258 | if (sub->onetime) { | |
259 | mon->session_map.remove_sub(sub); | |
260 | } else { | |
261 | sub->next = map.get_epoch() + 1; | |
262 | } | |
263 | } | |
264 | } else { | |
265 | assert(sub->type == "mgrdigest"); | |
266 | if (digest_callback == nullptr) { | |
267 | send_digests(); | |
268 | } | |
269 | } | |
270 | } | |
271 | ||
272 | /** | |
273 | * Handle digest subscriptions separately (outside of check_sub) because | |
274 | * they are going to be periodic rather than version-driven. | |
275 | */ | |
276 | void MgrMonitor::send_digests() | |
277 | { | |
278 | digest_callback = nullptr; | |
279 | ||
280 | const std::string type = "mgrdigest"; | |
281 | if (mon->session_map.subs.count(type) == 0) | |
282 | return; | |
283 | ||
284 | for (auto sub : *(mon->session_map.subs[type])) { | |
285 | MMgrDigest *mdigest = new MMgrDigest; | |
286 | ||
287 | JSONFormatter f; | |
288 | std::list<std::string> health_strs; | |
289 | mon->get_health(health_strs, nullptr, &f); | |
290 | f.flush(mdigest->health_json); | |
291 | f.reset(); | |
292 | ||
293 | std::ostringstream ss; | |
294 | mon->get_mon_status(&f, ss); | |
295 | f.flush(mdigest->mon_status_json); | |
296 | f.reset(); | |
297 | ||
298 | sub->session->con->send_message(mdigest); | |
299 | } | |
300 | ||
301 | digest_callback = new C_MonContext(mon, [this](int){ | |
302 | send_digests(); | |
303 | }); | |
304 | mon->timer.add_event_after(g_conf->mon_mgr_digest_period, digest_callback); | |
305 | } | |
306 | ||
307 | void MgrMonitor::on_active() | |
308 | { | |
309 | if (mon->is_leader()) | |
310 | mon->clog->info() << "mgrmap e" << map.epoch << ": " << map; | |
311 | } | |
312 | ||
313 | void MgrMonitor::get_health( | |
314 | list<pair<health_status_t,string> >& summary, | |
315 | list<pair<health_status_t,string> > *detail, | |
316 | CephContext *cct) const | |
317 | { | |
318 | // start mgr warnings as soon as the mons and osds are all upgraded, | |
319 | // but before the require_luminous osdmap flag is set. this way the | |
320 | // user gets some warning before the osd flag is set and mgr is | |
321 | // actually *required*. | |
322 | if (!mon->monmap->get_required_features().contains_all( | |
323 | ceph::features::mon::FEATURE_LUMINOUS) || | |
324 | !HAVE_FEATURE(mon->osdmon()->osdmap.get_up_osd_features(), | |
325 | SERVER_LUMINOUS)) { | |
326 | return; | |
327 | } | |
328 | ||
329 | if (!map.available) { | |
330 | auto level = HEALTH_WARN; | |
331 | // do not escalate to ERR if they are still upgrading to jewel. | |
332 | if (mon->osdmon()->osdmap.test_flag(CEPH_OSDMAP_REQUIRE_LUMINOUS)) { | |
333 | utime_t now = ceph_clock_now(); | |
334 | if (first_seen_inactive != utime_t() && | |
335 | now - first_seen_inactive > g_conf->mon_mgr_inactive_grace) { | |
336 | level = HEALTH_ERR; | |
337 | } | |
338 | } | |
339 | summary.push_back(make_pair(level, "no active mgr")); | |
340 | } | |
341 | } | |
342 | ||
343 | void MgrMonitor::tick() | |
344 | { | |
345 | if (!is_active() || !mon->is_leader()) | |
346 | return; | |
347 | ||
348 | const utime_t now = ceph_clock_now(); | |
349 | utime_t cutoff = now; | |
350 | cutoff -= g_conf->mon_mgr_beacon_grace; | |
351 | ||
352 | // Populate any missing beacons (i.e. no beacon since MgrMonitor | |
353 | // instantiation) with the current time, so that they will | |
354 | // eventually look laggy if they fail to give us a beacon. | |
355 | if (pending_map.active_gid != 0 | |
356 | && last_beacon.count(pending_map.active_gid) == 0) { | |
357 | last_beacon[pending_map.active_gid] = now; | |
358 | } | |
359 | for (auto s : pending_map.standbys) { | |
360 | if (last_beacon.count(s.first) == 0) { | |
361 | last_beacon[s.first] = now; | |
362 | } | |
363 | } | |
364 | ||
365 | // Cull standbys first so that any remaining standbys | |
366 | // will be eligible to take over from the active if we cull him. | |
367 | std::list<uint64_t> dead_standbys; | |
368 | for (const auto &i : pending_map.standbys) { | |
369 | auto last_beacon_time = last_beacon.at(i.first); | |
370 | if (last_beacon_time < cutoff) { | |
371 | dead_standbys.push_back(i.first); | |
372 | } | |
373 | } | |
374 | ||
375 | bool propose = false; | |
376 | ||
377 | for (auto i : dead_standbys) { | |
378 | dout(4) << "Dropping laggy standby " << i << dendl; | |
379 | drop_standby(i); | |
380 | propose = true; | |
381 | } | |
382 | ||
383 | if (pending_map.active_gid != 0 | |
384 | && last_beacon.at(pending_map.active_gid) < cutoff) { | |
385 | ||
386 | drop_active(); | |
387 | propose = true; | |
388 | dout(4) << "Dropping active" << pending_map.active_gid << dendl; | |
389 | if (promote_standby()) { | |
390 | dout(4) << "Promoted standby " << pending_map.active_gid << dendl; | |
391 | } else { | |
392 | dout(4) << "Active is laggy but have no standbys to replace it" << dendl; | |
393 | } | |
394 | } else if (pending_map.active_gid == 0) { | |
395 | if (promote_standby()) { | |
396 | dout(4) << "Promoted standby " << pending_map.active_gid << dendl; | |
397 | propose = true; | |
398 | } | |
399 | } | |
400 | ||
401 | if (propose) { | |
402 | propose_pending(); | |
403 | } | |
404 | } | |
405 | ||
406 | bool MgrMonitor::promote_standby() | |
407 | { | |
408 | assert(pending_map.active_gid == 0); | |
409 | if (pending_map.standbys.size()) { | |
410 | // Promote a replacement (arbitrary choice of standby) | |
411 | auto replacement_gid = pending_map.standbys.begin()->first; | |
412 | pending_map.active_gid = replacement_gid; | |
413 | pending_map.active_name = pending_map.standbys.at(replacement_gid).name; | |
414 | pending_map.available = false; | |
415 | pending_map.active_addr = entity_addr_t(); | |
416 | ||
417 | drop_standby(replacement_gid); | |
418 | return true; | |
419 | } else { | |
420 | return false; | |
421 | } | |
422 | } | |
423 | ||
424 | void MgrMonitor::drop_active() | |
425 | { | |
426 | if (last_beacon.count(pending_map.active_gid) > 0) { | |
427 | last_beacon.erase(pending_map.active_gid); | |
428 | } | |
429 | ||
430 | pending_map.active_name = ""; | |
431 | pending_map.active_gid = 0; | |
432 | pending_map.available = false; | |
433 | pending_map.active_addr = entity_addr_t(); | |
434 | } | |
435 | ||
436 | void MgrMonitor::drop_standby(uint64_t gid) | |
437 | { | |
438 | pending_map.standbys.erase(gid); | |
439 | if (last_beacon.count(gid) > 0) { | |
440 | last_beacon.erase(gid); | |
441 | } | |
442 | ||
443 | } | |
444 | ||
445 | bool MgrMonitor::preprocess_command(MonOpRequestRef op) | |
446 | { | |
447 | return false; | |
448 | ||
449 | } | |
450 | ||
451 | bool MgrMonitor::prepare_command(MonOpRequestRef op) | |
452 | { | |
453 | MMonCommand *m = static_cast<MMonCommand*>(op->get_req()); | |
454 | ||
455 | std::stringstream ss; | |
456 | bufferlist rdata; | |
457 | ||
458 | std::map<std::string, cmd_vartype> cmdmap; | |
459 | if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { | |
460 | string rs = ss.str(); | |
461 | mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed()); | |
462 | return true; | |
463 | } | |
464 | ||
465 | MonSession *session = m->get_session(); | |
466 | if (!session) { | |
467 | mon->reply_command(op, -EACCES, "access denied", rdata, get_last_committed()); | |
468 | return true; | |
469 | } | |
470 | ||
471 | string prefix; | |
472 | cmd_getval(g_ceph_context, cmdmap, "prefix", prefix); | |
473 | ||
474 | int r = 0; | |
475 | ||
476 | if (prefix == "mgr fail") { | |
477 | string who; | |
478 | cmd_getval(g_ceph_context, cmdmap, "who", who); | |
479 | ||
480 | std::string err; | |
481 | uint64_t gid = strict_strtol(who.c_str(), 10, &err); | |
482 | bool changed = false; | |
483 | if (!err.empty()) { | |
484 | // Does not parse as a gid, treat it as a name | |
485 | if (pending_map.active_name == who) { | |
486 | drop_active(); | |
487 | changed = true; | |
488 | } else { | |
489 | gid = 0; | |
490 | for (const auto &i : pending_map.standbys) { | |
491 | if (i.second.name == who) { | |
492 | gid = i.first; | |
493 | break; | |
494 | } | |
495 | } | |
496 | if (gid != 0) { | |
497 | drop_standby(gid); | |
498 | changed = true; | |
499 | } else { | |
500 | ss << "Daemon not found '" << who << "', already failed?"; | |
501 | } | |
502 | } | |
503 | } else { | |
504 | if (pending_map.active_gid == gid) { | |
505 | drop_active(); | |
506 | changed = true; | |
507 | } else if (pending_map.standbys.count(gid) > 0) { | |
508 | drop_standby(gid); | |
509 | changed = true; | |
510 | } else { | |
511 | ss << "Daemon not found '" << gid << "', already failed?"; | |
512 | } | |
513 | } | |
514 | ||
515 | if (changed && pending_map.active_gid == 0) { | |
516 | promote_standby(); | |
517 | } | |
518 | } else { | |
519 | r = -ENOSYS; | |
520 | } | |
521 | ||
522 | dout(4) << __func__ << " done, r=" << r << dendl; | |
523 | /* Compose response */ | |
524 | string rs; | |
525 | getline(ss, rs); | |
526 | ||
527 | if (r >= 0) { | |
528 | // success.. delay reply | |
529 | wait_for_finished_proposal(op, new Monitor::C_Command(mon, op, r, rs, | |
530 | get_last_committed() + 1)); | |
531 | return true; | |
532 | } else { | |
533 | // reply immediately | |
534 | mon->reply_command(op, r, rs, rdata, get_last_committed()); | |
535 | return false; | |
536 | } | |
537 | } | |
538 | ||
539 | void MgrMonitor::init() | |
540 | { | |
541 | if (digest_callback == nullptr) { | |
542 | send_digests(); // To get it to schedule its own event | |
543 | } | |
544 | } | |
545 | ||
546 | void MgrMonitor::on_shutdown() | |
547 | { | |
548 | if (digest_callback) { | |
549 | mon->timer.cancel_event(digest_callback); | |
550 | } | |
551 | } | |
552 |