]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MonmapMonitor.cc
import ceph pacific 16.2.5
[ceph.git] / ceph / src / mon / MonmapMonitor.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2009 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "MonmapMonitor.h"
16 #include "Monitor.h"
17 #include "OSDMonitor.h"
18 #include "messages/MMonCommand.h"
19 #include "messages/MMonJoin.h"
20
21 #include "common/ceph_argparse.h"
22 #include "common/errno.h"
23 #include <sstream>
24 #include "common/config.h"
25 #include "common/cmdparse.h"
26
27 #include "include/ceph_assert.h"
28 #include "include/stringify.h"
29
30 #define dout_subsys ceph_subsys_mon
31 #undef dout_prefix
32 #define dout_prefix _prefix(_dout, mon)
33 using namespace TOPNSPC::common;
34
35 using std::cout;
36 using std::dec;
37 using std::hex;
38 using std::list;
39 using std::map;
40 using std::make_pair;
41 using std::ostream;
42 using std::ostringstream;
43 using std::pair;
44 using std::set;
45 using std::setfill;
46 using std::string;
47 using std::stringstream;
48 using std::to_string;
49 using std::vector;
50 using std::unique_ptr;
51
52 using ceph::bufferlist;
53 using ceph::decode;
54 using ceph::encode;
55 using ceph::Formatter;
56 using ceph::JSONFormatter;
57 using ceph::make_message;
58 using ceph::mono_clock;
59 using ceph::mono_time;
60 using ceph::timespan_str;
61 static ostream& _prefix(std::ostream *_dout, Monitor &mon) {
62 return *_dout << "mon." << mon.name << "@" << mon.rank
63 << "(" << mon.get_state_name()
64 << ").monmap v" << mon.monmap->epoch << " ";
65 }
66
67 void MonmapMonitor::create_initial()
68 {
69 dout(10) << __func__ << " using current monmap" << dendl;
70 pending_map = *mon.monmap;
71 pending_map.epoch = 1;
72
73 if (g_conf()->mon_debug_no_initial_persistent_features) {
74 derr << __func__ << " mon_debug_no_initial_persistent_features=true"
75 << dendl;
76 } else {
77 // initialize with default persistent features for new clusters
78 pending_map.persistent_features = ceph::features::mon::get_persistent();
79 pending_map.min_mon_release = ceph_release();
80 }
81 }
82
83 void MonmapMonitor::update_from_paxos(bool *need_bootstrap)
84 {
85 version_t version = get_last_committed();
86 if (version <= mon.monmap->get_epoch())
87 return;
88
89 dout(10) << __func__ << " version " << version
90 << ", my v " << mon.monmap->epoch << dendl;
91
92 if (need_bootstrap && version != mon.monmap->get_epoch()) {
93 dout(10) << " signaling that we need a bootstrap" << dendl;
94 *need_bootstrap = true;
95 }
96
97 // read and decode
98 monmap_bl.clear();
99 int ret = get_version(version, monmap_bl);
100 ceph_assert(ret == 0);
101 ceph_assert(monmap_bl.length());
102
103 dout(10) << __func__ << " got " << version << dendl;
104 mon.monmap->decode(monmap_bl);
105
106 if (mon.store->exists("mkfs", "monmap")) {
107 auto t(std::make_shared<MonitorDBStore::Transaction>());
108 t->erase("mkfs", "monmap");
109 mon.store->apply_transaction(t);
110 }
111
112 check_subs();
113
114 // make sure we've recorded min_mon_release
115 string val;
116 if (mon.store->read_meta("min_mon_release", &val) < 0 ||
117 val.size() == 0 ||
118 atoi(val.c_str()) != (int)ceph_release()) {
119 dout(10) << __func__ << " updating min_mon_release meta" << dendl;
120 mon.store->write_meta("min_mon_release",
121 stringify(ceph_release()));
122 }
123
124 mon.notify_new_monmap(true);
125 }
126
127 void MonmapMonitor::create_pending()
128 {
129 pending_map = *mon.monmap;
130 pending_map.epoch++;
131 pending_map.last_changed = ceph_clock_now();
132 dout(10) << __func__ << " monmap epoch " << pending_map.epoch << dendl;
133 }
134
135 void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t)
136 {
137 dout(10) << __func__ << " epoch " << pending_map.epoch << dendl;
138
139 ceph_assert(mon.monmap->epoch + 1 == pending_map.epoch ||
140 pending_map.epoch == 1); // special case mkfs!
141 bufferlist bl;
142 pending_map.encode(bl, mon.get_quorum_con_features());
143
144 put_version(t, pending_map.epoch, bl);
145 put_last_committed(t, pending_map.epoch);
146
147 // generate a cluster fingerprint, too?
148 if (pending_map.epoch == 1) {
149 mon.prepare_new_fingerprint(t);
150 }
151
152 //health
153 health_check_map_t next;
154 pending_map.check_health(&next);
155 encode_health(next, t);
156 }
157
158 class C_ApplyFeatures : public Context {
159 MonmapMonitor *svc;
160 mon_feature_t features;
161 ceph_release_t min_mon_release;
162 public:
163 C_ApplyFeatures(MonmapMonitor *s, const mon_feature_t& f, ceph_release_t mmr) :
164 svc(s), features(f), min_mon_release(mmr) { }
165 void finish(int r) override {
166 if (r >= 0) {
167 svc->apply_mon_features(features, min_mon_release);
168 } else if (r == -EAGAIN || r == -ECANCELED) {
169 // discard features if we're no longer on the quorum that
170 // established them in the first place.
171 return;
172 } else {
173 ceph_abort_msg("bad C_ApplyFeatures return value");
174 }
175 }
176 };
177
178 void MonmapMonitor::apply_mon_features(const mon_feature_t& features,
179 ceph_release_t min_mon_release)
180 {
181 if (!is_writeable()) {
182 dout(5) << __func__ << " wait for service to be writeable" << dendl;
183 wait_for_writeable_ctx(new C_ApplyFeatures(this, features, min_mon_release));
184 return;
185 }
186
187 // do nothing here unless we have a full quorum
188 if (mon.get_quorum().size() < mon.monmap->size()) {
189 return;
190 }
191
192 ceph_assert(is_writeable());
193 ceph_assert(features.contains_all(pending_map.persistent_features));
194 // we should never hit this because `features` should be the result
195 // of the quorum's supported features. But if it happens, die.
196 ceph_assert(ceph::features::mon::get_supported().contains_all(features));
197
198 mon_feature_t new_features =
199 (pending_map.persistent_features ^
200 (features & ceph::features::mon::get_persistent()));
201
202 if (new_features.empty() &&
203 pending_map.min_mon_release == min_mon_release) {
204 dout(10) << __func__ << " min_mon_release (" << (int)min_mon_release
205 << ") and features (" << features << ") match" << dendl;
206 return;
207 }
208
209 if (!new_features.empty()) {
210 dout(1) << __func__ << " applying new features "
211 << new_features << ", had " << pending_map.persistent_features
212 << ", will have "
213 << (new_features | pending_map.persistent_features)
214 << dendl;
215 pending_map.persistent_features |= new_features;
216 }
217 if (min_mon_release > pending_map.min_mon_release) {
218 dout(1) << __func__ << " increasing min_mon_release to "
219 << to_integer<int>(min_mon_release) << " (" << min_mon_release
220 << ")" << dendl;
221 pending_map.min_mon_release = min_mon_release;
222 }
223
224 propose_pending();
225 }
226
227 void MonmapMonitor::on_active()
228 {
229 if (get_last_committed() >= 1 && !mon.has_ever_joined) {
230 // make note of the fact that i was, once, part of the quorum.
231 dout(10) << "noting that i was, once, part of an active quorum." << dendl;
232
233 /* This is some form of nasty in-breeding we have between the MonmapMonitor
234 and the Monitor itself. We should find a way to get rid of it given our
235 new architecture. Until then, stick with it since we are a
236 single-threaded process and, truth be told, no one else relies on this
237 thing besides us.
238 */
239 auto t(std::make_shared<MonitorDBStore::Transaction>());
240 t->put(Monitor::MONITOR_NAME, "joined", 1);
241 mon.store->apply_transaction(t);
242 mon.has_ever_joined = true;
243 }
244
245 if (mon.is_leader()) {
246 mon.clog->debug() << "monmap " << *mon.monmap;
247 }
248
249 apply_mon_features(mon.get_quorum_mon_features(),
250 mon.quorum_min_mon_release);
251 }
252
253 bool MonmapMonitor::preprocess_query(MonOpRequestRef op)
254 {
255 auto m = op->get_req<PaxosServiceMessage>();
256 switch (m->get_type()) {
257 // READs
258 case MSG_MON_COMMAND:
259 try {
260 return preprocess_command(op);
261 }
262 catch (const bad_cmd_get& e) {
263 bufferlist bl;
264 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
265 return true;
266 }
267 case MSG_MON_JOIN:
268 return preprocess_join(op);
269 default:
270 ceph_abort();
271 return true;
272 }
273 }
274
275 void MonmapMonitor::dump_info(Formatter *f)
276 {
277 f->dump_unsigned("monmap_first_committed", get_first_committed());
278 f->dump_unsigned("monmap_last_committed", get_last_committed());
279 f->open_object_section("monmap");
280 mon.monmap->dump(f);
281 f->close_section();
282 f->open_array_section("quorum");
283 for (set<int>::iterator q = mon.get_quorum().begin(); q != mon.get_quorum().end(); ++q)
284 f->dump_int("mon", *q);
285 f->close_section();
286 }
287
288 bool MonmapMonitor::preprocess_command(MonOpRequestRef op)
289 {
290 auto m = op->get_req<MMonCommand>();
291 int r = -1;
292 bufferlist rdata;
293 stringstream ss;
294
295 cmdmap_t cmdmap;
296 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
297 string rs = ss.str();
298 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
299 return true;
300 }
301
302 string prefix;
303 cmd_getval(cmdmap, "prefix", prefix);
304
305 MonSession *session = op->get_session();
306 if (!session) {
307 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
308 return true;
309 }
310
311 string format;
312 cmd_getval(cmdmap, "format", format, string("plain"));
313 boost::scoped_ptr<Formatter> f(Formatter::create(format));
314
315 if (prefix == "mon stat") {
316 if (f) {
317 f->open_object_section("monmap");
318 mon.monmap->dump_summary(f.get());
319 f->dump_string("leader", mon.get_leader_name());
320 f->open_array_section("quorum");
321 for (auto rank: mon.get_quorum()) {
322 std::string name = mon.monmap->get_name(rank);
323 f->open_object_section("mon");
324 f->dump_int("rank", rank);
325 f->dump_string("name", name);
326 f->close_section(); // mon
327 }
328 f->close_section(); // quorum
329 f->close_section(); // monmap
330 f->flush(ss);
331 } else {
332 mon.monmap->print_summary(ss);
333 ss << ", election epoch " << mon.get_epoch() << ", leader "
334 << mon.get_leader() << " " << mon.get_leader_name()
335 << ", quorum " << mon.get_quorum()
336 << " " << mon.get_quorum_names();
337 }
338
339 rdata.append(ss);
340 ss.str("");
341 r = 0;
342
343 } else if (prefix == "mon getmap" ||
344 prefix == "mon dump") {
345
346 epoch_t epoch;
347 int64_t epochnum;
348 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)0);
349 epoch = epochnum;
350
351 MonMap *p = mon.monmap;
352 if (epoch) {
353 bufferlist bl;
354 r = get_version(epoch, bl);
355 if (r == -ENOENT) {
356 ss << "there is no map for epoch " << epoch;
357 goto reply;
358 }
359 ceph_assert(r == 0);
360 ceph_assert(bl.length() > 0);
361 p = new MonMap;
362 p->decode(bl);
363 }
364
365 ceph_assert(p);
366
367 if (prefix == "mon getmap") {
368 p->encode(rdata, m->get_connection()->get_features());
369 r = 0;
370 ss << "got monmap epoch " << p->get_epoch();
371 } else if (prefix == "mon dump") {
372 stringstream ds;
373 if (f) {
374 f->open_object_section("monmap");
375 p->dump(f.get());
376 f->open_array_section("quorum");
377 for (set<int>::iterator q = mon.get_quorum().begin();
378 q != mon.get_quorum().end(); ++q) {
379 f->dump_int("mon", *q);
380 }
381 f->close_section();
382 f->close_section();
383 f->flush(ds);
384 r = 0;
385 } else {
386 p->print(ds);
387 r = 0;
388 }
389 rdata.append(ds);
390 ss << "dumped monmap epoch " << p->get_epoch();
391 }
392 if (p != mon.monmap) {
393 delete p;
394 p = nullptr;
395 }
396
397 } else if (prefix == "mon feature ls") {
398
399 bool list_with_value = false;
400 string with_value;
401 if (cmd_getval(cmdmap, "with_value", with_value) &&
402 with_value == "--with-value") {
403 list_with_value = true;
404 }
405
406 MonMap *p = mon.monmap;
407
408 // list features
409 mon_feature_t supported = ceph::features::mon::get_supported();
410 mon_feature_t persistent = ceph::features::mon::get_persistent();
411 mon_feature_t required = p->get_required_features();
412
413 stringstream ds;
414 auto print_feature = [&](mon_feature_t& m_features, const char* m_str) {
415 if (f) {
416 if (list_with_value)
417 m_features.dump_with_value(f.get(), m_str);
418 else
419 m_features.dump(f.get(), m_str);
420 } else {
421 if (list_with_value)
422 m_features.print_with_value(ds);
423 else
424 m_features.print(ds);
425 }
426 };
427
428 if (f) {
429 f->open_object_section("features");
430
431 f->open_object_section("all");
432 print_feature(supported, "supported");
433 print_feature(persistent, "persistent");
434 f->close_section(); // all
435
436 f->open_object_section("monmap");
437 print_feature(p->persistent_features, "persistent");
438 print_feature(p->optional_features, "optional");
439 print_feature(required, "required");
440 f->close_section(); // monmap
441
442 f->close_section(); // features
443 f->flush(ds);
444
445 } else {
446 ds << "all features" << std::endl
447 << "\tsupported: ";
448 print_feature(supported, nullptr);
449 ds << std::endl
450 << "\tpersistent: ";
451 print_feature(persistent, nullptr);
452 ds << std::endl
453 << std::endl;
454
455 ds << "on current monmap (epoch "
456 << p->get_epoch() << ")" << std::endl
457 << "\tpersistent: ";
458 print_feature(p->persistent_features, nullptr);
459 ds << std::endl
460 // omit optional features in plain-text
461 // makes it easier to read, and they're, currently, empty.
462 << "\trequired: ";
463 print_feature(required, nullptr);
464 ds << std::endl;
465 }
466 rdata.append(ds);
467 r = 0;
468 }
469
470 reply:
471 if (r != -1) {
472 string rs;
473 getline(ss, rs);
474
475 mon.reply_command(op, r, rs, rdata, get_last_committed());
476 return true;
477 } else
478 return false;
479 }
480
481
482 bool MonmapMonitor::prepare_update(MonOpRequestRef op)
483 {
484 auto m = op->get_req<PaxosServiceMessage>();
485 dout(7) << __func__ << " " << *m << " from " << m->get_orig_source_inst() << dendl;
486
487 switch (m->get_type()) {
488 case MSG_MON_COMMAND:
489 try {
490 return prepare_command(op);
491 } catch (const bad_cmd_get& e) {
492 bufferlist bl;
493 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
494 return true;
495 }
496 case MSG_MON_JOIN:
497 return prepare_join(op);
498 default:
499 ceph_abort();
500 }
501
502 return false;
503 }
504
505 bool MonmapMonitor::prepare_command(MonOpRequestRef op)
506 {
507 auto m = op->get_req<MMonCommand>();
508 stringstream ss;
509 string rs;
510 int err = -EINVAL;
511
512 cmdmap_t cmdmap;
513 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
514 string rs = ss.str();
515 mon.reply_command(op, -EINVAL, rs, get_last_committed());
516 return true;
517 }
518
519 string prefix;
520 cmd_getval(cmdmap, "prefix", prefix);
521
522 MonSession *session = op->get_session();
523 if (!session) {
524 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
525 return true;
526 }
527
528 /* We should follow the following rules:
529 *
530 * - 'monmap' is the current, consistent version of the monmap
531 * - 'pending_map' is the uncommitted version of the monmap
532 *
533 * All checks for the current state must be made against 'monmap'.
534 * All changes are made against 'pending_map'.
535 *
536 * If there are concurrent operations modifying 'pending_map', please
537 * follow the following rules.
538 *
539 * - if pending_map has already been changed, the second operation must
540 * wait for the proposal to finish and be run again; This is the easiest
541 * path to guarantee correctness but may impact performance (i.e., it
542 * will take longer for the user to get a reply).
543 *
544 * - if the result of the second operation can be guaranteed to be
545 * idempotent, the operation may reply to the user once the proposal
546 * finishes; still needs to wait for the proposal to finish.
547 *
548 * - An operation _NEVER_ returns to the user based on pending state.
549 *
550 * If an operation does not modify current stable monmap, it may be
551 * serialized before current pending map, regardless of any change that
552 * has been made to the pending map -- remember, pending is uncommitted
553 * state, thus we are not bound by it.
554 */
555
556 ceph_assert(mon.monmap);
557 MonMap &monmap = *mon.monmap;
558
559
560 /* Please note:
561 *
562 * Adding or removing monitors may lead to loss of quorum.
563 *
564 * Because quorum may be lost, it's important to reply something
565 * to the user, lest she end up waiting forever for a reply. And
566 * no reply will ever be sent until quorum is formed again.
567 *
568 * On the other hand, this means we're leaking uncommitted state
569 * to the user. As such, please be mindful of the reply message.
570 *
571 * e.g., 'adding monitor mon.foo' is okay ('adding' is an on-going
572 * operation and conveys its not-yet-permanent nature); whereas
573 * 'added monitor mon.foo' presumes the action has successfully
574 * completed and state has been committed, which may not be true.
575 */
576
577
578 bool propose = false;
579 if (prefix == "mon add") {
580 string name;
581 cmd_getval(cmdmap, "name", name);
582 string addrstr;
583 cmd_getval(cmdmap, "addr", addrstr);
584 entity_addr_t addr;
585 bufferlist rdata;
586
587 if (!addr.parse(addrstr.c_str())) {
588 err = -EINVAL;
589 ss << "addr " << addrstr << "does not parse";
590 goto reply;
591 }
592
593 vector<string> locationvec;
594 map<string, string> loc;
595 cmd_getval(cmdmap, "location", locationvec);
596 CrushWrapper::parse_loc_map(locationvec, &loc);
597 if (locationvec.size() &&
598 !mon.get_quorum_mon_features().contains_all(
599 ceph::features::mon::FEATURE_PINGING)) {
600 err = -ENOTSUP;
601 ss << "Not all monitors support adding monitors with a location; please upgrade first!";
602 goto reply;
603 }
604 if (locationvec.size() && !loc.size()) {
605 ss << "We could not parse your input location to anything real; " << locationvec
606 << " turned into an empty map!";
607 err = -EINVAL;
608 goto reply;
609 }
610
611 dout(10) << "mon add setting location for " << name << " to " << loc << dendl;
612
613 // TODO: validate location in crush map
614 if (monmap.stretch_mode_enabled && !loc.size()) {
615 ss << "We are in stretch mode and new monitors must have a location, but "
616 << "could not parse your input location to anything real; " << locationvec
617 << " turned into an empty map!";
618 err = -EINVAL;
619 goto reply;
620 }
621 // TODO: validate location against any existing stretch config
622
623 entity_addrvec_t addrs;
624 if (monmap.persistent_features.contains_all(
625 ceph::features::mon::FEATURE_NAUTILUS)) {
626 if (addr.get_port() == CEPH_MON_PORT_IANA) {
627 addr.set_type(entity_addr_t::TYPE_MSGR2);
628 }
629 if (addr.get_port() == CEPH_MON_PORT_LEGACY) {
630 // if they specified the *old* default they probably don't care
631 addr.set_port(0);
632 }
633 if (addr.get_port()) {
634 addrs.v.push_back(addr);
635 } else {
636 addr.set_type(entity_addr_t::TYPE_MSGR2);
637 addr.set_port(CEPH_MON_PORT_IANA);
638 addrs.v.push_back(addr);
639 addr.set_type(entity_addr_t::TYPE_LEGACY);
640 addr.set_port(CEPH_MON_PORT_LEGACY);
641 addrs.v.push_back(addr);
642 }
643 } else {
644 if (addr.get_port() == 0) {
645 addr.set_port(CEPH_MON_PORT_LEGACY);
646 }
647 addr.set_type(entity_addr_t::TYPE_LEGACY);
648 addrs.v.push_back(addr);
649 }
650 dout(20) << __func__ << " addr " << addr << " -> addrs " << addrs << dendl;
651
652 /**
653 * If we have a monitor with the same name and different addr, then EEXIST
654 * If we have a monitor with the same addr and different name, then EEXIST
655 * If we have a monitor with the same addr and same name, then wait for
656 * the proposal to finish and return success.
657 * If we don't have the monitor, add it.
658 */
659
660 err = 0;
661 if (!ss.str().empty())
662 ss << "; ";
663
664 do {
665 if (monmap.contains(name)) {
666 if (monmap.get_addrs(name) == addrs) {
667 // stable map contains monitor with the same name at the same address.
668 // serialize before current pending map.
669 err = 0; // for clarity; this has already been set above.
670 ss << "mon." << name << " at " << addrs << " already exists";
671 goto reply;
672 } else {
673 ss << "mon." << name
674 << " already exists at address " << monmap.get_addrs(name);
675 }
676 } else if (monmap.contains(addrs)) {
677 // we established on the previous branch that name is different
678 ss << "mon." << monmap.get_name(addrs)
679 << " already exists at address " << addr;
680 } else {
681 // go ahead and add
682 break;
683 }
684 err = -EEXIST;
685 goto reply;
686 } while (false);
687
688 if (pending_map.stretch_mode_enabled) {
689
690 }
691
692 /* Given there's no delay between proposals on the MonmapMonitor (see
693 * MonmapMonitor::should_propose()), there is no point in checking for
694 * a mismatch between name and addr on pending_map.
695 *
696 * Once we established the monitor does not exist in the committed state,
697 * we can simply go ahead and add the monitor.
698 */
699
700 pending_map.add(name, addrs);
701 pending_map.mon_info[name].crush_loc = loc;
702 pending_map.last_changed = ceph_clock_now();
703 ss << "adding mon." << name << " at " << addrs;
704 propose = true;
705 dout(0) << __func__ << " proposing new mon." << name << dendl;
706
707 } else if (prefix == "mon remove" ||
708 prefix == "mon rm") {
709 string name;
710 cmd_getval(cmdmap, "name", name);
711 if (!monmap.contains(name)) {
712 err = 0;
713 ss << "mon." << name << " does not exist or has already been removed";
714 goto reply;
715 }
716
717 if (monmap.size() == 1) {
718 err = -EINVAL;
719 ss << "error: refusing removal of last monitor " << name;
720 goto reply;
721 }
722
723 /* At the time of writing, there is no risk of races when multiple clients
724 * attempt to use the same name. The reason is simple but may not be
725 * obvious.
726 *
727 * In a nutshell, we do not collate proposals on the MonmapMonitor. As
728 * soon as we return 'true' below, PaxosService::dispatch() will check if
729 * the service should propose, and - if so - the service will be marked as
730 * 'proposing' and a proposal will be triggered. The PaxosService class
731 * guarantees that once a service is marked 'proposing' no further writes
732 * will be handled.
733 *
734 * The decision on whether the service should propose or not is, in this
735 * case, made by MonmapMonitor::should_propose(), which always considers
736 * the proposal delay being 0.0 seconds. This is key for PaxosService to
737 * trigger the proposal immediately.
738 * 0.0 seconds of delay.
739 *
740 * From the above, there's no point in performing further checks on the
741 * pending_map, as we don't ever have multiple proposals in-flight in
742 * this service. As we've established the committed state contains the
743 * monitor, we can simply go ahead and remove it.
744 *
745 * Please note that the code hinges on all of the above to be true. It
746 * has been true since time immemorial and we don't see a good reason
747 * to make it sturdier at this time - mainly because we don't think it's
748 * going to change any time soon, lest for any bug that may be unwillingly
749 * introduced.
750 */
751
752 entity_addrvec_t addrs = pending_map.get_addrs(name);
753 pending_map.remove(name);
754 pending_map.last_changed = ceph_clock_now();
755 ss << "removing mon." << name << " at " << addrs
756 << ", there will be " << pending_map.size() << " monitors" ;
757 propose = true;
758 err = 0;
759
760 } else if (prefix == "mon feature set") {
761
762 /* PLEASE NOTE:
763 *
764 * We currently only support setting/unsetting persistent features.
765 * This is by design, given at the moment we still don't have optional
766 * features, and, as such, there is no point introducing an interface
767 * to manipulate them. This allows us to provide a cleaner, more
768 * intuitive interface to the user, modifying solely persistent
769 * features.
770 *
771 * In the future we should consider adding another interface to handle
772 * optional features/flags; e.g., 'mon feature flag set/unset', or
773 * 'mon flag set/unset'.
774 */
775 string feature_name;
776 if (!cmd_getval(cmdmap, "feature_name", feature_name)) {
777 ss << "missing required feature name";
778 err = -EINVAL;
779 goto reply;
780 }
781
782 mon_feature_t feature;
783 feature = ceph::features::mon::get_feature_by_name(feature_name);
784 if (feature == ceph::features::mon::FEATURE_NONE) {
785 ss << "unknown feature '" << feature_name << "'";
786 err = -ENOENT;
787 goto reply;
788 }
789
790 bool sure = false;
791 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
792 if (!sure) {
793 ss << "please specify '--yes-i-really-mean-it' if you "
794 << "really, **really** want to set feature '"
795 << feature << "' in the monmap.";
796 err = -EPERM;
797 goto reply;
798 }
799
800 if (!mon.get_quorum_mon_features().contains_all(feature)) {
801 ss << "current quorum does not support feature '" << feature
802 << "'; supported features: "
803 << mon.get_quorum_mon_features();
804 err = -EINVAL;
805 goto reply;
806 }
807
808 ss << "setting feature '" << feature << "'";
809
810 err = 0;
811 if (monmap.persistent_features.contains_all(feature)) {
812 dout(10) << __func__ << " feature '" << feature
813 << "' already set on monmap; no-op." << dendl;
814 goto reply;
815 }
816
817 pending_map.persistent_features.set_feature(feature);
818 pending_map.last_changed = ceph_clock_now();
819 propose = true;
820
821 dout(1) << __func__ << " " << ss.str() << "; new features will be: "
822 << "persistent = " << pending_map.persistent_features
823 // output optional nevertheless, for auditing purposes.
824 << ", optional = " << pending_map.optional_features << dendl;
825
826 } else if (prefix == "mon set-rank") {
827 string name;
828 int64_t rank;
829 if (!cmd_getval(cmdmap, "name", name) ||
830 !cmd_getval(cmdmap, "rank", rank)) {
831 err = -EINVAL;
832 goto reply;
833 }
834 int oldrank = pending_map.get_rank(name);
835 if (oldrank < 0) {
836 ss << "mon." << name << " does not exist in monmap";
837 err = -ENOENT;
838 goto reply;
839 }
840 err = 0;
841 pending_map.set_rank(name, rank);
842 pending_map.last_changed = ceph_clock_now();
843 propose = true;
844 } else if (prefix == "mon set-addrs") {
845 string name;
846 string addrs;
847 if (!cmd_getval(cmdmap, "name", name) ||
848 !cmd_getval(cmdmap, "addrs", addrs)) {
849 err = -EINVAL;
850 goto reply;
851 }
852 if (!pending_map.contains(name)) {
853 ss << "mon." << name << " does not exist";
854 err = -ENOENT;
855 goto reply;
856 }
857 entity_addrvec_t av;
858 if (!av.parse(addrs.c_str(), nullptr)) {
859 ss << "failed to parse addrs '" << addrs << "'";
860 err = -EINVAL;
861 goto reply;
862 }
863 for (auto& a : av.v) {
864 a.set_nonce(0);
865 if (!a.get_port()) {
866 ss << "monitor must bind to a non-zero port, not " << a;
867 err = -EINVAL;
868 goto reply;
869 }
870 }
871 err = 0;
872 pending_map.set_addrvec(name, av);
873 pending_map.last_changed = ceph_clock_now();
874 propose = true;
875 } else if (prefix == "mon set-weight") {
876 string name;
877 int64_t weight;
878 if (!cmd_getval(cmdmap, "name", name) ||
879 !cmd_getval(cmdmap, "weight", weight)) {
880 err = -EINVAL;
881 goto reply;
882 }
883 if (!pending_map.contains(name)) {
884 ss << "mon." << name << " does not exist";
885 err = -ENOENT;
886 goto reply;
887 }
888 err = 0;
889 pending_map.set_weight(name, weight);
890 pending_map.last_changed = ceph_clock_now();
891 propose = true;
892 } else if (prefix == "mon enable-msgr2") {
893 if (!monmap.get_required_features().contains_all(
894 ceph::features::mon::FEATURE_NAUTILUS)) {
895 err = -EACCES;
896 ss << "all monitors must be running nautilus to enable v2";
897 goto reply;
898 }
899 for (auto& i : pending_map.mon_info) {
900 if (i.second.public_addrs.v.size() == 1 &&
901 i.second.public_addrs.front().is_legacy() &&
902 i.second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) {
903 entity_addrvec_t av;
904 entity_addr_t a = i.second.public_addrs.front();
905 a.set_type(entity_addr_t::TYPE_MSGR2);
906 a.set_port(CEPH_MON_PORT_IANA);
907 av.v.push_back(a);
908 av.v.push_back(i.second.public_addrs.front());
909 dout(10) << " setting mon." << i.first
910 << " addrs " << i.second.public_addrs
911 << " -> " << av << dendl;
912 pending_map.set_addrvec(i.first, av);
913 propose = true;
914 pending_map.last_changed = ceph_clock_now();
915 }
916 }
917 err = 0;
918 } else if (prefix == "mon set election_strategy") {
919 if (!mon.get_quorum_mon_features().contains_all(
920 ceph::features::mon::FEATURE_PINGING)) {
921 err = -ENOTSUP;
922 ss << "Not all monitors support changing election strategies; please upgrade first!";
923 goto reply;
924 }
925 string strat;
926 MonMap::election_strategy strategy;
927 if (!cmd_getval(cmdmap, "strategy", strat)) {
928 err = -EINVAL;
929 goto reply;
930 }
931 if (strat == "classic") {
932 strategy = MonMap::CLASSIC;
933 } else if (strat == "disallow") {
934 strategy = MonMap::DISALLOW;
935 } else if (strat == "connectivity") {
936 strategy = MonMap::CONNECTIVITY;
937 } else {
938 err = -EINVAL;
939 goto reply;
940 }
941 err = 0;
942 pending_map.strategy = strategy;
943 propose = true;
944 } else if (prefix == "mon add disallowed_leader") {
945 if (!mon.get_quorum_mon_features().contains_all(
946 ceph::features::mon::FEATURE_PINGING)) {
947 err = -ENOTSUP;
948 ss << "Not all monitors support changing election strategies; please upgrade first!";
949 goto reply;
950 }
951 string name;
952 if (!cmd_getval(cmdmap, "name", name)) {
953 err = -EINVAL;
954 goto reply;
955 }
956 if (pending_map.strategy != MonMap::DISALLOW &&
957 pending_map.strategy != MonMap::CONNECTIVITY) {
958 ss << "You cannot disallow monitors in your current election mode";
959 err = -EINVAL;
960 goto reply;
961 }
962 if (!pending_map.contains(name)) {
963 ss << "mon." << name << " does not exist";
964 err = -ENOENT;
965 goto reply;
966 }
967 if (pending_map.disallowed_leaders.count(name)) {
968 ss << "mon." << name << " is already disallowed";
969 err = 0;
970 goto reply;
971 }
972 if (pending_map.disallowed_leaders.size() == pending_map.size() - 1) {
973 ss << "mon." << name << " is the only remaining allowed leader!";
974 err = -EINVAL;
975 goto reply;
976 }
977 pending_map.disallowed_leaders.insert(name);
978 err = 0;
979 propose = true;
980 } else if (prefix == "mon rm disallowed_leader") {
981 if (!mon.get_quorum_mon_features().contains_all(
982 ceph::features::mon::FEATURE_PINGING)) {
983 err = -ENOTSUP;
984 ss << "Not all monitors support changing election strategies; please upgrade first!";
985 goto reply;
986 }
987 string name;
988 if (!cmd_getval(cmdmap, "name", name)) {
989 err = -EINVAL;
990 goto reply;
991 }
992 if (pending_map.strategy != MonMap::DISALLOW &&
993 pending_map.strategy != MonMap::CONNECTIVITY) {
994 ss << "You cannot disallow monitors in your current election mode";
995 err = -EINVAL;
996 goto reply;
997 }
998 if (!pending_map.contains(name)) {
999 ss << "mon." << name << " does not exist";
1000 err = -ENOENT;
1001 goto reply;
1002 }
1003 if (!pending_map.disallowed_leaders.count(name)) {
1004 ss << "mon." << name << " is already allowed";
1005 err = 0;
1006 goto reply;
1007 }
1008 pending_map.disallowed_leaders.erase(name);
1009 err = 0;
1010 propose = true;
1011 } else if (prefix == "mon set_location") {
1012 if (!mon.get_quorum_mon_features().contains_all(
1013 ceph::features::mon::FEATURE_PINGING)) {
1014 err = -ENOTSUP;
1015 ss << "Not all monitors support monitor locations; please upgrade first!";
1016 goto reply;
1017 }
1018 string name;
1019 if (!cmd_getval(cmdmap, "name", name)) {
1020 err = -EINVAL;
1021 goto reply;
1022 }
1023 if (!pending_map.contains(name)) {
1024 ss << "mon." << name << " does not exist";
1025 err = -ENOENT;
1026 goto reply;
1027 }
1028
1029 if (!mon.osdmon()->is_readable()) {
1030 mon.osdmon()->wait_for_readable(op, new Monitor::C_RetryMessage(&mon, op));
1031 }
1032 vector<string> argvec;
1033 map<string, string> loc;
1034 cmd_getval(cmdmap, "args", argvec);
1035 CrushWrapper::parse_loc_map(argvec, &loc);
1036
1037 dout(10) << "mon set_location for " << name << " to " << loc << dendl;
1038
1039 // TODO: validate location in crush map
1040 if (!loc.size()) {
1041 ss << "We could not parse your input location to anything real; " << argvec
1042 << " turned into an empty map!";
1043 err = -EINVAL;
1044 goto reply;
1045 }
1046 // TODO: validate location against any existing stretch config
1047 pending_map.mon_info[name].crush_loc = loc;
1048 err = 0;
1049 propose = true;
1050 } else if (prefix == "mon enable_stretch_mode") {
1051 if (!mon.osdmon()->is_writeable()) {
1052 dout(1) << __func__
1053 << ": waiting for osdmon writeable for stretch mode" << dendl;
1054 mon.osdmon()->wait_for_writeable(op, new Monitor::C_RetryMessage(&mon, op));
1055 return false;
1056 }
1057 {
1058 if (monmap.stretch_mode_enabled) {
1059 ss << "stretch mode is already engaged";
1060 err = -EINVAL;
1061 goto reply;
1062 }
1063 if (pending_map.stretch_mode_enabled) {
1064 ss << "stretch mode currently committing";
1065 err = 0;
1066 goto reply;
1067 }
1068 string tiebreaker_mon;
1069 if (!cmd_getval(cmdmap, "tiebreaker_mon", tiebreaker_mon)) {
1070 ss << "must specify a tiebreaker monitor";
1071 err = -EINVAL;
1072 goto reply;
1073 }
1074 string new_crush_rule;
1075 if (!cmd_getval(cmdmap, "new_crush_rule", new_crush_rule)) {
1076 ss << "must specify a new crush rule that spreads out copies over multiple sites";
1077 err = -EINVAL;
1078 goto reply;
1079 }
1080 string dividing_bucket;
1081 if (!cmd_getval(cmdmap, "dividing_bucket", dividing_bucket)) {
1082 ss << "must specify a dividing bucket";
1083 err = -EINVAL;
1084 goto reply;
1085 }
1086 //okay, initial arguments make sense, check pools and cluster state
1087 err = mon.osdmon()->check_cluster_features(CEPH_FEATUREMASK_STRETCH_MODE, ss);
1088 if (err)
1089 goto reply;
1090 struct Plugger {
1091 Paxos &p;
1092 Plugger(Paxos &p) : p(p) { p.plug(); }
1093 ~Plugger() { p.unplug(); }
1094 } plugger(paxos);
1095
1096 set<pg_pool_t*> pools;
1097 bool okay = false;
1098 int errcode = 0;
1099
1100 mon.osdmon()->try_enable_stretch_mode_pools(ss, &okay, &errcode,
1101 &pools, new_crush_rule);
1102 if (!okay) {
1103 err = errcode;
1104 goto reply;
1105 }
1106 try_enable_stretch_mode(ss, &okay, &errcode, false,
1107 tiebreaker_mon, dividing_bucket);
1108 if (!okay) {
1109 err = errcode;
1110 goto reply;
1111 }
1112 mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, false,
1113 dividing_bucket, 2, pools, new_crush_rule);
1114 if (!okay) {
1115 err = errcode;
1116 goto reply;
1117 }
1118 // everything looks good, actually commit the changes!
1119 try_enable_stretch_mode(ss, &okay, &errcode, true,
1120 tiebreaker_mon, dividing_bucket);
1121 mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, true,
1122 dividing_bucket,
1123 2, // right now we only support 2 sites
1124 pools, new_crush_rule);
1125 ceph_assert(okay == true);
1126 }
1127 request_proposal(mon.osdmon());
1128 err = 0;
1129 propose = true;
1130 } else {
1131 ss << "unknown command " << prefix;
1132 err = -EINVAL;
1133 }
1134
1135 reply:
1136 getline(ss, rs);
1137 mon.reply_command(op, err, rs, get_last_committed());
1138 // we are returning to the user; do not propose.
1139 return propose;
1140 }
1141
1142 void MonmapMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
1143 int *errcode, bool commit,
1144 const string& tiebreaker_mon,
1145 const string& dividing_bucket)
1146 {
1147 dout(20) << __func__ << dendl;
1148 *okay = false;
1149 if (pending_map.strategy != MonMap::CONNECTIVITY) {
1150 ss << "Monitors must use the connectivity strategy to enable stretch mode";
1151 *errcode = -EINVAL;
1152 ceph_assert(!commit);
1153 return;
1154 }
1155 if (!pending_map.contains(tiebreaker_mon)) {
1156 ss << "mon " << tiebreaker_mon << "does not seem to exist";
1157 *errcode = -ENOENT;
1158 ceph_assert(!commit);
1159 return;
1160 }
1161 map<string,string> buckets;
1162 for (const auto&mii : mon.monmap->mon_info) {
1163 const auto& mi = mii.second;
1164 const auto& bi = mi.crush_loc.find(dividing_bucket);
1165 if (bi == mi.crush_loc.end()) {
1166 ss << "Could not find location entry for " << dividing_bucket
1167 << " on monitor " << mi.name;
1168 *errcode = -EINVAL;
1169 ceph_assert(!commit);
1170 return;
1171 }
1172 buckets[mii.first] = bi->second;
1173 }
1174 string bucket1, bucket2, tiebreaker_bucket;
1175 for (auto& i : buckets) {
1176 if (i.first == tiebreaker_mon) {
1177 tiebreaker_bucket = i.second;
1178 continue;
1179 }
1180 if (bucket1.empty()) {
1181 bucket1 = i.second;
1182 }
1183 if (bucket1 != i.second &&
1184 bucket2.empty()) {
1185 bucket2 = i.second;
1186 }
1187 if (bucket1 != i.second &&
1188 bucket2 != i.second) {
1189 ss << "There are too many monitor buckets for stretch mode, found "
1190 << bucket1 << "," << bucket2 << "," << i.second;
1191 *errcode = -EINVAL;
1192 ceph_assert(!commit);
1193 return;
1194 }
1195 }
1196 if (bucket1.empty() || bucket2.empty()) {
1197 ss << "There are not enough monitor buckets for stretch mode;"
1198 << " must have at least 2 plus the tiebreaker but only found "
1199 << (bucket1.empty() ? bucket1 : bucket2);
1200 *errcode = -EINVAL;
1201 ceph_assert(!commit);
1202 return;
1203 }
1204 if (tiebreaker_bucket == bucket1 ||
1205 tiebreaker_bucket == bucket2) {
1206 ss << "The named tiebreaker monitor " << tiebreaker_mon
1207 << " is in the same CRUSH bucket " << tiebreaker_bucket
1208 << " as other monitors";
1209 *errcode = -EINVAL;
1210 ceph_assert(!commit);
1211 return;
1212 }
1213 if (commit) {
1214 pending_map.disallowed_leaders.insert(tiebreaker_mon);
1215 pending_map.tiebreaker_mon = tiebreaker_mon;
1216 pending_map.stretch_mode_enabled = true;
1217 }
1218 *okay = true;
1219 }
1220
1221 void MonmapMonitor::trigger_degraded_stretch_mode(const set<string>& dead_mons)
1222 {
1223 dout(20) << __func__ << dendl;
1224 pending_map.stretch_marked_down_mons.insert(dead_mons.begin(), dead_mons.end());
1225 propose_pending();
1226 }
1227
1228 void MonmapMonitor::trigger_healthy_stretch_mode()
1229 {
1230 dout(20) << __func__ << dendl;
1231 pending_map.stretch_marked_down_mons.clear();
1232 propose_pending();
1233 }
1234
1235 bool MonmapMonitor::preprocess_join(MonOpRequestRef op)
1236 {
1237 auto join = op->get_req<MMonJoin>();
1238 dout(10) << __func__ << " " << join->name << " at " << join->addrs << dendl;
1239
1240 MonSession *session = op->get_session();
1241 if (!session ||
1242 !session->is_capable("mon", MON_CAP_W | MON_CAP_X)) {
1243 dout(10) << " insufficient caps" << dendl;
1244 return true;
1245 }
1246
1247 const auto name_info_i = pending_map.mon_info.find(join->name);
1248 if (name_info_i != pending_map.mon_info.end() &&
1249 !name_info_i->second.public_addrs.front().is_blank_ip() &&
1250 (!join->force_loc || join->crush_loc == name_info_i->second.crush_loc)) {
1251 dout(10) << " already have " << join->name << dendl;
1252 return true;
1253 }
1254 string addr_name;
1255 if (pending_map.contains(join->addrs)) {
1256 addr_name = pending_map.get_name(join->addrs);
1257 }
1258 if (!addr_name.empty() &&
1259 addr_name == join->name &&
1260 (!join->force_loc || join->crush_loc.empty() ||
1261 pending_map.mon_info[addr_name].crush_loc == join->crush_loc)) {
1262 dout(10) << " already have " << join->addrs << dendl;
1263 return true;
1264 }
1265 if (pending_map.stretch_mode_enabled &&
1266 join->crush_loc.empty() &&
1267 (addr_name.empty() ||
1268 pending_map.mon_info[addr_name].crush_loc.empty())) {
1269 dout(10) << "stretch mode engaged but no source of crush_loc" << dendl;
1270 mon.clog->info() << join->name << " attempted to join from " << join->name
1271 << ' ' << join->addrs
1272 << "; but lacks a crush_location for stretch mode";
1273 return true;
1274 }
1275 return false;
1276 }
1277
1278 bool MonmapMonitor::prepare_join(MonOpRequestRef op)
1279 {
1280 auto join = op->get_req<MMonJoin>();
1281 dout(0) << "adding/updating " << join->name
1282 << " at " << join->addrs << " to monitor cluster" << dendl;
1283 map<string,string> existing_loc;
1284 if (pending_map.contains(join->addrs)) {
1285 string name = pending_map.get_name(join->addrs);
1286 existing_loc = pending_map.mon_info[name].crush_loc;
1287 pending_map.remove(name);
1288 }
1289 if (pending_map.contains(join->name))
1290 pending_map.remove(join->name);
1291 pending_map.add(join->name, join->addrs);
1292 pending_map.mon_info[join->name].crush_loc =
1293 ((join->force_loc || existing_loc.empty()) ?
1294 join->crush_loc : existing_loc);
1295 pending_map.last_changed = ceph_clock_now();
1296 return true;
1297 }
1298
1299 bool MonmapMonitor::should_propose(double& delay)
1300 {
1301 delay = 0.0;
1302 return true;
1303 }
1304
1305 int MonmapMonitor::get_monmap(bufferlist &bl)
1306 {
1307 version_t latest_ver = get_last_committed();
1308 dout(10) << __func__ << " ver " << latest_ver << dendl;
1309
1310 if (!mon.store->exists(get_service_name(), stringify(latest_ver)))
1311 return -ENOENT;
1312
1313 int err = get_version(latest_ver, bl);
1314 if (err < 0) {
1315 dout(1) << __func__ << " error obtaining monmap: "
1316 << cpp_strerror(err) << dendl;
1317 return err;
1318 }
1319 return 0;
1320 }
1321
1322 void MonmapMonitor::check_subs()
1323 {
1324 const string type = "monmap";
1325 mon.with_session_map([this, &type](const MonSessionMap& session_map) {
1326 auto subs = session_map.subs.find(type);
1327 if (subs == session_map.subs.end())
1328 return;
1329 for (auto sub : *subs->second) {
1330 check_sub(sub);
1331 }
1332 });
1333 }
1334
1335 void MonmapMonitor::check_sub(Subscription *sub)
1336 {
1337 const auto epoch = mon.monmap->get_epoch();
1338 dout(10) << __func__
1339 << " monmap next " << sub->next
1340 << " have " << epoch << dendl;
1341 if (sub->next <= epoch) {
1342 mon.send_latest_monmap(sub->session->con.get());
1343 if (sub->onetime) {
1344 mon.with_session_map([sub](MonSessionMap& session_map) {
1345 session_map.remove_sub(sub);
1346 });
1347 } else {
1348 sub->next = epoch + 1;
1349 }
1350 }
1351 }
1352
1353 void MonmapMonitor::tick()
1354 {
1355 if (!is_active() ||
1356 !mon.is_leader()) {
1357 return;
1358 }
1359
1360 if (mon.monmap->created.is_zero()) {
1361 dout(10) << __func__ << " detected empty created stamp" << dendl;
1362 utime_t ctime;
1363 for (version_t v = 1; v <= get_last_committed(); v++) {
1364 bufferlist bl;
1365 int r = get_version(v, bl);
1366 if (r < 0) {
1367 continue;
1368 }
1369 MonMap m;
1370 auto p = bl.cbegin();
1371 decode(m, p);
1372 if (!m.last_changed.is_zero()) {
1373 dout(10) << __func__ << " first monmap with last_changed is "
1374 << v << " with " << m.last_changed << dendl;
1375 ctime = m.last_changed;
1376 break;
1377 }
1378 }
1379 if (ctime.is_zero()) {
1380 ctime = ceph_clock_now();
1381 }
1382 dout(10) << __func__ << " updating created stamp to " << ctime << dendl;
1383 pending_map.created = ctime;
1384 propose_pending();
1385 }
1386 }