]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/MonmapMonitor.cc
buildsys: switch source download to quincy
[ceph.git] / ceph / src / mon / MonmapMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2009 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "MonmapMonitor.h"
16#include "Monitor.h"
f67539c2 17#include "OSDMonitor.h"
7c673cae
FG
18#include "messages/MMonCommand.h"
19#include "messages/MMonJoin.h"
20
21#include "common/ceph_argparse.h"
22#include "common/errno.h"
23#include <sstream>
24#include "common/config.h"
25#include "common/cmdparse.h"
26
11fdf7f2 27#include "include/ceph_assert.h"
7c673cae
FG
28#include "include/stringify.h"
29
30#define dout_subsys ceph_subsys_mon
31#undef dout_prefix
32#define dout_prefix _prefix(_dout, mon)
9f95a23c 33using namespace TOPNSPC::common;
f67539c2
TL
34
35using std::cout;
36using std::dec;
37using std::hex;
38using std::list;
39using std::map;
40using std::make_pair;
41using std::ostream;
42using std::ostringstream;
43using std::pair;
44using std::set;
45using std::setfill;
46using std::string;
47using std::stringstream;
48using std::to_string;
49using std::vector;
50using std::unique_ptr;
51
52using ceph::bufferlist;
53using ceph::decode;
54using ceph::encode;
55using ceph::Formatter;
56using ceph::JSONFormatter;
57using ceph::make_message;
58using ceph::mono_clock;
59using ceph::mono_time;
60using ceph::timespan_str;
61static ostream& _prefix(std::ostream *_dout, Monitor &mon) {
62 return *_dout << "mon." << mon.name << "@" << mon.rank
63 << "(" << mon.get_state_name()
64 << ").monmap v" << mon.monmap->epoch << " ";
7c673cae
FG
65}
66
67void MonmapMonitor::create_initial()
68{
224ce89b 69 dout(10) << __func__ << " using current monmap" << dendl;
f67539c2 70 pending_map = *mon.monmap;
7c673cae 71 pending_map.epoch = 1;
31f18b77 72
11fdf7f2 73 if (g_conf()->mon_debug_no_initial_persistent_features) {
31f18b77
FG
74 derr << __func__ << " mon_debug_no_initial_persistent_features=true"
75 << dendl;
76 } else {
77 // initialize with default persistent features for new clusters
78 pending_map.persistent_features = ceph::features::mon::get_persistent();
11fdf7f2 79 pending_map.min_mon_release = ceph_release();
31f18b77 80 }
7c673cae
FG
81}
82
83void MonmapMonitor::update_from_paxos(bool *need_bootstrap)
84{
85 version_t version = get_last_committed();
f67539c2 86 if (version <= mon.monmap->get_epoch())
7c673cae
FG
87 return;
88
89 dout(10) << __func__ << " version " << version
f67539c2 90 << ", my v " << mon.monmap->epoch << dendl;
7c673cae 91
f67539c2 92 if (need_bootstrap && version != mon.monmap->get_epoch()) {
7c673cae
FG
93 dout(10) << " signaling that we need a bootstrap" << dendl;
94 *need_bootstrap = true;
95 }
96
97 // read and decode
98 monmap_bl.clear();
99 int ret = get_version(version, monmap_bl);
11fdf7f2
TL
100 ceph_assert(ret == 0);
101 ceph_assert(monmap_bl.length());
7c673cae 102
224ce89b 103 dout(10) << __func__ << " got " << version << dendl;
f67539c2 104 mon.monmap->decode(monmap_bl);
7c673cae 105
f67539c2 106 if (mon.store->exists("mkfs", "monmap")) {
7c673cae
FG
107 auto t(std::make_shared<MonitorDBStore::Transaction>());
108 t->erase("mkfs", "monmap");
f67539c2 109 mon.store->apply_transaction(t);
7c673cae
FG
110 }
111
112 check_subs();
11fdf7f2
TL
113
114 // make sure we've recorded min_mon_release
115 string val;
f67539c2 116 if (mon.store->read_meta("min_mon_release", &val) < 0 ||
11fdf7f2
TL
117 val.size() == 0 ||
118 atoi(val.c_str()) != (int)ceph_release()) {
119 dout(10) << __func__ << " updating min_mon_release meta" << dendl;
f67539c2 120 mon.store->write_meta("min_mon_release",
11fdf7f2
TL
121 stringify(ceph_release()));
122 }
f67539c2 123
b3b6e05e 124 mon.notify_new_monmap(true);
7c673cae
FG
125}
126
127void MonmapMonitor::create_pending()
128{
f67539c2 129 pending_map = *mon.monmap;
7c673cae
FG
130 pending_map.epoch++;
131 pending_map.last_changed = ceph_clock_now();
224ce89b 132 dout(10) << __func__ << " monmap epoch " << pending_map.epoch << dendl;
7c673cae
FG
133}
134
135void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t)
136{
224ce89b 137 dout(10) << __func__ << " epoch " << pending_map.epoch << dendl;
7c673cae 138
f67539c2 139 ceph_assert(mon.monmap->epoch + 1 == pending_map.epoch ||
7c673cae
FG
140 pending_map.epoch == 1); // special case mkfs!
141 bufferlist bl;
f67539c2 142 pending_map.encode(bl, mon.get_quorum_con_features());
7c673cae
FG
143
144 put_version(t, pending_map.epoch, bl);
145 put_last_committed(t, pending_map.epoch);
146
147 // generate a cluster fingerprint, too?
148 if (pending_map.epoch == 1) {
f67539c2 149 mon.prepare_new_fingerprint(t);
7c673cae 150 }
f67539c2
TL
151
152 //health
153 health_check_map_t next;
154 pending_map.check_health(&next);
155 encode_health(next, t);
7c673cae
FG
156}
157
158class C_ApplyFeatures : public Context {
159 MonmapMonitor *svc;
160 mon_feature_t features;
9f95a23c 161 ceph_release_t min_mon_release;
11fdf7f2 162public:
9f95a23c 163 C_ApplyFeatures(MonmapMonitor *s, const mon_feature_t& f, ceph_release_t mmr) :
11fdf7f2 164 svc(s), features(f), min_mon_release(mmr) { }
7c673cae
FG
165 void finish(int r) override {
166 if (r >= 0) {
11fdf7f2 167 svc->apply_mon_features(features, min_mon_release);
7c673cae
FG
168 } else if (r == -EAGAIN || r == -ECANCELED) {
169 // discard features if we're no longer on the quorum that
170 // established them in the first place.
171 return;
172 } else {
11fdf7f2 173 ceph_abort_msg("bad C_ApplyFeatures return value");
7c673cae
FG
174 }
175 }
176};
177
11fdf7f2 178void MonmapMonitor::apply_mon_features(const mon_feature_t& features,
9f95a23c 179 ceph_release_t min_mon_release)
7c673cae
FG
180{
181 if (!is_writeable()) {
182 dout(5) << __func__ << " wait for service to be writeable" << dendl;
11fdf7f2 183 wait_for_writeable_ctx(new C_ApplyFeatures(this, features, min_mon_release));
7c673cae
FG
184 return;
185 }
186
11fdf7f2 187 // do nothing here unless we have a full quorum
f67539c2 188 if (mon.get_quorum().size() < mon.monmap->size()) {
11fdf7f2
TL
189 return;
190 }
191
192 ceph_assert(is_writeable());
193 ceph_assert(features.contains_all(pending_map.persistent_features));
7c673cae
FG
194 // we should never hit this because `features` should be the result
195 // of the quorum's supported features. But if it happens, die.
11fdf7f2 196 ceph_assert(ceph::features::mon::get_supported().contains_all(features));
7c673cae
FG
197
198 mon_feature_t new_features =
199 (pending_map.persistent_features ^
200 (features & ceph::features::mon::get_persistent()));
201
11fdf7f2
TL
202 if (new_features.empty() &&
203 pending_map.min_mon_release == min_mon_release) {
81eedcae 204 dout(10) << __func__ << " min_mon_release (" << (int)min_mon_release
11fdf7f2 205 << ") and features (" << features << ") match" << dendl;
7c673cae
FG
206 return;
207 }
208
11fdf7f2
TL
209 if (!new_features.empty()) {
210 dout(1) << __func__ << " applying new features "
211 << new_features << ", had " << pending_map.persistent_features
212 << ", will have "
213 << (new_features | pending_map.persistent_features)
214 << dendl;
215 pending_map.persistent_features |= new_features;
216 }
217 if (min_mon_release > pending_map.min_mon_release) {
218 dout(1) << __func__ << " increasing min_mon_release to "
f67539c2 219 << to_integer<int>(min_mon_release) << " (" << min_mon_release
11fdf7f2
TL
220 << ")" << dendl;
221 pending_map.min_mon_release = min_mon_release;
7c673cae
FG
222 }
223
7c673cae
FG
224 propose_pending();
225}
226
227void MonmapMonitor::on_active()
228{
f67539c2 229 if (get_last_committed() >= 1 && !mon.has_ever_joined) {
7c673cae
FG
230 // make note of the fact that i was, once, part of the quorum.
231 dout(10) << "noting that i was, once, part of an active quorum." << dendl;
232
233 /* This is some form of nasty in-breeding we have between the MonmapMonitor
234 and the Monitor itself. We should find a way to get rid of it given our
235 new architecture. Until then, stick with it since we are a
236 single-threaded process and, truth be told, no one else relies on this
237 thing besides us.
238 */
239 auto t(std::make_shared<MonitorDBStore::Transaction>());
240 t->put(Monitor::MONITOR_NAME, "joined", 1);
f67539c2
TL
241 mon.store->apply_transaction(t);
242 mon.has_ever_joined = true;
7c673cae
FG
243 }
244
f67539c2
TL
245 if (mon.is_leader()) {
246 mon.clog->debug() << "monmap " << *mon.monmap;
b32b8144 247 }
7c673cae 248
f67539c2
TL
249 apply_mon_features(mon.get_quorum_mon_features(),
250 mon.quorum_min_mon_release);
7c673cae
FG
251}
252
253bool MonmapMonitor::preprocess_query(MonOpRequestRef op)
254{
9f95a23c 255 auto m = op->get_req<PaxosServiceMessage>();
7c673cae
FG
256 switch (m->get_type()) {
257 // READs
258 case MSG_MON_COMMAND:
f64942e4
AA
259 try {
260 return preprocess_command(op);
261 }
262 catch (const bad_cmd_get& e) {
263 bufferlist bl;
f67539c2 264 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
f64942e4
AA
265 return true;
266 }
7c673cae
FG
267 case MSG_MON_JOIN:
268 return preprocess_join(op);
269 default:
270 ceph_abort();
271 return true;
272 }
273}
274
275void MonmapMonitor::dump_info(Formatter *f)
276{
277 f->dump_unsigned("monmap_first_committed", get_first_committed());
278 f->dump_unsigned("monmap_last_committed", get_last_committed());
279 f->open_object_section("monmap");
f67539c2 280 mon.monmap->dump(f);
7c673cae
FG
281 f->close_section();
282 f->open_array_section("quorum");
f67539c2 283 for (set<int>::iterator q = mon.get_quorum().begin(); q != mon.get_quorum().end(); ++q)
7c673cae
FG
284 f->dump_int("mon", *q);
285 f->close_section();
286}
287
288bool MonmapMonitor::preprocess_command(MonOpRequestRef op)
289{
9f95a23c 290 auto m = op->get_req<MMonCommand>();
7c673cae
FG
291 int r = -1;
292 bufferlist rdata;
293 stringstream ss;
294
11fdf7f2 295 cmdmap_t cmdmap;
7c673cae
FG
296 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
297 string rs = ss.str();
f67539c2 298 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
7c673cae
FG
299 return true;
300 }
301
302 string prefix;
9f95a23c 303 cmd_getval(cmdmap, "prefix", prefix);
7c673cae 304
11fdf7f2 305 MonSession *session = op->get_session();
7c673cae 306 if (!session) {
f67539c2 307 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
7c673cae
FG
308 return true;
309 }
310
311 string format;
9f95a23c 312 cmd_getval(cmdmap, "format", format, string("plain"));
7c673cae
FG
313 boost::scoped_ptr<Formatter> f(Formatter::create(format));
314
315 if (prefix == "mon stat") {
f91f0fd5
TL
316 if (f) {
317 f->open_object_section("monmap");
f67539c2
TL
318 mon.monmap->dump_summary(f.get());
319 f->dump_string("leader", mon.get_leader_name());
f91f0fd5 320 f->open_array_section("quorum");
f67539c2
TL
321 for (auto rank: mon.get_quorum()) {
322 std::string name = mon.monmap->get_name(rank);
f91f0fd5
TL
323 f->open_object_section("mon");
324 f->dump_int("rank", rank);
325 f->dump_string("name", name);
326 f->close_section(); // mon
327 }
328 f->close_section(); // quorum
329 f->close_section(); // monmap
330 f->flush(ss);
331 } else {
f67539c2
TL
332 mon.monmap->print_summary(ss);
333 ss << ", election epoch " << mon.get_epoch() << ", leader "
334 << mon.get_leader() << " " << mon.get_leader_name()
335 << ", quorum " << mon.get_quorum()
336 << " " << mon.get_quorum_names();
f91f0fd5
TL
337 }
338
7c673cae
FG
339 rdata.append(ss);
340 ss.str("");
341 r = 0;
342
343 } else if (prefix == "mon getmap" ||
344 prefix == "mon dump") {
345
346 epoch_t epoch;
347 int64_t epochnum;
9f95a23c 348 cmd_getval(cmdmap, "epoch", epochnum, (int64_t)0);
7c673cae
FG
349 epoch = epochnum;
350
f67539c2 351 MonMap *p = mon.monmap;
7c673cae
FG
352 if (epoch) {
353 bufferlist bl;
354 r = get_version(epoch, bl);
355 if (r == -ENOENT) {
356 ss << "there is no map for epoch " << epoch;
357 goto reply;
358 }
11fdf7f2
TL
359 ceph_assert(r == 0);
360 ceph_assert(bl.length() > 0);
7c673cae
FG
361 p = new MonMap;
362 p->decode(bl);
363 }
364
11fdf7f2 365 ceph_assert(p);
7c673cae
FG
366
367 if (prefix == "mon getmap") {
368 p->encode(rdata, m->get_connection()->get_features());
369 r = 0;
370 ss << "got monmap epoch " << p->get_epoch();
371 } else if (prefix == "mon dump") {
372 stringstream ds;
373 if (f) {
374 f->open_object_section("monmap");
375 p->dump(f.get());
376 f->open_array_section("quorum");
f67539c2
TL
377 for (set<int>::iterator q = mon.get_quorum().begin();
378 q != mon.get_quorum().end(); ++q) {
7c673cae
FG
379 f->dump_int("mon", *q);
380 }
381 f->close_section();
382 f->close_section();
383 f->flush(ds);
384 r = 0;
385 } else {
386 p->print(ds);
387 r = 0;
388 }
389 rdata.append(ds);
390 ss << "dumped monmap epoch " << p->get_epoch();
391 }
f67539c2 392 if (p != mon.monmap) {
7c673cae 393 delete p;
11fdf7f2
TL
394 p = nullptr;
395 }
7c673cae 396
224ce89b 397 } else if (prefix == "mon feature ls") {
7c673cae
FG
398
399 bool list_with_value = false;
400 string with_value;
9f95a23c 401 if (cmd_getval(cmdmap, "with_value", with_value) &&
7c673cae
FG
402 with_value == "--with-value") {
403 list_with_value = true;
404 }
405
f67539c2 406 MonMap *p = mon.monmap;
7c673cae
FG
407
408 // list features
409 mon_feature_t supported = ceph::features::mon::get_supported();
410 mon_feature_t persistent = ceph::features::mon::get_persistent();
411 mon_feature_t required = p->get_required_features();
412
413 stringstream ds;
414 auto print_feature = [&](mon_feature_t& m_features, const char* m_str) {
415 if (f) {
416 if (list_with_value)
417 m_features.dump_with_value(f.get(), m_str);
418 else
419 m_features.dump(f.get(), m_str);
420 } else {
421 if (list_with_value)
422 m_features.print_with_value(ds);
423 else
424 m_features.print(ds);
425 }
426 };
427
428 if (f) {
429 f->open_object_section("features");
430
431 f->open_object_section("all");
432 print_feature(supported, "supported");
433 print_feature(persistent, "persistent");
434 f->close_section(); // all
435
436 f->open_object_section("monmap");
437 print_feature(p->persistent_features, "persistent");
438 print_feature(p->optional_features, "optional");
439 print_feature(required, "required");
440 f->close_section(); // monmap
441
442 f->close_section(); // features
443 f->flush(ds);
444
445 } else {
446 ds << "all features" << std::endl
447 << "\tsupported: ";
448 print_feature(supported, nullptr);
449 ds << std::endl
450 << "\tpersistent: ";
451 print_feature(persistent, nullptr);
452 ds << std::endl
453 << std::endl;
454
455 ds << "on current monmap (epoch "
456 << p->get_epoch() << ")" << std::endl
457 << "\tpersistent: ";
458 print_feature(p->persistent_features, nullptr);
459 ds << std::endl
460 // omit optional features in plain-text
461 // makes it easier to read, and they're, currently, empty.
462 << "\trequired: ";
463 print_feature(required, nullptr);
464 ds << std::endl;
465 }
466 rdata.append(ds);
467 r = 0;
468 }
469
470reply:
471 if (r != -1) {
472 string rs;
473 getline(ss, rs);
474
f67539c2 475 mon.reply_command(op, r, rs, rdata, get_last_committed());
7c673cae
FG
476 return true;
477 } else
478 return false;
479}
480
481
482bool MonmapMonitor::prepare_update(MonOpRequestRef op)
483{
9f95a23c 484 auto m = op->get_req<PaxosServiceMessage>();
224ce89b 485 dout(7) << __func__ << " " << *m << " from " << m->get_orig_source_inst() << dendl;
7c673cae
FG
486
487 switch (m->get_type()) {
488 case MSG_MON_COMMAND:
f64942e4
AA
489 try {
490 return prepare_command(op);
11fdf7f2 491 } catch (const bad_cmd_get& e) {
f64942e4 492 bufferlist bl;
f67539c2 493 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
f64942e4
AA
494 return true;
495 }
7c673cae
FG
496 case MSG_MON_JOIN:
497 return prepare_join(op);
498 default:
499 ceph_abort();
500 }
501
502 return false;
503}
504
505bool MonmapMonitor::prepare_command(MonOpRequestRef op)
506{
9f95a23c 507 auto m = op->get_req<MMonCommand>();
7c673cae
FG
508 stringstream ss;
509 string rs;
510 int err = -EINVAL;
511
11fdf7f2 512 cmdmap_t cmdmap;
7c673cae
FG
513 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
514 string rs = ss.str();
f67539c2 515 mon.reply_command(op, -EINVAL, rs, get_last_committed());
7c673cae
FG
516 return true;
517 }
518
519 string prefix;
9f95a23c 520 cmd_getval(cmdmap, "prefix", prefix);
7c673cae 521
11fdf7f2 522 MonSession *session = op->get_session();
7c673cae 523 if (!session) {
f67539c2 524 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
7c673cae
FG
525 return true;
526 }
527
528 /* We should follow the following rules:
529 *
530 * - 'monmap' is the current, consistent version of the monmap
531 * - 'pending_map' is the uncommitted version of the monmap
532 *
533 * All checks for the current state must be made against 'monmap'.
534 * All changes are made against 'pending_map'.
535 *
536 * If there are concurrent operations modifying 'pending_map', please
537 * follow the following rules.
538 *
539 * - if pending_map has already been changed, the second operation must
540 * wait for the proposal to finish and be run again; This is the easiest
541 * path to guarantee correctness but may impact performance (i.e., it
542 * will take longer for the user to get a reply).
543 *
544 * - if the result of the second operation can be guaranteed to be
545 * idempotent, the operation may reply to the user once the proposal
546 * finishes; still needs to wait for the proposal to finish.
547 *
548 * - An operation _NEVER_ returns to the user based on pending state.
549 *
550 * If an operation does not modify current stable monmap, it may be
551 * serialized before current pending map, regardless of any change that
552 * has been made to the pending map -- remember, pending is uncommitted
553 * state, thus we are not bound by it.
554 */
555
f67539c2
TL
556 ceph_assert(mon.monmap);
557 MonMap &monmap = *mon.monmap;
7c673cae
FG
558
559
560 /* Please note:
561 *
562 * Adding or removing monitors may lead to loss of quorum.
563 *
564 * Because quorum may be lost, it's important to reply something
565 * to the user, lest she end up waiting forever for a reply. And
566 * no reply will ever be sent until quorum is formed again.
567 *
568 * On the other hand, this means we're leaking uncommitted state
569 * to the user. As such, please be mindful of the reply message.
570 *
571 * e.g., 'adding monitor mon.foo' is okay ('adding' is an on-going
572 * operation and conveys its not-yet-permanent nature); whereas
573 * 'added monitor mon.foo' presumes the action has successfully
574 * completed and state has been committed, which may not be true.
575 */
576
577
578 bool propose = false;
579 if (prefix == "mon add") {
580 string name;
9f95a23c 581 cmd_getval(cmdmap, "name", name);
7c673cae 582 string addrstr;
9f95a23c 583 cmd_getval(cmdmap, "addr", addrstr);
7c673cae
FG
584 entity_addr_t addr;
585 bufferlist rdata;
586
587 if (!addr.parse(addrstr.c_str())) {
588 err = -EINVAL;
589 ss << "addr " << addrstr << "does not parse";
590 goto reply;
591 }
592
f67539c2
TL
593 vector<string> locationvec;
594 map<string, string> loc;
595 cmd_getval(cmdmap, "location", locationvec);
596 CrushWrapper::parse_loc_map(locationvec, &loc);
597 if (locationvec.size() &&
598 !mon.get_quorum_mon_features().contains_all(
599 ceph::features::mon::FEATURE_PINGING)) {
600 err = -ENOTSUP;
601 ss << "Not all monitors support adding monitors with a location; please upgrade first!";
602 goto reply;
603 }
604 if (locationvec.size() && !loc.size()) {
605 ss << "We could not parse your input location to anything real; " << locationvec
606 << " turned into an empty map!";
607 err = -EINVAL;
608 goto reply;
609 }
610
611 dout(10) << "mon add setting location for " << name << " to " << loc << dendl;
612
613 // TODO: validate location in crush map
614 if (monmap.stretch_mode_enabled && !loc.size()) {
615 ss << "We are in stretch mode and new monitors must have a location, but "
616 << "could not parse your input location to anything real; " << locationvec
617 << " turned into an empty map!";
618 err = -EINVAL;
619 goto reply;
620 }
621 // TODO: validate location against any existing stretch config
622
11fdf7f2
TL
623 entity_addrvec_t addrs;
624 if (monmap.persistent_features.contains_all(
625 ceph::features::mon::FEATURE_NAUTILUS)) {
626 if (addr.get_port() == CEPH_MON_PORT_IANA) {
627 addr.set_type(entity_addr_t::TYPE_MSGR2);
628 }
629 if (addr.get_port() == CEPH_MON_PORT_LEGACY) {
630 // if they specified the *old* default they probably don't care
631 addr.set_port(0);
632 }
633 if (addr.get_port()) {
634 addrs.v.push_back(addr);
635 } else {
636 addr.set_type(entity_addr_t::TYPE_MSGR2);
637 addr.set_port(CEPH_MON_PORT_IANA);
638 addrs.v.push_back(addr);
639 addr.set_type(entity_addr_t::TYPE_LEGACY);
640 addr.set_port(CEPH_MON_PORT_LEGACY);
641 addrs.v.push_back(addr);
642 }
643 } else {
644 if (addr.get_port() == 0) {
645 addr.set_port(CEPH_MON_PORT_LEGACY);
646 }
647 addr.set_type(entity_addr_t::TYPE_LEGACY);
648 addrs.v.push_back(addr);
7c673cae 649 }
11fdf7f2 650 dout(20) << __func__ << " addr " << addr << " -> addrs " << addrs << dendl;
7c673cae
FG
651
652 /**
653 * If we have a monitor with the same name and different addr, then EEXIST
654 * If we have a monitor with the same addr and different name, then EEXIST
655 * If we have a monitor with the same addr and same name, then wait for
656 * the proposal to finish and return success.
657 * If we don't have the monitor, add it.
658 */
659
660 err = 0;
661 if (!ss.str().empty())
662 ss << "; ";
663
664 do {
665 if (monmap.contains(name)) {
11fdf7f2 666 if (monmap.get_addrs(name) == addrs) {
7c673cae
FG
667 // stable map contains monitor with the same name at the same address.
668 // serialize before current pending map.
669 err = 0; // for clarity; this has already been set above.
11fdf7f2 670 ss << "mon." << name << " at " << addrs << " already exists";
7c673cae
FG
671 goto reply;
672 } else {
673 ss << "mon." << name
11fdf7f2 674 << " already exists at address " << monmap.get_addrs(name);
7c673cae 675 }
11fdf7f2 676 } else if (monmap.contains(addrs)) {
7c673cae 677 // we established on the previous branch that name is different
11fdf7f2 678 ss << "mon." << monmap.get_name(addrs)
7c673cae
FG
679 << " already exists at address " << addr;
680 } else {
681 // go ahead and add
682 break;
683 }
684 err = -EEXIST;
685 goto reply;
686 } while (false);
687
f67539c2
TL
688 if (pending_map.stretch_mode_enabled) {
689
690 }
691
7c673cae
FG
692 /* Given there's no delay between proposals on the MonmapMonitor (see
693 * MonmapMonitor::should_propose()), there is no point in checking for
694 * a mismatch between name and addr on pending_map.
695 *
696 * Once we established the monitor does not exist in the committed state,
697 * we can simply go ahead and add the monitor.
698 */
699
11fdf7f2 700 pending_map.add(name, addrs);
f67539c2 701 pending_map.mon_info[name].crush_loc = loc;
7c673cae 702 pending_map.last_changed = ceph_clock_now();
11fdf7f2 703 ss << "adding mon." << name << " at " << addrs;
7c673cae
FG
704 propose = true;
705 dout(0) << __func__ << " proposing new mon." << name << dendl;
706
707 } else if (prefix == "mon remove" ||
708 prefix == "mon rm") {
709 string name;
9f95a23c 710 cmd_getval(cmdmap, "name", name);
7c673cae
FG
711 if (!monmap.contains(name)) {
712 err = 0;
713 ss << "mon." << name << " does not exist or has already been removed";
714 goto reply;
715 }
716
717 if (monmap.size() == 1) {
718 err = -EINVAL;
719 ss << "error: refusing removal of last monitor " << name;
720 goto reply;
721 }
722
a4b75251
TL
723 if (pending_map.stretch_mode_enabled &&
724 name == pending_map.tiebreaker_mon) {
725 err = -EINVAL;
726 ss << "you cannot remove stretch mode's tiebreaker monitor";
727 goto reply;
728 }
7c673cae
FG
729 /* At the time of writing, there is no risk of races when multiple clients
730 * attempt to use the same name. The reason is simple but may not be
731 * obvious.
732 *
733 * In a nutshell, we do not collate proposals on the MonmapMonitor. As
734 * soon as we return 'true' below, PaxosService::dispatch() will check if
735 * the service should propose, and - if so - the service will be marked as
736 * 'proposing' and a proposal will be triggered. The PaxosService class
737 * guarantees that once a service is marked 'proposing' no further writes
738 * will be handled.
739 *
740 * The decision on whether the service should propose or not is, in this
741 * case, made by MonmapMonitor::should_propose(), which always considers
742 * the proposal delay being 0.0 seconds. This is key for PaxosService to
743 * trigger the proposal immediately.
744 * 0.0 seconds of delay.
745 *
746 * From the above, there's no point in performing further checks on the
747 * pending_map, as we don't ever have multiple proposals in-flight in
748 * this service. As we've established the committed state contains the
749 * monitor, we can simply go ahead and remove it.
750 *
751 * Please note that the code hinges on all of the above to be true. It
752 * has been true since time immemorial and we don't see a good reason
753 * to make it sturdier at this time - mainly because we don't think it's
754 * going to change any time soon, lest for any bug that may be unwillingly
755 * introduced.
756 */
757
11fdf7f2 758 entity_addrvec_t addrs = pending_map.get_addrs(name);
7c673cae 759 pending_map.remove(name);
a4b75251 760 pending_map.disallowed_leaders.erase(name);
7c673cae 761 pending_map.last_changed = ceph_clock_now();
11fdf7f2 762 ss << "removing mon." << name << " at " << addrs
7c673cae
FG
763 << ", there will be " << pending_map.size() << " monitors" ;
764 propose = true;
765 err = 0;
766
767 } else if (prefix == "mon feature set") {
768
769 /* PLEASE NOTE:
770 *
771 * We currently only support setting/unsetting persistent features.
772 * This is by design, given at the moment we still don't have optional
773 * features, and, as such, there is no point introducing an interface
774 * to manipulate them. This allows us to provide a cleaner, more
775 * intuitive interface to the user, modifying solely persistent
776 * features.
777 *
778 * In the future we should consider adding another interface to handle
779 * optional features/flags; e.g., 'mon feature flag set/unset', or
780 * 'mon flag set/unset'.
781 */
782 string feature_name;
9f95a23c 783 if (!cmd_getval(cmdmap, "feature_name", feature_name)) {
7c673cae
FG
784 ss << "missing required feature name";
785 err = -EINVAL;
786 goto reply;
787 }
788
789 mon_feature_t feature;
790 feature = ceph::features::mon::get_feature_by_name(feature_name);
791 if (feature == ceph::features::mon::FEATURE_NONE) {
792 ss << "unknown feature '" << feature_name << "'";
793 err = -ENOENT;
794 goto reply;
795 }
796
11fdf7f2 797 bool sure = false;
9f95a23c 798 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 799 if (!sure) {
7c673cae
FG
800 ss << "please specify '--yes-i-really-mean-it' if you "
801 << "really, **really** want to set feature '"
802 << feature << "' in the monmap.";
803 err = -EPERM;
804 goto reply;
805 }
806
f67539c2 807 if (!mon.get_quorum_mon_features().contains_all(feature)) {
7c673cae
FG
808 ss << "current quorum does not support feature '" << feature
809 << "'; supported features: "
f67539c2 810 << mon.get_quorum_mon_features();
7c673cae
FG
811 err = -EINVAL;
812 goto reply;
813 }
814
815 ss << "setting feature '" << feature << "'";
816
817 err = 0;
818 if (monmap.persistent_features.contains_all(feature)) {
819 dout(10) << __func__ << " feature '" << feature
820 << "' already set on monmap; no-op." << dendl;
821 goto reply;
822 }
823
824 pending_map.persistent_features.set_feature(feature);
825 pending_map.last_changed = ceph_clock_now();
826 propose = true;
827
11fdf7f2 828 dout(1) << __func__ << " " << ss.str() << "; new features will be: "
7c673cae
FG
829 << "persistent = " << pending_map.persistent_features
830 // output optional nevertheless, for auditing purposes.
831 << ", optional = " << pending_map.optional_features << dendl;
11fdf7f2
TL
832
833 } else if (prefix == "mon set-rank") {
834 string name;
835 int64_t rank;
9f95a23c
TL
836 if (!cmd_getval(cmdmap, "name", name) ||
837 !cmd_getval(cmdmap, "rank", rank)) {
11fdf7f2
TL
838 err = -EINVAL;
839 goto reply;
840 }
841 int oldrank = pending_map.get_rank(name);
842 if (oldrank < 0) {
843 ss << "mon." << name << " does not exist in monmap";
844 err = -ENOENT;
845 goto reply;
846 }
847 err = 0;
848 pending_map.set_rank(name, rank);
849 pending_map.last_changed = ceph_clock_now();
850 propose = true;
851 } else if (prefix == "mon set-addrs") {
852 string name;
853 string addrs;
9f95a23c
TL
854 if (!cmd_getval(cmdmap, "name", name) ||
855 !cmd_getval(cmdmap, "addrs", addrs)) {
11fdf7f2
TL
856 err = -EINVAL;
857 goto reply;
858 }
859 if (!pending_map.contains(name)) {
860 ss << "mon." << name << " does not exist";
861 err = -ENOENT;
862 goto reply;
863 }
864 entity_addrvec_t av;
865 if (!av.parse(addrs.c_str(), nullptr)) {
866 ss << "failed to parse addrs '" << addrs << "'";
867 err = -EINVAL;
868 goto reply;
869 }
870 for (auto& a : av.v) {
871 a.set_nonce(0);
872 if (!a.get_port()) {
873 ss << "monitor must bind to a non-zero port, not " << a;
874 err = -EINVAL;
875 goto reply;
876 }
877 }
878 err = 0;
879 pending_map.set_addrvec(name, av);
880 pending_map.last_changed = ceph_clock_now();
881 propose = true;
9f95a23c
TL
882 } else if (prefix == "mon set-weight") {
883 string name;
884 int64_t weight;
885 if (!cmd_getval(cmdmap, "name", name) ||
886 !cmd_getval(cmdmap, "weight", weight)) {
887 err = -EINVAL;
888 goto reply;
889 }
890 if (!pending_map.contains(name)) {
891 ss << "mon." << name << " does not exist";
892 err = -ENOENT;
893 goto reply;
894 }
895 err = 0;
896 pending_map.set_weight(name, weight);
897 pending_map.last_changed = ceph_clock_now();
898 propose = true;
11fdf7f2
TL
899 } else if (prefix == "mon enable-msgr2") {
900 if (!monmap.get_required_features().contains_all(
901 ceph::features::mon::FEATURE_NAUTILUS)) {
902 err = -EACCES;
903 ss << "all monitors must be running nautilus to enable v2";
904 goto reply;
905 }
906 for (auto& i : pending_map.mon_info) {
907 if (i.second.public_addrs.v.size() == 1 &&
908 i.second.public_addrs.front().is_legacy() &&
909 i.second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) {
910 entity_addrvec_t av;
911 entity_addr_t a = i.second.public_addrs.front();
912 a.set_type(entity_addr_t::TYPE_MSGR2);
913 a.set_port(CEPH_MON_PORT_IANA);
914 av.v.push_back(a);
915 av.v.push_back(i.second.public_addrs.front());
916 dout(10) << " setting mon." << i.first
917 << " addrs " << i.second.public_addrs
918 << " -> " << av << dendl;
919 pending_map.set_addrvec(i.first, av);
920 propose = true;
921 pending_map.last_changed = ceph_clock_now();
922 }
923 }
924 err = 0;
f67539c2
TL
925 } else if (prefix == "mon set election_strategy") {
926 if (!mon.get_quorum_mon_features().contains_all(
927 ceph::features::mon::FEATURE_PINGING)) {
928 err = -ENOTSUP;
929 ss << "Not all monitors support changing election strategies; please upgrade first!";
930 goto reply;
931 }
932 string strat;
933 MonMap::election_strategy strategy;
934 if (!cmd_getval(cmdmap, "strategy", strat)) {
935 err = -EINVAL;
936 goto reply;
937 }
938 if (strat == "classic") {
939 strategy = MonMap::CLASSIC;
940 } else if (strat == "disallow") {
941 strategy = MonMap::DISALLOW;
942 } else if (strat == "connectivity") {
943 strategy = MonMap::CONNECTIVITY;
944 } else {
945 err = -EINVAL;
946 goto reply;
947 }
948 err = 0;
949 pending_map.strategy = strategy;
a4b75251 950 pending_map.last_changed = ceph_clock_now();
f67539c2
TL
951 propose = true;
952 } else if (prefix == "mon add disallowed_leader") {
953 if (!mon.get_quorum_mon_features().contains_all(
954 ceph::features::mon::FEATURE_PINGING)) {
955 err = -ENOTSUP;
956 ss << "Not all monitors support changing election strategies; please upgrade first!";
957 goto reply;
958 }
959 string name;
960 if (!cmd_getval(cmdmap, "name", name)) {
961 err = -EINVAL;
962 goto reply;
963 }
964 if (pending_map.strategy != MonMap::DISALLOW &&
965 pending_map.strategy != MonMap::CONNECTIVITY) {
966 ss << "You cannot disallow monitors in your current election mode";
967 err = -EINVAL;
968 goto reply;
969 }
970 if (!pending_map.contains(name)) {
971 ss << "mon." << name << " does not exist";
972 err = -ENOENT;
973 goto reply;
974 }
975 if (pending_map.disallowed_leaders.count(name)) {
976 ss << "mon." << name << " is already disallowed";
977 err = 0;
978 goto reply;
979 }
980 if (pending_map.disallowed_leaders.size() == pending_map.size() - 1) {
981 ss << "mon." << name << " is the only remaining allowed leader!";
982 err = -EINVAL;
983 goto reply;
984 }
985 pending_map.disallowed_leaders.insert(name);
a4b75251 986 pending_map.last_changed = ceph_clock_now();
f67539c2
TL
987 err = 0;
988 propose = true;
989 } else if (prefix == "mon rm disallowed_leader") {
990 if (!mon.get_quorum_mon_features().contains_all(
991 ceph::features::mon::FEATURE_PINGING)) {
992 err = -ENOTSUP;
993 ss << "Not all monitors support changing election strategies; please upgrade first!";
994 goto reply;
995 }
996 string name;
997 if (!cmd_getval(cmdmap, "name", name)) {
998 err = -EINVAL;
999 goto reply;
1000 }
1001 if (pending_map.strategy != MonMap::DISALLOW &&
1002 pending_map.strategy != MonMap::CONNECTIVITY) {
1003 ss << "You cannot disallow monitors in your current election mode";
1004 err = -EINVAL;
1005 goto reply;
1006 }
1007 if (!pending_map.contains(name)) {
1008 ss << "mon." << name << " does not exist";
1009 err = -ENOENT;
1010 goto reply;
1011 }
1012 if (!pending_map.disallowed_leaders.count(name)) {
1013 ss << "mon." << name << " is already allowed";
1014 err = 0;
1015 goto reply;
1016 }
1017 pending_map.disallowed_leaders.erase(name);
a4b75251 1018 pending_map.last_changed = ceph_clock_now();
f67539c2
TL
1019 err = 0;
1020 propose = true;
1021 } else if (prefix == "mon set_location") {
1022 if (!mon.get_quorum_mon_features().contains_all(
1023 ceph::features::mon::FEATURE_PINGING)) {
1024 err = -ENOTSUP;
1025 ss << "Not all monitors support monitor locations; please upgrade first!";
1026 goto reply;
1027 }
1028 string name;
1029 if (!cmd_getval(cmdmap, "name", name)) {
1030 err = -EINVAL;
1031 goto reply;
1032 }
1033 if (!pending_map.contains(name)) {
1034 ss << "mon." << name << " does not exist";
1035 err = -ENOENT;
1036 goto reply;
1037 }
1038
f67539c2
TL
1039 vector<string> argvec;
1040 map<string, string> loc;
1041 cmd_getval(cmdmap, "args", argvec);
1042 CrushWrapper::parse_loc_map(argvec, &loc);
1043
1044 dout(10) << "mon set_location for " << name << " to " << loc << dendl;
1045
1046 // TODO: validate location in crush map
1047 if (!loc.size()) {
1048 ss << "We could not parse your input location to anything real; " << argvec
1049 << " turned into an empty map!";
1050 err = -EINVAL;
1051 goto reply;
1052 }
1053 // TODO: validate location against any existing stretch config
1054 pending_map.mon_info[name].crush_loc = loc;
a4b75251
TL
1055 pending_map.last_changed = ceph_clock_now();
1056 err = 0;
1057 propose = true;
1058 } else if (prefix == "mon set_new_tiebreaker") {
1059 if (!pending_map.stretch_mode_enabled) {
1060 err = -EINVAL;
1061 ss << "Stretch mode is not enabled, so there is no tiebreaker";
1062 goto reply;
1063 }
1064 string name;
1065 if (!cmd_getval(cmdmap, "name", name)) {
1066 err = -EINVAL;
1067 goto reply;
1068 }
1069 bool sure = false;
1070 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
1071
1072 const auto &existing_tiebreaker_info_i = pending_map.mon_info.find(pending_map.tiebreaker_mon);
1073 const auto &new_tiebreaker_info_i = pending_map.mon_info.find(name);
1074 if (new_tiebreaker_info_i == pending_map.mon_info.end()) {
1075 ss << "mon." << name << " does not exist";
1076 err = -ENOENT;
1077 goto reply;
1078 }
1079 const auto& new_info = new_tiebreaker_info_i->second;
1080 if (new_info.crush_loc.empty()) {
1081 ss << "mon." << name << " does not have a location specified";
1082 err = -EINVAL;
1083 goto reply;
1084 }
1085
1086 if (!mon.osdmon()->is_readable()) {
1087 dout(10) << __func__
1088 << ": waiting for osdmon readable to inspect crush barrier"
1089 << dendl;
1090 mon.osdmon()->wait_for_readable(op, new Monitor::C_RetryMessage(&mon, op));
1091 return false;
1092 }
1093 int32_t stretch_divider_id = mon.osdmon()->osdmap.stretch_mode_bucket;
1094 string stretch_bucket_divider = mon.osdmon()->osdmap.crush->
1095 get_type_name(stretch_divider_id);
1096
1097 const auto& new_loc_i = new_info.crush_loc.find(stretch_bucket_divider);
1098 if (new_loc_i == new_info.crush_loc.end()) {
1099 ss << "mon." << name << " has a specificed location, but not a "
1100 << stretch_bucket_divider << ", which is the stretch divider";
1101 err = -EINVAL;
1102 goto reply;
1103 }
1104 const string& new_loc = new_loc_i->second;
1105 set<string> matching_mons;
1106 for (const auto& mii : pending_map.mon_info) {
1107 const auto& other_loc_i = mii.second.crush_loc.find(stretch_bucket_divider);
1108 if (mii.first == name) {
1109 continue;
1110 }
1111 if (other_loc_i == mii.second.crush_loc.end()) { // huh
1112 continue;
1113 }
1114 const string& other_loc = other_loc_i->second;
1115 if (other_loc == new_loc &&
1116 mii.first != existing_tiebreaker_info_i->first) {
1117 matching_mons.insert(mii.first);
1118 }
1119 }
1120 if (!matching_mons.empty()) {
1121 ss << "mon." << name << " has location " << new_loc_i->second
1122 << ", which matches mons " << matching_mons << " on the "
1123 << stretch_bucket_divider << " dividing bucket for stretch mode. "
1124 "Pass --yes-i-really-mean-it if you're sure you want to do this."
1125 "(You really don't.)";
1126 err = -EINVAL;
1127 goto reply;
1128 }
1129 pending_map.tiebreaker_mon = name;
1130 pending_map.disallowed_leaders.insert(name);
1131 pending_map.last_changed = ceph_clock_now();
f67539c2
TL
1132 err = 0;
1133 propose = true;
1134 } else if (prefix == "mon enable_stretch_mode") {
1135 if (!mon.osdmon()->is_writeable()) {
a4b75251 1136 dout(10) << __func__
f67539c2
TL
1137 << ": waiting for osdmon writeable for stretch mode" << dendl;
1138 mon.osdmon()->wait_for_writeable(op, new Monitor::C_RetryMessage(&mon, op));
1139 return false;
1140 }
1141 {
1142 if (monmap.stretch_mode_enabled) {
1143 ss << "stretch mode is already engaged";
1144 err = -EINVAL;
1145 goto reply;
1146 }
1147 if (pending_map.stretch_mode_enabled) {
1148 ss << "stretch mode currently committing";
1149 err = 0;
1150 goto reply;
1151 }
1152 string tiebreaker_mon;
1153 if (!cmd_getval(cmdmap, "tiebreaker_mon", tiebreaker_mon)) {
1154 ss << "must specify a tiebreaker monitor";
1155 err = -EINVAL;
1156 goto reply;
1157 }
1158 string new_crush_rule;
1159 if (!cmd_getval(cmdmap, "new_crush_rule", new_crush_rule)) {
1160 ss << "must specify a new crush rule that spreads out copies over multiple sites";
1161 err = -EINVAL;
1162 goto reply;
1163 }
1164 string dividing_bucket;
1165 if (!cmd_getval(cmdmap, "dividing_bucket", dividing_bucket)) {
1166 ss << "must specify a dividing bucket";
1167 err = -EINVAL;
1168 goto reply;
1169 }
1170 //okay, initial arguments make sense, check pools and cluster state
1171 err = mon.osdmon()->check_cluster_features(CEPH_FEATUREMASK_STRETCH_MODE, ss);
1172 if (err)
1173 goto reply;
1174 struct Plugger {
1175 Paxos &p;
1176 Plugger(Paxos &p) : p(p) { p.plug(); }
1177 ~Plugger() { p.unplug(); }
1178 } plugger(paxos);
1179
1180 set<pg_pool_t*> pools;
1181 bool okay = false;
1182 int errcode = 0;
1183
1184 mon.osdmon()->try_enable_stretch_mode_pools(ss, &okay, &errcode,
1185 &pools, new_crush_rule);
1186 if (!okay) {
1187 err = errcode;
1188 goto reply;
1189 }
1190 try_enable_stretch_mode(ss, &okay, &errcode, false,
1191 tiebreaker_mon, dividing_bucket);
1192 if (!okay) {
1193 err = errcode;
1194 goto reply;
1195 }
1196 mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, false,
1197 dividing_bucket, 2, pools, new_crush_rule);
1198 if (!okay) {
1199 err = errcode;
1200 goto reply;
1201 }
1202 // everything looks good, actually commit the changes!
1203 try_enable_stretch_mode(ss, &okay, &errcode, true,
1204 tiebreaker_mon, dividing_bucket);
1205 mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, true,
1206 dividing_bucket,
1207 2, // right now we only support 2 sites
1208 pools, new_crush_rule);
1209 ceph_assert(okay == true);
1210 }
1211 request_proposal(mon.osdmon());
1212 err = 0;
1213 propose = true;
7c673cae
FG
1214 } else {
1215 ss << "unknown command " << prefix;
1216 err = -EINVAL;
1217 }
1218
1219reply:
1220 getline(ss, rs);
f67539c2 1221 mon.reply_command(op, err, rs, get_last_committed());
7c673cae
FG
1222 // we are returning to the user; do not propose.
1223 return propose;
1224}
1225
f67539c2
TL
1226void MonmapMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
1227 int *errcode, bool commit,
1228 const string& tiebreaker_mon,
1229 const string& dividing_bucket)
1230{
1231 dout(20) << __func__ << dendl;
1232 *okay = false;
1233 if (pending_map.strategy != MonMap::CONNECTIVITY) {
1234 ss << "Monitors must use the connectivity strategy to enable stretch mode";
1235 *errcode = -EINVAL;
1236 ceph_assert(!commit);
1237 return;
1238 }
1239 if (!pending_map.contains(tiebreaker_mon)) {
1240 ss << "mon " << tiebreaker_mon << "does not seem to exist";
1241 *errcode = -ENOENT;
1242 ceph_assert(!commit);
1243 return;
1244 }
1245 map<string,string> buckets;
1246 for (const auto&mii : mon.monmap->mon_info) {
1247 const auto& mi = mii.second;
1248 const auto& bi = mi.crush_loc.find(dividing_bucket);
1249 if (bi == mi.crush_loc.end()) {
1250 ss << "Could not find location entry for " << dividing_bucket
1251 << " on monitor " << mi.name;
1252 *errcode = -EINVAL;
1253 ceph_assert(!commit);
1254 return;
1255 }
1256 buckets[mii.first] = bi->second;
1257 }
1258 string bucket1, bucket2, tiebreaker_bucket;
1259 for (auto& i : buckets) {
1260 if (i.first == tiebreaker_mon) {
1261 tiebreaker_bucket = i.second;
1262 continue;
1263 }
1264 if (bucket1.empty()) {
1265 bucket1 = i.second;
1266 }
1267 if (bucket1 != i.second &&
1268 bucket2.empty()) {
1269 bucket2 = i.second;
1270 }
1271 if (bucket1 != i.second &&
1272 bucket2 != i.second) {
1273 ss << "There are too many monitor buckets for stretch mode, found "
1274 << bucket1 << "," << bucket2 << "," << i.second;
1275 *errcode = -EINVAL;
1276 ceph_assert(!commit);
1277 return;
1278 }
1279 }
1280 if (bucket1.empty() || bucket2.empty()) {
1281 ss << "There are not enough monitor buckets for stretch mode;"
1282 << " must have at least 2 plus the tiebreaker but only found "
1283 << (bucket1.empty() ? bucket1 : bucket2);
1284 *errcode = -EINVAL;
1285 ceph_assert(!commit);
1286 return;
1287 }
1288 if (tiebreaker_bucket == bucket1 ||
1289 tiebreaker_bucket == bucket2) {
1290 ss << "The named tiebreaker monitor " << tiebreaker_mon
1291 << " is in the same CRUSH bucket " << tiebreaker_bucket
1292 << " as other monitors";
1293 *errcode = -EINVAL;
1294 ceph_assert(!commit);
1295 return;
1296 }
1297 if (commit) {
1298 pending_map.disallowed_leaders.insert(tiebreaker_mon);
1299 pending_map.tiebreaker_mon = tiebreaker_mon;
1300 pending_map.stretch_mode_enabled = true;
1301 }
1302 *okay = true;
1303}
1304
1305void MonmapMonitor::trigger_degraded_stretch_mode(const set<string>& dead_mons)
1306{
1307 dout(20) << __func__ << dendl;
1308 pending_map.stretch_marked_down_mons.insert(dead_mons.begin(), dead_mons.end());
1309 propose_pending();
1310}
1311
1312void MonmapMonitor::trigger_healthy_stretch_mode()
1313{
1314 dout(20) << __func__ << dendl;
1315 pending_map.stretch_marked_down_mons.clear();
1316 propose_pending();
1317}
1318
7c673cae
FG
1319bool MonmapMonitor::preprocess_join(MonOpRequestRef op)
1320{
9f95a23c 1321 auto join = op->get_req<MMonJoin>();
11fdf7f2 1322 dout(10) << __func__ << " " << join->name << " at " << join->addrs << dendl;
7c673cae 1323
11fdf7f2 1324 MonSession *session = op->get_session();
7c673cae
FG
1325 if (!session ||
1326 !session->is_capable("mon", MON_CAP_W | MON_CAP_X)) {
1327 dout(10) << " insufficient caps" << dendl;
1328 return true;
1329 }
1330
b3b6e05e
TL
1331 const auto name_info_i = pending_map.mon_info.find(join->name);
1332 if (name_info_i != pending_map.mon_info.end() &&
1333 !name_info_i->second.public_addrs.front().is_blank_ip() &&
1334 (!join->force_loc || join->crush_loc == name_info_i->second.crush_loc)) {
7c673cae
FG
1335 dout(10) << " already have " << join->name << dendl;
1336 return true;
1337 }
b3b6e05e
TL
1338 string addr_name;
1339 if (pending_map.contains(join->addrs)) {
1340 addr_name = pending_map.get_name(join->addrs);
1341 }
1342 if (!addr_name.empty() &&
1343 addr_name == join->name &&
1344 (!join->force_loc || join->crush_loc.empty() ||
1345 pending_map.mon_info[addr_name].crush_loc == join->crush_loc)) {
11fdf7f2 1346 dout(10) << " already have " << join->addrs << dendl;
7c673cae
FG
1347 return true;
1348 }
b3b6e05e
TL
1349 if (pending_map.stretch_mode_enabled &&
1350 join->crush_loc.empty() &&
1351 (addr_name.empty() ||
1352 pending_map.mon_info[addr_name].crush_loc.empty())) {
1353 dout(10) << "stretch mode engaged but no source of crush_loc" << dendl;
1354 mon.clog->info() << join->name << " attempted to join from " << join->name
1355 << ' ' << join->addrs
1356 << "; but lacks a crush_location for stretch mode";
1357 return true;
1358 }
7c673cae
FG
1359 return false;
1360}
b3b6e05e 1361
7c673cae
FG
1362bool MonmapMonitor::prepare_join(MonOpRequestRef op)
1363{
9f95a23c 1364 auto join = op->get_req<MMonJoin>();
11fdf7f2
TL
1365 dout(0) << "adding/updating " << join->name
1366 << " at " << join->addrs << " to monitor cluster" << dendl;
f67539c2
TL
1367 map<string,string> existing_loc;
1368 if (pending_map.contains(join->addrs)) {
1369 string name = pending_map.get_name(join->addrs);
1370 existing_loc = pending_map.mon_info[name].crush_loc;
1371 pending_map.remove(name);
1372 }
7c673cae
FG
1373 if (pending_map.contains(join->name))
1374 pending_map.remove(join->name);
11fdf7f2 1375 pending_map.add(join->name, join->addrs);
b3b6e05e
TL
1376 pending_map.mon_info[join->name].crush_loc =
1377 ((join->force_loc || existing_loc.empty()) ?
1378 join->crush_loc : existing_loc);
7c673cae
FG
1379 pending_map.last_changed = ceph_clock_now();
1380 return true;
1381}
1382
1383bool MonmapMonitor::should_propose(double& delay)
1384{
1385 delay = 0.0;
1386 return true;
1387}
1388
7c673cae
FG
1389int MonmapMonitor::get_monmap(bufferlist &bl)
1390{
1391 version_t latest_ver = get_last_committed();
1392 dout(10) << __func__ << " ver " << latest_ver << dendl;
1393
f67539c2 1394 if (!mon.store->exists(get_service_name(), stringify(latest_ver)))
7c673cae
FG
1395 return -ENOENT;
1396
1397 int err = get_version(latest_ver, bl);
1398 if (err < 0) {
1399 dout(1) << __func__ << " error obtaining monmap: "
1400 << cpp_strerror(err) << dendl;
1401 return err;
1402 }
1403 return 0;
1404}
1405
1406void MonmapMonitor::check_subs()
1407{
1408 const string type = "monmap";
f67539c2 1409 mon.with_session_map([this, &type](const MonSessionMap& session_map) {
7c673cae
FG
1410 auto subs = session_map.subs.find(type);
1411 if (subs == session_map.subs.end())
1412 return;
1413 for (auto sub : *subs->second) {
1414 check_sub(sub);
1415 }
1416 });
1417}
1418
1419void MonmapMonitor::check_sub(Subscription *sub)
1420{
f67539c2 1421 const auto epoch = mon.monmap->get_epoch();
7c673cae
FG
1422 dout(10) << __func__
1423 << " monmap next " << sub->next
1424 << " have " << epoch << dendl;
1425 if (sub->next <= epoch) {
f67539c2 1426 mon.send_latest_monmap(sub->session->con.get());
7c673cae 1427 if (sub->onetime) {
f67539c2 1428 mon.with_session_map([sub](MonSessionMap& session_map) {
7c673cae
FG
1429 session_map.remove_sub(sub);
1430 });
1431 } else {
1432 sub->next = epoch + 1;
1433 }
1434 }
1435}
11fdf7f2
TL
1436
1437void MonmapMonitor::tick()
1438{
1439 if (!is_active() ||
f67539c2 1440 !mon.is_leader()) {
11fdf7f2
TL
1441 return;
1442 }
1443
f67539c2 1444 if (mon.monmap->created.is_zero()) {
11fdf7f2
TL
1445 dout(10) << __func__ << " detected empty created stamp" << dendl;
1446 utime_t ctime;
1447 for (version_t v = 1; v <= get_last_committed(); v++) {
1448 bufferlist bl;
1449 int r = get_version(v, bl);
1450 if (r < 0) {
1451 continue;
1452 }
1453 MonMap m;
1454 auto p = bl.cbegin();
1455 decode(m, p);
1456 if (!m.last_changed.is_zero()) {
1457 dout(10) << __func__ << " first monmap with last_changed is "
1458 << v << " with " << m.last_changed << dendl;
1459 ctime = m.last_changed;
1460 break;
1461 }
1462 }
1463 if (ctime.is_zero()) {
1464 ctime = ceph_clock_now();
1465 }
1466 dout(10) << __func__ << " updating created stamp to " << ctime << dendl;
1467 pending_map.created = ctime;
1468 propose_pending();
1469 }
1470}