]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/MonmapMonitor.cc
import ceph quincy 17.2.6
[ceph.git] / ceph / src / mon / MonmapMonitor.cc
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2009 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15#include "MonmapMonitor.h"
16#include "Monitor.h"
f67539c2 17#include "OSDMonitor.h"
7c673cae
FG
18#include "messages/MMonCommand.h"
19#include "messages/MMonJoin.h"
20
21#include "common/ceph_argparse.h"
22#include "common/errno.h"
23#include <sstream>
24#include "common/config.h"
25#include "common/cmdparse.h"
26
11fdf7f2 27#include "include/ceph_assert.h"
7c673cae
FG
28#include "include/stringify.h"
29
30#define dout_subsys ceph_subsys_mon
31#undef dout_prefix
32#define dout_prefix _prefix(_dout, mon)
9f95a23c 33using namespace TOPNSPC::common;
f67539c2
TL
34
35using std::cout;
36using std::dec;
37using std::hex;
38using std::list;
39using std::map;
40using std::make_pair;
41using std::ostream;
42using std::ostringstream;
43using std::pair;
44using std::set;
45using std::setfill;
46using std::string;
47using std::stringstream;
48using std::to_string;
49using std::vector;
50using std::unique_ptr;
51
52using ceph::bufferlist;
53using ceph::decode;
54using ceph::encode;
55using ceph::Formatter;
56using ceph::JSONFormatter;
57using ceph::make_message;
58using ceph::mono_clock;
59using ceph::mono_time;
60using ceph::timespan_str;
61static ostream& _prefix(std::ostream *_dout, Monitor &mon) {
62 return *_dout << "mon." << mon.name << "@" << mon.rank
63 << "(" << mon.get_state_name()
64 << ").monmap v" << mon.monmap->epoch << " ";
7c673cae
FG
65}
66
67void MonmapMonitor::create_initial()
68{
224ce89b 69 dout(10) << __func__ << " using current monmap" << dendl;
f67539c2 70 pending_map = *mon.monmap;
7c673cae 71 pending_map.epoch = 1;
31f18b77 72
11fdf7f2 73 if (g_conf()->mon_debug_no_initial_persistent_features) {
31f18b77
FG
74 derr << __func__ << " mon_debug_no_initial_persistent_features=true"
75 << dendl;
76 } else {
77 // initialize with default persistent features for new clusters
78 pending_map.persistent_features = ceph::features::mon::get_persistent();
11fdf7f2 79 pending_map.min_mon_release = ceph_release();
31f18b77 80 }
7c673cae
FG
81}
82
83void MonmapMonitor::update_from_paxos(bool *need_bootstrap)
84{
85 version_t version = get_last_committed();
f67539c2 86 if (version <= mon.monmap->get_epoch())
7c673cae
FG
87 return;
88
89 dout(10) << __func__ << " version " << version
f67539c2 90 << ", my v " << mon.monmap->epoch << dendl;
7c673cae 91
f67539c2 92 if (need_bootstrap && version != mon.monmap->get_epoch()) {
7c673cae
FG
93 dout(10) << " signaling that we need a bootstrap" << dendl;
94 *need_bootstrap = true;
95 }
96
97 // read and decode
98 monmap_bl.clear();
99 int ret = get_version(version, monmap_bl);
11fdf7f2
TL
100 ceph_assert(ret == 0);
101 ceph_assert(monmap_bl.length());
7c673cae 102
224ce89b 103 dout(10) << __func__ << " got " << version << dendl;
f67539c2 104 mon.monmap->decode(monmap_bl);
7c673cae 105
f67539c2 106 if (mon.store->exists("mkfs", "monmap")) {
7c673cae
FG
107 auto t(std::make_shared<MonitorDBStore::Transaction>());
108 t->erase("mkfs", "monmap");
f67539c2 109 mon.store->apply_transaction(t);
7c673cae
FG
110 }
111
112 check_subs();
11fdf7f2
TL
113
114 // make sure we've recorded min_mon_release
115 string val;
f67539c2 116 if (mon.store->read_meta("min_mon_release", &val) < 0 ||
11fdf7f2
TL
117 val.size() == 0 ||
118 atoi(val.c_str()) != (int)ceph_release()) {
119 dout(10) << __func__ << " updating min_mon_release meta" << dendl;
f67539c2 120 mon.store->write_meta("min_mon_release",
11fdf7f2
TL
121 stringify(ceph_release()));
122 }
f67539c2 123
b3b6e05e 124 mon.notify_new_monmap(true);
7c673cae
FG
125}
126
127void MonmapMonitor::create_pending()
128{
f67539c2 129 pending_map = *mon.monmap;
7c673cae
FG
130 pending_map.epoch++;
131 pending_map.last_changed = ceph_clock_now();
39ae355f 132 pending_map.removed_ranks.clear();
7c673cae
FG
133}
134
135void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t)
136{
224ce89b 137 dout(10) << __func__ << " epoch " << pending_map.epoch << dendl;
7c673cae 138
f67539c2 139 ceph_assert(mon.monmap->epoch + 1 == pending_map.epoch ||
7c673cae
FG
140 pending_map.epoch == 1); // special case mkfs!
141 bufferlist bl;
f67539c2 142 pending_map.encode(bl, mon.get_quorum_con_features());
7c673cae
FG
143
144 put_version(t, pending_map.epoch, bl);
145 put_last_committed(t, pending_map.epoch);
146
147 // generate a cluster fingerprint, too?
148 if (pending_map.epoch == 1) {
f67539c2 149 mon.prepare_new_fingerprint(t);
7c673cae 150 }
f67539c2
TL
151
152 //health
153 health_check_map_t next;
154 pending_map.check_health(&next);
155 encode_health(next, t);
7c673cae
FG
156}
157
158class C_ApplyFeatures : public Context {
159 MonmapMonitor *svc;
160 mon_feature_t features;
9f95a23c 161 ceph_release_t min_mon_release;
11fdf7f2 162public:
9f95a23c 163 C_ApplyFeatures(MonmapMonitor *s, const mon_feature_t& f, ceph_release_t mmr) :
11fdf7f2 164 svc(s), features(f), min_mon_release(mmr) { }
7c673cae
FG
165 void finish(int r) override {
166 if (r >= 0) {
11fdf7f2 167 svc->apply_mon_features(features, min_mon_release);
7c673cae
FG
168 } else if (r == -EAGAIN || r == -ECANCELED) {
169 // discard features if we're no longer on the quorum that
170 // established them in the first place.
171 return;
172 } else {
11fdf7f2 173 ceph_abort_msg("bad C_ApplyFeatures return value");
7c673cae
FG
174 }
175 }
176};
177
11fdf7f2 178void MonmapMonitor::apply_mon_features(const mon_feature_t& features,
9f95a23c 179 ceph_release_t min_mon_release)
7c673cae
FG
180{
181 if (!is_writeable()) {
182 dout(5) << __func__ << " wait for service to be writeable" << dendl;
11fdf7f2 183 wait_for_writeable_ctx(new C_ApplyFeatures(this, features, min_mon_release));
7c673cae
FG
184 return;
185 }
186
11fdf7f2 187 // do nothing here unless we have a full quorum
f67539c2 188 if (mon.get_quorum().size() < mon.monmap->size()) {
11fdf7f2
TL
189 return;
190 }
191
192 ceph_assert(is_writeable());
193 ceph_assert(features.contains_all(pending_map.persistent_features));
7c673cae
FG
194 // we should never hit this because `features` should be the result
195 // of the quorum's supported features. But if it happens, die.
11fdf7f2 196 ceph_assert(ceph::features::mon::get_supported().contains_all(features));
7c673cae
FG
197
198 mon_feature_t new_features =
199 (pending_map.persistent_features ^
200 (features & ceph::features::mon::get_persistent()));
201
11fdf7f2
TL
202 if (new_features.empty() &&
203 pending_map.min_mon_release == min_mon_release) {
81eedcae 204 dout(10) << __func__ << " min_mon_release (" << (int)min_mon_release
11fdf7f2 205 << ") and features (" << features << ") match" << dendl;
7c673cae
FG
206 return;
207 }
208
11fdf7f2
TL
209 if (!new_features.empty()) {
210 dout(1) << __func__ << " applying new features "
211 << new_features << ", had " << pending_map.persistent_features
212 << ", will have "
213 << (new_features | pending_map.persistent_features)
214 << dendl;
215 pending_map.persistent_features |= new_features;
216 }
217 if (min_mon_release > pending_map.min_mon_release) {
218 dout(1) << __func__ << " increasing min_mon_release to "
f67539c2 219 << to_integer<int>(min_mon_release) << " (" << min_mon_release
11fdf7f2
TL
220 << ")" << dendl;
221 pending_map.min_mon_release = min_mon_release;
7c673cae
FG
222 }
223
7c673cae
FG
224 propose_pending();
225}
226
227void MonmapMonitor::on_active()
228{
f67539c2 229 if (get_last_committed() >= 1 && !mon.has_ever_joined) {
7c673cae
FG
230 // make note of the fact that i was, once, part of the quorum.
231 dout(10) << "noting that i was, once, part of an active quorum." << dendl;
232
233 /* This is some form of nasty in-breeding we have between the MonmapMonitor
234 and the Monitor itself. We should find a way to get rid of it given our
235 new architecture. Until then, stick with it since we are a
236 single-threaded process and, truth be told, no one else relies on this
237 thing besides us.
238 */
239 auto t(std::make_shared<MonitorDBStore::Transaction>());
240 t->put(Monitor::MONITOR_NAME, "joined", 1);
f67539c2
TL
241 mon.store->apply_transaction(t);
242 mon.has_ever_joined = true;
7c673cae
FG
243 }
244
f67539c2
TL
245 if (mon.is_leader()) {
246 mon.clog->debug() << "monmap " << *mon.monmap;
b32b8144 247 }
7c673cae 248
f67539c2
TL
249 apply_mon_features(mon.get_quorum_mon_features(),
250 mon.quorum_min_mon_release);
2a845540
TL
251
252 mon.update_pending_metadata();
7c673cae
FG
253}
254
255bool MonmapMonitor::preprocess_query(MonOpRequestRef op)
256{
9f95a23c 257 auto m = op->get_req<PaxosServiceMessage>();
7c673cae
FG
258 switch (m->get_type()) {
259 // READs
260 case MSG_MON_COMMAND:
f64942e4
AA
261 try {
262 return preprocess_command(op);
263 }
264 catch (const bad_cmd_get& e) {
265 bufferlist bl;
f67539c2 266 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
f64942e4
AA
267 return true;
268 }
7c673cae
FG
269 case MSG_MON_JOIN:
270 return preprocess_join(op);
271 default:
272 ceph_abort();
273 return true;
274 }
275}
276
277void MonmapMonitor::dump_info(Formatter *f)
278{
279 f->dump_unsigned("monmap_first_committed", get_first_committed());
280 f->dump_unsigned("monmap_last_committed", get_last_committed());
281 f->open_object_section("monmap");
f67539c2 282 mon.monmap->dump(f);
7c673cae
FG
283 f->close_section();
284 f->open_array_section("quorum");
f67539c2 285 for (set<int>::iterator q = mon.get_quorum().begin(); q != mon.get_quorum().end(); ++q)
7c673cae
FG
286 f->dump_int("mon", *q);
287 f->close_section();
288}
289
290bool MonmapMonitor::preprocess_command(MonOpRequestRef op)
291{
9f95a23c 292 auto m = op->get_req<MMonCommand>();
7c673cae
FG
293 int r = -1;
294 bufferlist rdata;
295 stringstream ss;
296
11fdf7f2 297 cmdmap_t cmdmap;
7c673cae
FG
298 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
299 string rs = ss.str();
f67539c2 300 mon.reply_command(op, -EINVAL, rs, rdata, get_last_committed());
7c673cae
FG
301 return true;
302 }
303
304 string prefix;
9f95a23c 305 cmd_getval(cmdmap, "prefix", prefix);
7c673cae 306
11fdf7f2 307 MonSession *session = op->get_session();
7c673cae 308 if (!session) {
f67539c2 309 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
7c673cae
FG
310 return true;
311 }
312
20effc67 313 string format = cmd_getval_or<string>(cmdmap, "format", "plain");
7c673cae
FG
314 boost::scoped_ptr<Formatter> f(Formatter::create(format));
315
316 if (prefix == "mon stat") {
f91f0fd5
TL
317 if (f) {
318 f->open_object_section("monmap");
f67539c2
TL
319 mon.monmap->dump_summary(f.get());
320 f->dump_string("leader", mon.get_leader_name());
f91f0fd5 321 f->open_array_section("quorum");
f67539c2
TL
322 for (auto rank: mon.get_quorum()) {
323 std::string name = mon.monmap->get_name(rank);
f91f0fd5
TL
324 f->open_object_section("mon");
325 f->dump_int("rank", rank);
326 f->dump_string("name", name);
327 f->close_section(); // mon
328 }
329 f->close_section(); // quorum
330 f->close_section(); // monmap
331 f->flush(ss);
332 } else {
f67539c2
TL
333 mon.monmap->print_summary(ss);
334 ss << ", election epoch " << mon.get_epoch() << ", leader "
335 << mon.get_leader() << " " << mon.get_leader_name()
336 << ", quorum " << mon.get_quorum()
337 << " " << mon.get_quorum_names();
f91f0fd5
TL
338 }
339
7c673cae
FG
340 rdata.append(ss);
341 ss.str("");
342 r = 0;
343
344 } else if (prefix == "mon getmap" ||
345 prefix == "mon dump") {
346
347 epoch_t epoch;
20effc67 348 int64_t epochnum = cmd_getval_or<int64_t>(cmdmap, "epoch", 0);
7c673cae
FG
349 epoch = epochnum;
350
f67539c2 351 MonMap *p = mon.monmap;
7c673cae
FG
352 if (epoch) {
353 bufferlist bl;
354 r = get_version(epoch, bl);
355 if (r == -ENOENT) {
356 ss << "there is no map for epoch " << epoch;
357 goto reply;
358 }
11fdf7f2
TL
359 ceph_assert(r == 0);
360 ceph_assert(bl.length() > 0);
7c673cae
FG
361 p = new MonMap;
362 p->decode(bl);
363 }
364
11fdf7f2 365 ceph_assert(p);
7c673cae
FG
366
367 if (prefix == "mon getmap") {
368 p->encode(rdata, m->get_connection()->get_features());
369 r = 0;
370 ss << "got monmap epoch " << p->get_epoch();
371 } else if (prefix == "mon dump") {
372 stringstream ds;
373 if (f) {
374 f->open_object_section("monmap");
375 p->dump(f.get());
376 f->open_array_section("quorum");
f67539c2
TL
377 for (set<int>::iterator q = mon.get_quorum().begin();
378 q != mon.get_quorum().end(); ++q) {
7c673cae
FG
379 f->dump_int("mon", *q);
380 }
381 f->close_section();
382 f->close_section();
383 f->flush(ds);
384 r = 0;
385 } else {
386 p->print(ds);
387 r = 0;
388 }
389 rdata.append(ds);
390 ss << "dumped monmap epoch " << p->get_epoch();
391 }
f67539c2 392 if (p != mon.monmap) {
7c673cae 393 delete p;
11fdf7f2
TL
394 p = nullptr;
395 }
7c673cae 396
224ce89b 397 } else if (prefix == "mon feature ls") {
7c673cae
FG
398
399 bool list_with_value = false;
20effc67 400 cmd_getval_compat_cephbool(cmdmap, "with_value", list_with_value);
7c673cae 401
f67539c2 402 MonMap *p = mon.monmap;
7c673cae
FG
403
404 // list features
405 mon_feature_t supported = ceph::features::mon::get_supported();
406 mon_feature_t persistent = ceph::features::mon::get_persistent();
407 mon_feature_t required = p->get_required_features();
408
409 stringstream ds;
410 auto print_feature = [&](mon_feature_t& m_features, const char* m_str) {
411 if (f) {
412 if (list_with_value)
413 m_features.dump_with_value(f.get(), m_str);
414 else
415 m_features.dump(f.get(), m_str);
416 } else {
417 if (list_with_value)
418 m_features.print_with_value(ds);
419 else
420 m_features.print(ds);
421 }
422 };
423
424 if (f) {
425 f->open_object_section("features");
426
427 f->open_object_section("all");
428 print_feature(supported, "supported");
429 print_feature(persistent, "persistent");
430 f->close_section(); // all
431
432 f->open_object_section("monmap");
433 print_feature(p->persistent_features, "persistent");
434 print_feature(p->optional_features, "optional");
435 print_feature(required, "required");
436 f->close_section(); // monmap
437
438 f->close_section(); // features
439 f->flush(ds);
440
441 } else {
442 ds << "all features" << std::endl
443 << "\tsupported: ";
444 print_feature(supported, nullptr);
445 ds << std::endl
446 << "\tpersistent: ";
447 print_feature(persistent, nullptr);
448 ds << std::endl
449 << std::endl;
450
451 ds << "on current monmap (epoch "
452 << p->get_epoch() << ")" << std::endl
453 << "\tpersistent: ";
454 print_feature(p->persistent_features, nullptr);
455 ds << std::endl
456 // omit optional features in plain-text
457 // makes it easier to read, and they're, currently, empty.
458 << "\trequired: ";
459 print_feature(required, nullptr);
460 ds << std::endl;
461 }
462 rdata.append(ds);
463 r = 0;
464 }
465
466reply:
467 if (r != -1) {
468 string rs;
469 getline(ss, rs);
470
f67539c2 471 mon.reply_command(op, r, rs, rdata, get_last_committed());
7c673cae
FG
472 return true;
473 } else
474 return false;
475}
476
477
478bool MonmapMonitor::prepare_update(MonOpRequestRef op)
479{
9f95a23c 480 auto m = op->get_req<PaxosServiceMessage>();
224ce89b 481 dout(7) << __func__ << " " << *m << " from " << m->get_orig_source_inst() << dendl;
7c673cae
FG
482
483 switch (m->get_type()) {
484 case MSG_MON_COMMAND:
f64942e4
AA
485 try {
486 return prepare_command(op);
11fdf7f2 487 } catch (const bad_cmd_get& e) {
f64942e4 488 bufferlist bl;
f67539c2 489 mon.reply_command(op, -EINVAL, e.what(), bl, get_last_committed());
f64942e4
AA
490 return true;
491 }
7c673cae
FG
492 case MSG_MON_JOIN:
493 return prepare_join(op);
494 default:
495 ceph_abort();
496 }
497
498 return false;
499}
500
501bool MonmapMonitor::prepare_command(MonOpRequestRef op)
502{
9f95a23c 503 auto m = op->get_req<MMonCommand>();
7c673cae
FG
504 stringstream ss;
505 string rs;
506 int err = -EINVAL;
507
11fdf7f2 508 cmdmap_t cmdmap;
7c673cae
FG
509 if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) {
510 string rs = ss.str();
f67539c2 511 mon.reply_command(op, -EINVAL, rs, get_last_committed());
7c673cae
FG
512 return true;
513 }
514
515 string prefix;
9f95a23c 516 cmd_getval(cmdmap, "prefix", prefix);
7c673cae 517
11fdf7f2 518 MonSession *session = op->get_session();
7c673cae 519 if (!session) {
f67539c2 520 mon.reply_command(op, -EACCES, "access denied", get_last_committed());
7c673cae
FG
521 return true;
522 }
523
524 /* We should follow the following rules:
525 *
526 * - 'monmap' is the current, consistent version of the monmap
527 * - 'pending_map' is the uncommitted version of the monmap
528 *
529 * All checks for the current state must be made against 'monmap'.
530 * All changes are made against 'pending_map'.
531 *
532 * If there are concurrent operations modifying 'pending_map', please
533 * follow the following rules.
534 *
535 * - if pending_map has already been changed, the second operation must
536 * wait for the proposal to finish and be run again; This is the easiest
537 * path to guarantee correctness but may impact performance (i.e., it
538 * will take longer for the user to get a reply).
539 *
540 * - if the result of the second operation can be guaranteed to be
541 * idempotent, the operation may reply to the user once the proposal
542 * finishes; still needs to wait for the proposal to finish.
543 *
544 * - An operation _NEVER_ returns to the user based on pending state.
545 *
546 * If an operation does not modify current stable monmap, it may be
547 * serialized before current pending map, regardless of any change that
548 * has been made to the pending map -- remember, pending is uncommitted
549 * state, thus we are not bound by it.
550 */
551
f67539c2
TL
552 ceph_assert(mon.monmap);
553 MonMap &monmap = *mon.monmap;
7c673cae
FG
554
555
556 /* Please note:
557 *
558 * Adding or removing monitors may lead to loss of quorum.
559 *
560 * Because quorum may be lost, it's important to reply something
561 * to the user, lest she end up waiting forever for a reply. And
562 * no reply will ever be sent until quorum is formed again.
563 *
564 * On the other hand, this means we're leaking uncommitted state
565 * to the user. As such, please be mindful of the reply message.
566 *
567 * e.g., 'adding monitor mon.foo' is okay ('adding' is an on-going
568 * operation and conveys its not-yet-permanent nature); whereas
569 * 'added monitor mon.foo' presumes the action has successfully
570 * completed and state has been committed, which may not be true.
571 */
572
573
574 bool propose = false;
575 if (prefix == "mon add") {
576 string name;
9f95a23c 577 cmd_getval(cmdmap, "name", name);
7c673cae 578 string addrstr;
9f95a23c 579 cmd_getval(cmdmap, "addr", addrstr);
7c673cae
FG
580 entity_addr_t addr;
581 bufferlist rdata;
582
20effc67 583 if (!addr.parse(addrstr)) {
7c673cae
FG
584 err = -EINVAL;
585 ss << "addr " << addrstr << "does not parse";
586 goto reply;
587 }
588
f67539c2
TL
589 vector<string> locationvec;
590 map<string, string> loc;
591 cmd_getval(cmdmap, "location", locationvec);
592 CrushWrapper::parse_loc_map(locationvec, &loc);
593 if (locationvec.size() &&
594 !mon.get_quorum_mon_features().contains_all(
595 ceph::features::mon::FEATURE_PINGING)) {
596 err = -ENOTSUP;
597 ss << "Not all monitors support adding monitors with a location; please upgrade first!";
598 goto reply;
599 }
600 if (locationvec.size() && !loc.size()) {
601 ss << "We could not parse your input location to anything real; " << locationvec
602 << " turned into an empty map!";
603 err = -EINVAL;
604 goto reply;
605 }
606
607 dout(10) << "mon add setting location for " << name << " to " << loc << dendl;
608
609 // TODO: validate location in crush map
610 if (monmap.stretch_mode_enabled && !loc.size()) {
611 ss << "We are in stretch mode and new monitors must have a location, but "
612 << "could not parse your input location to anything real; " << locationvec
613 << " turned into an empty map!";
614 err = -EINVAL;
615 goto reply;
616 }
617 // TODO: validate location against any existing stretch config
618
11fdf7f2
TL
619 entity_addrvec_t addrs;
620 if (monmap.persistent_features.contains_all(
621 ceph::features::mon::FEATURE_NAUTILUS)) {
622 if (addr.get_port() == CEPH_MON_PORT_IANA) {
623 addr.set_type(entity_addr_t::TYPE_MSGR2);
624 }
625 if (addr.get_port() == CEPH_MON_PORT_LEGACY) {
626 // if they specified the *old* default they probably don't care
627 addr.set_port(0);
628 }
629 if (addr.get_port()) {
630 addrs.v.push_back(addr);
631 } else {
632 addr.set_type(entity_addr_t::TYPE_MSGR2);
633 addr.set_port(CEPH_MON_PORT_IANA);
634 addrs.v.push_back(addr);
635 addr.set_type(entity_addr_t::TYPE_LEGACY);
636 addr.set_port(CEPH_MON_PORT_LEGACY);
637 addrs.v.push_back(addr);
638 }
639 } else {
640 if (addr.get_port() == 0) {
641 addr.set_port(CEPH_MON_PORT_LEGACY);
642 }
643 addr.set_type(entity_addr_t::TYPE_LEGACY);
644 addrs.v.push_back(addr);
7c673cae 645 }
11fdf7f2 646 dout(20) << __func__ << " addr " << addr << " -> addrs " << addrs << dendl;
7c673cae
FG
647
648 /**
649 * If we have a monitor with the same name and different addr, then EEXIST
650 * If we have a monitor with the same addr and different name, then EEXIST
651 * If we have a monitor with the same addr and same name, then wait for
652 * the proposal to finish and return success.
653 * If we don't have the monitor, add it.
654 */
655
656 err = 0;
657 if (!ss.str().empty())
658 ss << "; ";
659
660 do {
661 if (monmap.contains(name)) {
11fdf7f2 662 if (monmap.get_addrs(name) == addrs) {
7c673cae
FG
663 // stable map contains monitor with the same name at the same address.
664 // serialize before current pending map.
665 err = 0; // for clarity; this has already been set above.
11fdf7f2 666 ss << "mon." << name << " at " << addrs << " already exists";
7c673cae
FG
667 goto reply;
668 } else {
669 ss << "mon." << name
11fdf7f2 670 << " already exists at address " << monmap.get_addrs(name);
7c673cae 671 }
11fdf7f2 672 } else if (monmap.contains(addrs)) {
7c673cae 673 // we established on the previous branch that name is different
11fdf7f2 674 ss << "mon." << monmap.get_name(addrs)
7c673cae
FG
675 << " already exists at address " << addr;
676 } else {
677 // go ahead and add
678 break;
679 }
680 err = -EEXIST;
681 goto reply;
682 } while (false);
683
f67539c2
TL
684 if (pending_map.stretch_mode_enabled) {
685
686 }
687
7c673cae
FG
688 /* Given there's no delay between proposals on the MonmapMonitor (see
689 * MonmapMonitor::should_propose()), there is no point in checking for
690 * a mismatch between name and addr on pending_map.
691 *
692 * Once we established the monitor does not exist in the committed state,
693 * we can simply go ahead and add the monitor.
694 */
695
11fdf7f2 696 pending_map.add(name, addrs);
f67539c2 697 pending_map.mon_info[name].crush_loc = loc;
7c673cae 698 pending_map.last_changed = ceph_clock_now();
11fdf7f2 699 ss << "adding mon." << name << " at " << addrs;
7c673cae
FG
700 propose = true;
701 dout(0) << __func__ << " proposing new mon." << name << dendl;
702
703 } else if (prefix == "mon remove" ||
704 prefix == "mon rm") {
705 string name;
9f95a23c 706 cmd_getval(cmdmap, "name", name);
7c673cae
FG
707 if (!monmap.contains(name)) {
708 err = 0;
709 ss << "mon." << name << " does not exist or has already been removed";
710 goto reply;
711 }
712
713 if (monmap.size() == 1) {
714 err = -EINVAL;
715 ss << "error: refusing removal of last monitor " << name;
716 goto reply;
717 }
718
a4b75251
TL
719 if (pending_map.stretch_mode_enabled &&
720 name == pending_map.tiebreaker_mon) {
721 err = -EINVAL;
722 ss << "you cannot remove stretch mode's tiebreaker monitor";
723 goto reply;
724 }
7c673cae
FG
725 /* At the time of writing, there is no risk of races when multiple clients
726 * attempt to use the same name. The reason is simple but may not be
727 * obvious.
728 *
729 * In a nutshell, we do not collate proposals on the MonmapMonitor. As
730 * soon as we return 'true' below, PaxosService::dispatch() will check if
731 * the service should propose, and - if so - the service will be marked as
732 * 'proposing' and a proposal will be triggered. The PaxosService class
733 * guarantees that once a service is marked 'proposing' no further writes
734 * will be handled.
735 *
736 * The decision on whether the service should propose or not is, in this
737 * case, made by MonmapMonitor::should_propose(), which always considers
738 * the proposal delay being 0.0 seconds. This is key for PaxosService to
739 * trigger the proposal immediately.
740 * 0.0 seconds of delay.
741 *
742 * From the above, there's no point in performing further checks on the
743 * pending_map, as we don't ever have multiple proposals in-flight in
744 * this service. As we've established the committed state contains the
745 * monitor, we can simply go ahead and remove it.
746 *
747 * Please note that the code hinges on all of the above to be true. It
748 * has been true since time immemorial and we don't see a good reason
749 * to make it sturdier at this time - mainly because we don't think it's
750 * going to change any time soon, lest for any bug that may be unwillingly
751 * introduced.
752 */
753
11fdf7f2 754 entity_addrvec_t addrs = pending_map.get_addrs(name);
7c673cae 755 pending_map.remove(name);
a4b75251 756 pending_map.disallowed_leaders.erase(name);
7c673cae 757 pending_map.last_changed = ceph_clock_now();
7c673cae
FG
758 propose = true;
759 err = 0;
760
761 } else if (prefix == "mon feature set") {
762
763 /* PLEASE NOTE:
764 *
765 * We currently only support setting/unsetting persistent features.
766 * This is by design, given at the moment we still don't have optional
767 * features, and, as such, there is no point introducing an interface
768 * to manipulate them. This allows us to provide a cleaner, more
769 * intuitive interface to the user, modifying solely persistent
770 * features.
771 *
772 * In the future we should consider adding another interface to handle
773 * optional features/flags; e.g., 'mon feature flag set/unset', or
774 * 'mon flag set/unset'.
775 */
776 string feature_name;
9f95a23c 777 if (!cmd_getval(cmdmap, "feature_name", feature_name)) {
7c673cae
FG
778 ss << "missing required feature name";
779 err = -EINVAL;
780 goto reply;
781 }
782
783 mon_feature_t feature;
784 feature = ceph::features::mon::get_feature_by_name(feature_name);
785 if (feature == ceph::features::mon::FEATURE_NONE) {
786 ss << "unknown feature '" << feature_name << "'";
787 err = -ENOENT;
788 goto reply;
789 }
790
11fdf7f2 791 bool sure = false;
9f95a23c 792 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
11fdf7f2 793 if (!sure) {
7c673cae
FG
794 ss << "please specify '--yes-i-really-mean-it' if you "
795 << "really, **really** want to set feature '"
796 << feature << "' in the monmap.";
797 err = -EPERM;
798 goto reply;
799 }
800
f67539c2 801 if (!mon.get_quorum_mon_features().contains_all(feature)) {
7c673cae
FG
802 ss << "current quorum does not support feature '" << feature
803 << "'; supported features: "
f67539c2 804 << mon.get_quorum_mon_features();
7c673cae
FG
805 err = -EINVAL;
806 goto reply;
807 }
808
809 ss << "setting feature '" << feature << "'";
810
811 err = 0;
812 if (monmap.persistent_features.contains_all(feature)) {
813 dout(10) << __func__ << " feature '" << feature
814 << "' already set on monmap; no-op." << dendl;
815 goto reply;
816 }
817
818 pending_map.persistent_features.set_feature(feature);
819 pending_map.last_changed = ceph_clock_now();
820 propose = true;
821
11fdf7f2 822 dout(1) << __func__ << " " << ss.str() << "; new features will be: "
7c673cae
FG
823 << "persistent = " << pending_map.persistent_features
824 // output optional nevertheless, for auditing purposes.
825 << ", optional = " << pending_map.optional_features << dendl;
11fdf7f2
TL
826
827 } else if (prefix == "mon set-rank") {
828 string name;
829 int64_t rank;
9f95a23c
TL
830 if (!cmd_getval(cmdmap, "name", name) ||
831 !cmd_getval(cmdmap, "rank", rank)) {
11fdf7f2
TL
832 err = -EINVAL;
833 goto reply;
834 }
835 int oldrank = pending_map.get_rank(name);
836 if (oldrank < 0) {
837 ss << "mon." << name << " does not exist in monmap";
838 err = -ENOENT;
839 goto reply;
840 }
841 err = 0;
842 pending_map.set_rank(name, rank);
843 pending_map.last_changed = ceph_clock_now();
844 propose = true;
845 } else if (prefix == "mon set-addrs") {
846 string name;
847 string addrs;
9f95a23c
TL
848 if (!cmd_getval(cmdmap, "name", name) ||
849 !cmd_getval(cmdmap, "addrs", addrs)) {
11fdf7f2
TL
850 err = -EINVAL;
851 goto reply;
852 }
853 if (!pending_map.contains(name)) {
854 ss << "mon." << name << " does not exist";
855 err = -ENOENT;
856 goto reply;
857 }
858 entity_addrvec_t av;
859 if (!av.parse(addrs.c_str(), nullptr)) {
860 ss << "failed to parse addrs '" << addrs << "'";
861 err = -EINVAL;
862 goto reply;
863 }
864 for (auto& a : av.v) {
865 a.set_nonce(0);
866 if (!a.get_port()) {
867 ss << "monitor must bind to a non-zero port, not " << a;
868 err = -EINVAL;
869 goto reply;
870 }
871 }
872 err = 0;
873 pending_map.set_addrvec(name, av);
874 pending_map.last_changed = ceph_clock_now();
875 propose = true;
9f95a23c
TL
876 } else if (prefix == "mon set-weight") {
877 string name;
878 int64_t weight;
879 if (!cmd_getval(cmdmap, "name", name) ||
880 !cmd_getval(cmdmap, "weight", weight)) {
881 err = -EINVAL;
882 goto reply;
883 }
884 if (!pending_map.contains(name)) {
885 ss << "mon." << name << " does not exist";
886 err = -ENOENT;
887 goto reply;
888 }
889 err = 0;
890 pending_map.set_weight(name, weight);
891 pending_map.last_changed = ceph_clock_now();
892 propose = true;
11fdf7f2
TL
893 } else if (prefix == "mon enable-msgr2") {
894 if (!monmap.get_required_features().contains_all(
895 ceph::features::mon::FEATURE_NAUTILUS)) {
896 err = -EACCES;
897 ss << "all monitors must be running nautilus to enable v2";
898 goto reply;
899 }
900 for (auto& i : pending_map.mon_info) {
901 if (i.second.public_addrs.v.size() == 1 &&
902 i.second.public_addrs.front().is_legacy() &&
903 i.second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) {
904 entity_addrvec_t av;
905 entity_addr_t a = i.second.public_addrs.front();
906 a.set_type(entity_addr_t::TYPE_MSGR2);
907 a.set_port(CEPH_MON_PORT_IANA);
908 av.v.push_back(a);
909 av.v.push_back(i.second.public_addrs.front());
910 dout(10) << " setting mon." << i.first
911 << " addrs " << i.second.public_addrs
912 << " -> " << av << dendl;
913 pending_map.set_addrvec(i.first, av);
914 propose = true;
915 pending_map.last_changed = ceph_clock_now();
916 }
917 }
918 err = 0;
f67539c2
TL
919 } else if (prefix == "mon set election_strategy") {
920 if (!mon.get_quorum_mon_features().contains_all(
921 ceph::features::mon::FEATURE_PINGING)) {
922 err = -ENOTSUP;
923 ss << "Not all monitors support changing election strategies; please upgrade first!";
924 goto reply;
925 }
926 string strat;
927 MonMap::election_strategy strategy;
928 if (!cmd_getval(cmdmap, "strategy", strat)) {
929 err = -EINVAL;
930 goto reply;
931 }
932 if (strat == "classic") {
933 strategy = MonMap::CLASSIC;
934 } else if (strat == "disallow") {
935 strategy = MonMap::DISALLOW;
936 } else if (strat == "connectivity") {
937 strategy = MonMap::CONNECTIVITY;
938 } else {
939 err = -EINVAL;
940 goto reply;
941 }
942 err = 0;
943 pending_map.strategy = strategy;
a4b75251 944 pending_map.last_changed = ceph_clock_now();
f67539c2
TL
945 propose = true;
946 } else if (prefix == "mon add disallowed_leader") {
947 if (!mon.get_quorum_mon_features().contains_all(
948 ceph::features::mon::FEATURE_PINGING)) {
949 err = -ENOTSUP;
950 ss << "Not all monitors support changing election strategies; please upgrade first!";
951 goto reply;
952 }
953 string name;
954 if (!cmd_getval(cmdmap, "name", name)) {
955 err = -EINVAL;
956 goto reply;
957 }
958 if (pending_map.strategy != MonMap::DISALLOW &&
959 pending_map.strategy != MonMap::CONNECTIVITY) {
960 ss << "You cannot disallow monitors in your current election mode";
961 err = -EINVAL;
962 goto reply;
963 }
964 if (!pending_map.contains(name)) {
965 ss << "mon." << name << " does not exist";
966 err = -ENOENT;
967 goto reply;
968 }
969 if (pending_map.disallowed_leaders.count(name)) {
970 ss << "mon." << name << " is already disallowed";
971 err = 0;
972 goto reply;
973 }
974 if (pending_map.disallowed_leaders.size() == pending_map.size() - 1) {
975 ss << "mon." << name << " is the only remaining allowed leader!";
976 err = -EINVAL;
977 goto reply;
978 }
979 pending_map.disallowed_leaders.insert(name);
a4b75251 980 pending_map.last_changed = ceph_clock_now();
f67539c2
TL
981 err = 0;
982 propose = true;
983 } else if (prefix == "mon rm disallowed_leader") {
984 if (!mon.get_quorum_mon_features().contains_all(
985 ceph::features::mon::FEATURE_PINGING)) {
986 err = -ENOTSUP;
987 ss << "Not all monitors support changing election strategies; please upgrade first!";
988 goto reply;
989 }
990 string name;
991 if (!cmd_getval(cmdmap, "name", name)) {
992 err = -EINVAL;
993 goto reply;
994 }
995 if (pending_map.strategy != MonMap::DISALLOW &&
996 pending_map.strategy != MonMap::CONNECTIVITY) {
997 ss << "You cannot disallow monitors in your current election mode";
998 err = -EINVAL;
999 goto reply;
1000 }
1001 if (!pending_map.contains(name)) {
1002 ss << "mon." << name << " does not exist";
1003 err = -ENOENT;
1004 goto reply;
1005 }
1006 if (!pending_map.disallowed_leaders.count(name)) {
1007 ss << "mon." << name << " is already allowed";
1008 err = 0;
1009 goto reply;
1010 }
1011 pending_map.disallowed_leaders.erase(name);
a4b75251 1012 pending_map.last_changed = ceph_clock_now();
f67539c2
TL
1013 err = 0;
1014 propose = true;
1015 } else if (prefix == "mon set_location") {
1016 if (!mon.get_quorum_mon_features().contains_all(
1017 ceph::features::mon::FEATURE_PINGING)) {
1018 err = -ENOTSUP;
1019 ss << "Not all monitors support monitor locations; please upgrade first!";
1020 goto reply;
1021 }
1022 string name;
1023 if (!cmd_getval(cmdmap, "name", name)) {
1024 err = -EINVAL;
1025 goto reply;
1026 }
1027 if (!pending_map.contains(name)) {
1028 ss << "mon." << name << " does not exist";
1029 err = -ENOENT;
1030 goto reply;
1031 }
1032
f67539c2
TL
1033 vector<string> argvec;
1034 map<string, string> loc;
1035 cmd_getval(cmdmap, "args", argvec);
1036 CrushWrapper::parse_loc_map(argvec, &loc);
1037
1038 dout(10) << "mon set_location for " << name << " to " << loc << dendl;
1039
1040 // TODO: validate location in crush map
1041 if (!loc.size()) {
1042 ss << "We could not parse your input location to anything real; " << argvec
1043 << " turned into an empty map!";
1044 err = -EINVAL;
1045 goto reply;
1046 }
1047 // TODO: validate location against any existing stretch config
1048 pending_map.mon_info[name].crush_loc = loc;
a4b75251
TL
1049 pending_map.last_changed = ceph_clock_now();
1050 err = 0;
1051 propose = true;
1052 } else if (prefix == "mon set_new_tiebreaker") {
1053 if (!pending_map.stretch_mode_enabled) {
1054 err = -EINVAL;
1055 ss << "Stretch mode is not enabled, so there is no tiebreaker";
1056 goto reply;
1057 }
1058 string name;
1059 if (!cmd_getval(cmdmap, "name", name)) {
1060 err = -EINVAL;
1061 goto reply;
1062 }
1063 bool sure = false;
1064 cmd_getval(cmdmap, "yes_i_really_mean_it", sure);
1065
1066 const auto &existing_tiebreaker_info_i = pending_map.mon_info.find(pending_map.tiebreaker_mon);
1067 const auto &new_tiebreaker_info_i = pending_map.mon_info.find(name);
1068 if (new_tiebreaker_info_i == pending_map.mon_info.end()) {
1069 ss << "mon." << name << " does not exist";
1070 err = -ENOENT;
1071 goto reply;
1072 }
1073 const auto& new_info = new_tiebreaker_info_i->second;
1074 if (new_info.crush_loc.empty()) {
1075 ss << "mon." << name << " does not have a location specified";
1076 err = -EINVAL;
1077 goto reply;
1078 }
1079
1080 if (!mon.osdmon()->is_readable()) {
1081 dout(10) << __func__
1082 << ": waiting for osdmon readable to inspect crush barrier"
1083 << dendl;
1084 mon.osdmon()->wait_for_readable(op, new Monitor::C_RetryMessage(&mon, op));
1085 return false;
1086 }
1087 int32_t stretch_divider_id = mon.osdmon()->osdmap.stretch_mode_bucket;
1088 string stretch_bucket_divider = mon.osdmon()->osdmap.crush->
1089 get_type_name(stretch_divider_id);
1090
1091 const auto& new_loc_i = new_info.crush_loc.find(stretch_bucket_divider);
1092 if (new_loc_i == new_info.crush_loc.end()) {
1093 ss << "mon." << name << " has a specificed location, but not a "
1094 << stretch_bucket_divider << ", which is the stretch divider";
1095 err = -EINVAL;
1096 goto reply;
1097 }
1098 const string& new_loc = new_loc_i->second;
1099 set<string> matching_mons;
1100 for (const auto& mii : pending_map.mon_info) {
1101 const auto& other_loc_i = mii.second.crush_loc.find(stretch_bucket_divider);
1102 if (mii.first == name) {
1103 continue;
1104 }
1105 if (other_loc_i == mii.second.crush_loc.end()) { // huh
1106 continue;
1107 }
1108 const string& other_loc = other_loc_i->second;
1109 if (other_loc == new_loc &&
1110 mii.first != existing_tiebreaker_info_i->first) {
1111 matching_mons.insert(mii.first);
1112 }
1113 }
1114 if (!matching_mons.empty()) {
1115 ss << "mon." << name << " has location " << new_loc_i->second
1116 << ", which matches mons " << matching_mons << " on the "
1117 << stretch_bucket_divider << " dividing bucket for stretch mode. "
1118 "Pass --yes-i-really-mean-it if you're sure you want to do this."
1119 "(You really don't.)";
1120 err = -EINVAL;
1121 goto reply;
1122 }
1123 pending_map.tiebreaker_mon = name;
1124 pending_map.disallowed_leaders.insert(name);
1125 pending_map.last_changed = ceph_clock_now();
f67539c2
TL
1126 err = 0;
1127 propose = true;
1128 } else if (prefix == "mon enable_stretch_mode") {
1129 if (!mon.osdmon()->is_writeable()) {
a4b75251 1130 dout(10) << __func__
f67539c2
TL
1131 << ": waiting for osdmon writeable for stretch mode" << dendl;
1132 mon.osdmon()->wait_for_writeable(op, new Monitor::C_RetryMessage(&mon, op));
1133 return false;
1134 }
1135 {
1136 if (monmap.stretch_mode_enabled) {
1137 ss << "stretch mode is already engaged";
1138 err = -EINVAL;
1139 goto reply;
1140 }
1141 if (pending_map.stretch_mode_enabled) {
1142 ss << "stretch mode currently committing";
1143 err = 0;
1144 goto reply;
1145 }
1146 string tiebreaker_mon;
1147 if (!cmd_getval(cmdmap, "tiebreaker_mon", tiebreaker_mon)) {
1148 ss << "must specify a tiebreaker monitor";
1149 err = -EINVAL;
1150 goto reply;
1151 }
1152 string new_crush_rule;
1153 if (!cmd_getval(cmdmap, "new_crush_rule", new_crush_rule)) {
1154 ss << "must specify a new crush rule that spreads out copies over multiple sites";
1155 err = -EINVAL;
1156 goto reply;
1157 }
1158 string dividing_bucket;
1159 if (!cmd_getval(cmdmap, "dividing_bucket", dividing_bucket)) {
1160 ss << "must specify a dividing bucket";
1161 err = -EINVAL;
1162 goto reply;
1163 }
1164 //okay, initial arguments make sense, check pools and cluster state
1165 err = mon.osdmon()->check_cluster_features(CEPH_FEATUREMASK_STRETCH_MODE, ss);
1166 if (err)
1167 goto reply;
1168 struct Plugger {
1169 Paxos &p;
1170 Plugger(Paxos &p) : p(p) { p.plug(); }
1171 ~Plugger() { p.unplug(); }
1172 } plugger(paxos);
1173
1174 set<pg_pool_t*> pools;
1175 bool okay = false;
1176 int errcode = 0;
1177
1178 mon.osdmon()->try_enable_stretch_mode_pools(ss, &okay, &errcode,
1179 &pools, new_crush_rule);
1180 if (!okay) {
1181 err = errcode;
1182 goto reply;
1183 }
1184 try_enable_stretch_mode(ss, &okay, &errcode, false,
1185 tiebreaker_mon, dividing_bucket);
1186 if (!okay) {
1187 err = errcode;
1188 goto reply;
1189 }
1190 mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, false,
1191 dividing_bucket, 2, pools, new_crush_rule);
1192 if (!okay) {
1193 err = errcode;
1194 goto reply;
1195 }
1196 // everything looks good, actually commit the changes!
1197 try_enable_stretch_mode(ss, &okay, &errcode, true,
1198 tiebreaker_mon, dividing_bucket);
1199 mon.osdmon()->try_enable_stretch_mode(ss, &okay, &errcode, true,
1200 dividing_bucket,
1201 2, // right now we only support 2 sites
1202 pools, new_crush_rule);
1203 ceph_assert(okay == true);
1204 }
1205 request_proposal(mon.osdmon());
1206 err = 0;
1207 propose = true;
7c673cae
FG
1208 } else {
1209 ss << "unknown command " << prefix;
1210 err = -EINVAL;
1211 }
1212
1213reply:
1214 getline(ss, rs);
f67539c2 1215 mon.reply_command(op, err, rs, get_last_committed());
7c673cae
FG
1216 // we are returning to the user; do not propose.
1217 return propose;
1218}
1219
f67539c2
TL
1220void MonmapMonitor::try_enable_stretch_mode(stringstream& ss, bool *okay,
1221 int *errcode, bool commit,
1222 const string& tiebreaker_mon,
1223 const string& dividing_bucket)
1224{
1225 dout(20) << __func__ << dendl;
1226 *okay = false;
1227 if (pending_map.strategy != MonMap::CONNECTIVITY) {
1228 ss << "Monitors must use the connectivity strategy to enable stretch mode";
1229 *errcode = -EINVAL;
1230 ceph_assert(!commit);
1231 return;
1232 }
1233 if (!pending_map.contains(tiebreaker_mon)) {
1234 ss << "mon " << tiebreaker_mon << "does not seem to exist";
1235 *errcode = -ENOENT;
1236 ceph_assert(!commit);
1237 return;
1238 }
1239 map<string,string> buckets;
1240 for (const auto&mii : mon.monmap->mon_info) {
1241 const auto& mi = mii.second;
1242 const auto& bi = mi.crush_loc.find(dividing_bucket);
1243 if (bi == mi.crush_loc.end()) {
1244 ss << "Could not find location entry for " << dividing_bucket
1245 << " on monitor " << mi.name;
1246 *errcode = -EINVAL;
1247 ceph_assert(!commit);
1248 return;
1249 }
1250 buckets[mii.first] = bi->second;
1251 }
1252 string bucket1, bucket2, tiebreaker_bucket;
1253 for (auto& i : buckets) {
1254 if (i.first == tiebreaker_mon) {
1255 tiebreaker_bucket = i.second;
1256 continue;
1257 }
1258 if (bucket1.empty()) {
1259 bucket1 = i.second;
1260 }
1261 if (bucket1 != i.second &&
1262 bucket2.empty()) {
1263 bucket2 = i.second;
1264 }
1265 if (bucket1 != i.second &&
1266 bucket2 != i.second) {
1267 ss << "There are too many monitor buckets for stretch mode, found "
1268 << bucket1 << "," << bucket2 << "," << i.second;
1269 *errcode = -EINVAL;
1270 ceph_assert(!commit);
1271 return;
1272 }
1273 }
1274 if (bucket1.empty() || bucket2.empty()) {
1275 ss << "There are not enough monitor buckets for stretch mode;"
1276 << " must have at least 2 plus the tiebreaker but only found "
1277 << (bucket1.empty() ? bucket1 : bucket2);
1278 *errcode = -EINVAL;
1279 ceph_assert(!commit);
1280 return;
1281 }
1282 if (tiebreaker_bucket == bucket1 ||
1283 tiebreaker_bucket == bucket2) {
1284 ss << "The named tiebreaker monitor " << tiebreaker_mon
1285 << " is in the same CRUSH bucket " << tiebreaker_bucket
1286 << " as other monitors";
1287 *errcode = -EINVAL;
1288 ceph_assert(!commit);
1289 return;
1290 }
1291 if (commit) {
1292 pending_map.disallowed_leaders.insert(tiebreaker_mon);
1293 pending_map.tiebreaker_mon = tiebreaker_mon;
1294 pending_map.stretch_mode_enabled = true;
1295 }
1296 *okay = true;
1297}
1298
1299void MonmapMonitor::trigger_degraded_stretch_mode(const set<string>& dead_mons)
1300{
1301 dout(20) << __func__ << dendl;
1302 pending_map.stretch_marked_down_mons.insert(dead_mons.begin(), dead_mons.end());
1303 propose_pending();
1304}
1305
1306void MonmapMonitor::trigger_healthy_stretch_mode()
1307{
1308 dout(20) << __func__ << dendl;
1309 pending_map.stretch_marked_down_mons.clear();
1310 propose_pending();
1311}
1312
7c673cae
FG
1313bool MonmapMonitor::preprocess_join(MonOpRequestRef op)
1314{
9f95a23c 1315 auto join = op->get_req<MMonJoin>();
11fdf7f2 1316 dout(10) << __func__ << " " << join->name << " at " << join->addrs << dendl;
7c673cae 1317
11fdf7f2 1318 MonSession *session = op->get_session();
7c673cae
FG
1319 if (!session ||
1320 !session->is_capable("mon", MON_CAP_W | MON_CAP_X)) {
1321 dout(10) << " insufficient caps" << dendl;
1322 return true;
1323 }
1324
b3b6e05e
TL
1325 const auto name_info_i = pending_map.mon_info.find(join->name);
1326 if (name_info_i != pending_map.mon_info.end() &&
1327 !name_info_i->second.public_addrs.front().is_blank_ip() &&
1328 (!join->force_loc || join->crush_loc == name_info_i->second.crush_loc)) {
7c673cae
FG
1329 dout(10) << " already have " << join->name << dendl;
1330 return true;
1331 }
b3b6e05e
TL
1332 string addr_name;
1333 if (pending_map.contains(join->addrs)) {
1334 addr_name = pending_map.get_name(join->addrs);
1335 }
1336 if (!addr_name.empty() &&
1337 addr_name == join->name &&
1338 (!join->force_loc || join->crush_loc.empty() ||
1339 pending_map.mon_info[addr_name].crush_loc == join->crush_loc)) {
11fdf7f2 1340 dout(10) << " already have " << join->addrs << dendl;
7c673cae
FG
1341 return true;
1342 }
b3b6e05e
TL
1343 if (pending_map.stretch_mode_enabled &&
1344 join->crush_loc.empty() &&
1345 (addr_name.empty() ||
1346 pending_map.mon_info[addr_name].crush_loc.empty())) {
1347 dout(10) << "stretch mode engaged but no source of crush_loc" << dendl;
1348 mon.clog->info() << join->name << " attempted to join from " << join->name
1349 << ' ' << join->addrs
1350 << "; but lacks a crush_location for stretch mode";
1351 return true;
1352 }
7c673cae
FG
1353 return false;
1354}
b3b6e05e 1355
7c673cae
FG
1356bool MonmapMonitor::prepare_join(MonOpRequestRef op)
1357{
9f95a23c 1358 auto join = op->get_req<MMonJoin>();
11fdf7f2
TL
1359 dout(0) << "adding/updating " << join->name
1360 << " at " << join->addrs << " to monitor cluster" << dendl;
f67539c2
TL
1361 map<string,string> existing_loc;
1362 if (pending_map.contains(join->addrs)) {
1363 string name = pending_map.get_name(join->addrs);
1364 existing_loc = pending_map.mon_info[name].crush_loc;
1365 pending_map.remove(name);
1366 }
7c673cae
FG
1367 if (pending_map.contains(join->name))
1368 pending_map.remove(join->name);
11fdf7f2 1369 pending_map.add(join->name, join->addrs);
b3b6e05e
TL
1370 pending_map.mon_info[join->name].crush_loc =
1371 ((join->force_loc || existing_loc.empty()) ?
1372 join->crush_loc : existing_loc);
7c673cae
FG
1373 pending_map.last_changed = ceph_clock_now();
1374 return true;
1375}
1376
1377bool MonmapMonitor::should_propose(double& delay)
1378{
1379 delay = 0.0;
1380 return true;
1381}
1382
7c673cae
FG
1383int MonmapMonitor::get_monmap(bufferlist &bl)
1384{
1385 version_t latest_ver = get_last_committed();
1386 dout(10) << __func__ << " ver " << latest_ver << dendl;
1387
f67539c2 1388 if (!mon.store->exists(get_service_name(), stringify(latest_ver)))
7c673cae
FG
1389 return -ENOENT;
1390
1391 int err = get_version(latest_ver, bl);
1392 if (err < 0) {
1393 dout(1) << __func__ << " error obtaining monmap: "
1394 << cpp_strerror(err) << dendl;
1395 return err;
1396 }
1397 return 0;
1398}
1399
1400void MonmapMonitor::check_subs()
1401{
1402 const string type = "monmap";
f67539c2 1403 mon.with_session_map([this, &type](const MonSessionMap& session_map) {
7c673cae
FG
1404 auto subs = session_map.subs.find(type);
1405 if (subs == session_map.subs.end())
1406 return;
1407 for (auto sub : *subs->second) {
1408 check_sub(sub);
1409 }
1410 });
1411}
1412
1413void MonmapMonitor::check_sub(Subscription *sub)
1414{
f67539c2 1415 const auto epoch = mon.monmap->get_epoch();
7c673cae
FG
1416 dout(10) << __func__
1417 << " monmap next " << sub->next
1418 << " have " << epoch << dendl;
1419 if (sub->next <= epoch) {
f67539c2 1420 mon.send_latest_monmap(sub->session->con.get());
7c673cae 1421 if (sub->onetime) {
f67539c2 1422 mon.with_session_map([sub](MonSessionMap& session_map) {
7c673cae
FG
1423 session_map.remove_sub(sub);
1424 });
1425 } else {
1426 sub->next = epoch + 1;
1427 }
1428 }
1429}
11fdf7f2
TL
1430
1431void MonmapMonitor::tick()
1432{
1433 if (!is_active() ||
f67539c2 1434 !mon.is_leader()) {
11fdf7f2
TL
1435 return;
1436 }
1437
f67539c2 1438 if (mon.monmap->created.is_zero()) {
11fdf7f2
TL
1439 dout(10) << __func__ << " detected empty created stamp" << dendl;
1440 utime_t ctime;
1441 for (version_t v = 1; v <= get_last_committed(); v++) {
1442 bufferlist bl;
1443 int r = get_version(v, bl);
1444 if (r < 0) {
1445 continue;
1446 }
1447 MonMap m;
1448 auto p = bl.cbegin();
1449 decode(m, p);
1450 if (!m.last_changed.is_zero()) {
1451 dout(10) << __func__ << " first monmap with last_changed is "
1452 << v << " with " << m.last_changed << dendl;
1453 ctime = m.last_changed;
1454 break;
1455 }
1456 }
1457 if (ctime.is_zero()) {
1458 ctime = ceph_clock_now();
1459 }
1460 dout(10) << __func__ << " updating created stamp to " << ctime << dendl;
1461 pending_map.created = ctime;
1462 propose_pending();
1463 }
1464}