]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
3 | /* | |
4 | * Ceph - scalable distributed file system | |
5 | * | |
6 | * Copyright (C) 2009 Sage Weil <sage@newdream.net> | |
7 | * | |
8 | * This is free software; you can redistribute it and/or | |
9 | * modify it under the terms of the GNU Lesser General Public | |
10 | * License version 2.1, as published by the Free Software | |
11 | * Foundation. See file COPYING. | |
12 | * | |
13 | */ | |
14 | ||
15 | #include "MonmapMonitor.h" | |
16 | #include "Monitor.h" | |
17 | #include "messages/MMonCommand.h" | |
18 | #include "messages/MMonJoin.h" | |
19 | ||
20 | #include "common/ceph_argparse.h" | |
21 | #include "common/errno.h" | |
22 | #include <sstream> | |
23 | #include "common/config.h" | |
24 | #include "common/cmdparse.h" | |
25 | ||
11fdf7f2 | 26 | #include "include/ceph_assert.h" |
7c673cae FG |
27 | #include "include/stringify.h" |
28 | ||
29 | #define dout_subsys ceph_subsys_mon | |
30 | #undef dout_prefix | |
31 | #define dout_prefix _prefix(_dout, mon) | |
9f95a23c | 32 | using namespace TOPNSPC::common; |
7c673cae FG |
33 | static ostream& _prefix(std::ostream *_dout, Monitor *mon) { |
34 | return *_dout << "mon." << mon->name << "@" << mon->rank | |
35 | << "(" << mon->get_state_name() | |
36 | << ").monmap v" << mon->monmap->epoch << " "; | |
37 | } | |
38 | ||
39 | void MonmapMonitor::create_initial() | |
40 | { | |
224ce89b | 41 | dout(10) << __func__ << " using current monmap" << dendl; |
7c673cae FG |
42 | pending_map = *mon->monmap; |
43 | pending_map.epoch = 1; | |
31f18b77 | 44 | |
11fdf7f2 | 45 | if (g_conf()->mon_debug_no_initial_persistent_features) { |
31f18b77 FG |
46 | derr << __func__ << " mon_debug_no_initial_persistent_features=true" |
47 | << dendl; | |
48 | } else { | |
49 | // initialize with default persistent features for new clusters | |
50 | pending_map.persistent_features = ceph::features::mon::get_persistent(); | |
11fdf7f2 | 51 | pending_map.min_mon_release = ceph_release(); |
31f18b77 | 52 | } |
7c673cae FG |
53 | } |
54 | ||
55 | void MonmapMonitor::update_from_paxos(bool *need_bootstrap) | |
56 | { | |
57 | version_t version = get_last_committed(); | |
58 | if (version <= mon->monmap->get_epoch()) | |
59 | return; | |
60 | ||
61 | dout(10) << __func__ << " version " << version | |
62 | << ", my v " << mon->monmap->epoch << dendl; | |
63 | ||
64 | if (need_bootstrap && version != mon->monmap->get_epoch()) { | |
65 | dout(10) << " signaling that we need a bootstrap" << dendl; | |
66 | *need_bootstrap = true; | |
67 | } | |
68 | ||
69 | // read and decode | |
70 | monmap_bl.clear(); | |
71 | int ret = get_version(version, monmap_bl); | |
11fdf7f2 TL |
72 | ceph_assert(ret == 0); |
73 | ceph_assert(monmap_bl.length()); | |
7c673cae | 74 | |
224ce89b | 75 | dout(10) << __func__ << " got " << version << dendl; |
7c673cae FG |
76 | mon->monmap->decode(monmap_bl); |
77 | ||
78 | if (mon->store->exists("mkfs", "monmap")) { | |
79 | auto t(std::make_shared<MonitorDBStore::Transaction>()); | |
80 | t->erase("mkfs", "monmap"); | |
81 | mon->store->apply_transaction(t); | |
82 | } | |
83 | ||
84 | check_subs(); | |
11fdf7f2 TL |
85 | |
86 | // make sure we've recorded min_mon_release | |
87 | string val; | |
88 | if (mon->store->read_meta("min_mon_release", &val) < 0 || | |
89 | val.size() == 0 || | |
90 | atoi(val.c_str()) != (int)ceph_release()) { | |
91 | dout(10) << __func__ << " updating min_mon_release meta" << dendl; | |
92 | mon->store->write_meta("min_mon_release", | |
93 | stringify(ceph_release())); | |
94 | } | |
7c673cae FG |
95 | } |
96 | ||
97 | void MonmapMonitor::create_pending() | |
98 | { | |
99 | pending_map = *mon->monmap; | |
100 | pending_map.epoch++; | |
101 | pending_map.last_changed = ceph_clock_now(); | |
224ce89b | 102 | dout(10) << __func__ << " monmap epoch " << pending_map.epoch << dendl; |
7c673cae FG |
103 | } |
104 | ||
105 | void MonmapMonitor::encode_pending(MonitorDBStore::TransactionRef t) | |
106 | { | |
224ce89b | 107 | dout(10) << __func__ << " epoch " << pending_map.epoch << dendl; |
7c673cae | 108 | |
11fdf7f2 | 109 | ceph_assert(mon->monmap->epoch + 1 == pending_map.epoch || |
7c673cae FG |
110 | pending_map.epoch == 1); // special case mkfs! |
111 | bufferlist bl; | |
112 | pending_map.encode(bl, mon->get_quorum_con_features()); | |
113 | ||
114 | put_version(t, pending_map.epoch, bl); | |
115 | put_last_committed(t, pending_map.epoch); | |
116 | ||
117 | // generate a cluster fingerprint, too? | |
118 | if (pending_map.epoch == 1) { | |
119 | mon->prepare_new_fingerprint(t); | |
120 | } | |
121 | } | |
122 | ||
123 | class C_ApplyFeatures : public Context { | |
124 | MonmapMonitor *svc; | |
125 | mon_feature_t features; | |
9f95a23c | 126 | ceph_release_t min_mon_release; |
11fdf7f2 | 127 | public: |
9f95a23c | 128 | C_ApplyFeatures(MonmapMonitor *s, const mon_feature_t& f, ceph_release_t mmr) : |
11fdf7f2 | 129 | svc(s), features(f), min_mon_release(mmr) { } |
7c673cae FG |
130 | void finish(int r) override { |
131 | if (r >= 0) { | |
11fdf7f2 | 132 | svc->apply_mon_features(features, min_mon_release); |
7c673cae FG |
133 | } else if (r == -EAGAIN || r == -ECANCELED) { |
134 | // discard features if we're no longer on the quorum that | |
135 | // established them in the first place. | |
136 | return; | |
137 | } else { | |
11fdf7f2 | 138 | ceph_abort_msg("bad C_ApplyFeatures return value"); |
7c673cae FG |
139 | } |
140 | } | |
141 | }; | |
142 | ||
11fdf7f2 | 143 | void MonmapMonitor::apply_mon_features(const mon_feature_t& features, |
9f95a23c | 144 | ceph_release_t min_mon_release) |
7c673cae FG |
145 | { |
146 | if (!is_writeable()) { | |
147 | dout(5) << __func__ << " wait for service to be writeable" << dendl; | |
11fdf7f2 | 148 | wait_for_writeable_ctx(new C_ApplyFeatures(this, features, min_mon_release)); |
7c673cae FG |
149 | return; |
150 | } | |
151 | ||
11fdf7f2 TL |
152 | // do nothing here unless we have a full quorum |
153 | if (mon->get_quorum().size() < mon->monmap->size()) { | |
154 | return; | |
155 | } | |
156 | ||
157 | ceph_assert(is_writeable()); | |
158 | ceph_assert(features.contains_all(pending_map.persistent_features)); | |
7c673cae FG |
159 | // we should never hit this because `features` should be the result |
160 | // of the quorum's supported features. But if it happens, die. | |
11fdf7f2 | 161 | ceph_assert(ceph::features::mon::get_supported().contains_all(features)); |
7c673cae FG |
162 | |
163 | mon_feature_t new_features = | |
164 | (pending_map.persistent_features ^ | |
165 | (features & ceph::features::mon::get_persistent())); | |
166 | ||
11fdf7f2 TL |
167 | if (new_features.empty() && |
168 | pending_map.min_mon_release == min_mon_release) { | |
81eedcae | 169 | dout(10) << __func__ << " min_mon_release (" << (int)min_mon_release |
11fdf7f2 | 170 | << ") and features (" << features << ") match" << dendl; |
7c673cae FG |
171 | return; |
172 | } | |
173 | ||
11fdf7f2 TL |
174 | if (!new_features.empty()) { |
175 | dout(1) << __func__ << " applying new features " | |
176 | << new_features << ", had " << pending_map.persistent_features | |
177 | << ", will have " | |
178 | << (new_features | pending_map.persistent_features) | |
179 | << dendl; | |
180 | pending_map.persistent_features |= new_features; | |
181 | } | |
182 | if (min_mon_release > pending_map.min_mon_release) { | |
183 | dout(1) << __func__ << " increasing min_mon_release to " | |
9f95a23c | 184 | << ceph::to_integer<int>(min_mon_release) << " (" << min_mon_release |
11fdf7f2 TL |
185 | << ")" << dendl; |
186 | pending_map.min_mon_release = min_mon_release; | |
7c673cae FG |
187 | } |
188 | ||
7c673cae FG |
189 | propose_pending(); |
190 | } | |
191 | ||
192 | void MonmapMonitor::on_active() | |
193 | { | |
194 | if (get_last_committed() >= 1 && !mon->has_ever_joined) { | |
195 | // make note of the fact that i was, once, part of the quorum. | |
196 | dout(10) << "noting that i was, once, part of an active quorum." << dendl; | |
197 | ||
198 | /* This is some form of nasty in-breeding we have between the MonmapMonitor | |
199 | and the Monitor itself. We should find a way to get rid of it given our | |
200 | new architecture. Until then, stick with it since we are a | |
201 | single-threaded process and, truth be told, no one else relies on this | |
202 | thing besides us. | |
203 | */ | |
204 | auto t(std::make_shared<MonitorDBStore::Transaction>()); | |
205 | t->put(Monitor::MONITOR_NAME, "joined", 1); | |
206 | mon->store->apply_transaction(t); | |
207 | mon->has_ever_joined = true; | |
208 | } | |
209 | ||
b32b8144 FG |
210 | if (mon->is_leader()) { |
211 | mon->clog->debug() << "monmap " << *mon->monmap; | |
212 | } | |
7c673cae | 213 | |
11fdf7f2 TL |
214 | apply_mon_features(mon->get_quorum_mon_features(), |
215 | mon->quorum_min_mon_release); | |
7c673cae FG |
216 | } |
217 | ||
218 | bool MonmapMonitor::preprocess_query(MonOpRequestRef op) | |
219 | { | |
9f95a23c | 220 | auto m = op->get_req<PaxosServiceMessage>(); |
7c673cae FG |
221 | switch (m->get_type()) { |
222 | // READs | |
223 | case MSG_MON_COMMAND: | |
f64942e4 AA |
224 | try { |
225 | return preprocess_command(op); | |
226 | } | |
227 | catch (const bad_cmd_get& e) { | |
228 | bufferlist bl; | |
229 | mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); | |
230 | return true; | |
231 | } | |
7c673cae FG |
232 | case MSG_MON_JOIN: |
233 | return preprocess_join(op); | |
234 | default: | |
235 | ceph_abort(); | |
236 | return true; | |
237 | } | |
238 | } | |
239 | ||
240 | void MonmapMonitor::dump_info(Formatter *f) | |
241 | { | |
242 | f->dump_unsigned("monmap_first_committed", get_first_committed()); | |
243 | f->dump_unsigned("monmap_last_committed", get_last_committed()); | |
244 | f->open_object_section("monmap"); | |
245 | mon->monmap->dump(f); | |
246 | f->close_section(); | |
247 | f->open_array_section("quorum"); | |
248 | for (set<int>::iterator q = mon->get_quorum().begin(); q != mon->get_quorum().end(); ++q) | |
249 | f->dump_int("mon", *q); | |
250 | f->close_section(); | |
251 | } | |
252 | ||
253 | bool MonmapMonitor::preprocess_command(MonOpRequestRef op) | |
254 | { | |
9f95a23c | 255 | auto m = op->get_req<MMonCommand>(); |
7c673cae FG |
256 | int r = -1; |
257 | bufferlist rdata; | |
258 | stringstream ss; | |
259 | ||
11fdf7f2 | 260 | cmdmap_t cmdmap; |
7c673cae FG |
261 | if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { |
262 | string rs = ss.str(); | |
263 | mon->reply_command(op, -EINVAL, rs, rdata, get_last_committed()); | |
264 | return true; | |
265 | } | |
266 | ||
267 | string prefix; | |
9f95a23c | 268 | cmd_getval(cmdmap, "prefix", prefix); |
7c673cae | 269 | |
11fdf7f2 | 270 | MonSession *session = op->get_session(); |
7c673cae FG |
271 | if (!session) { |
272 | mon->reply_command(op, -EACCES, "access denied", get_last_committed()); | |
273 | return true; | |
274 | } | |
275 | ||
276 | string format; | |
9f95a23c | 277 | cmd_getval(cmdmap, "format", format, string("plain")); |
7c673cae FG |
278 | boost::scoped_ptr<Formatter> f(Formatter::create(format)); |
279 | ||
280 | if (prefix == "mon stat") { | |
281 | mon->monmap->print_summary(ss); | |
224ce89b WB |
282 | ss << ", election epoch " << mon->get_epoch() << ", leader " |
283 | << mon->get_leader() << " " << mon->get_leader_name() | |
284 | << ", quorum " << mon->get_quorum() << " " << mon->get_quorum_names(); | |
7c673cae FG |
285 | rdata.append(ss); |
286 | ss.str(""); | |
287 | r = 0; | |
288 | ||
289 | } else if (prefix == "mon getmap" || | |
290 | prefix == "mon dump") { | |
291 | ||
292 | epoch_t epoch; | |
293 | int64_t epochnum; | |
9f95a23c | 294 | cmd_getval(cmdmap, "epoch", epochnum, (int64_t)0); |
7c673cae FG |
295 | epoch = epochnum; |
296 | ||
297 | MonMap *p = mon->monmap; | |
298 | if (epoch) { | |
299 | bufferlist bl; | |
300 | r = get_version(epoch, bl); | |
301 | if (r == -ENOENT) { | |
302 | ss << "there is no map for epoch " << epoch; | |
303 | goto reply; | |
304 | } | |
11fdf7f2 TL |
305 | ceph_assert(r == 0); |
306 | ceph_assert(bl.length() > 0); | |
7c673cae FG |
307 | p = new MonMap; |
308 | p->decode(bl); | |
309 | } | |
310 | ||
11fdf7f2 | 311 | ceph_assert(p); |
7c673cae FG |
312 | |
313 | if (prefix == "mon getmap") { | |
314 | p->encode(rdata, m->get_connection()->get_features()); | |
315 | r = 0; | |
316 | ss << "got monmap epoch " << p->get_epoch(); | |
317 | } else if (prefix == "mon dump") { | |
318 | stringstream ds; | |
319 | if (f) { | |
320 | f->open_object_section("monmap"); | |
321 | p->dump(f.get()); | |
322 | f->open_array_section("quorum"); | |
323 | for (set<int>::iterator q = mon->get_quorum().begin(); | |
324 | q != mon->get_quorum().end(); ++q) { | |
325 | f->dump_int("mon", *q); | |
326 | } | |
327 | f->close_section(); | |
328 | f->close_section(); | |
329 | f->flush(ds); | |
330 | r = 0; | |
331 | } else { | |
332 | p->print(ds); | |
333 | r = 0; | |
334 | } | |
335 | rdata.append(ds); | |
336 | ss << "dumped monmap epoch " << p->get_epoch(); | |
337 | } | |
11fdf7f2 | 338 | if (p != mon->monmap) { |
7c673cae | 339 | delete p; |
11fdf7f2 TL |
340 | p = nullptr; |
341 | } | |
7c673cae | 342 | |
224ce89b | 343 | } else if (prefix == "mon feature ls") { |
7c673cae FG |
344 | |
345 | bool list_with_value = false; | |
346 | string with_value; | |
9f95a23c | 347 | if (cmd_getval(cmdmap, "with_value", with_value) && |
7c673cae FG |
348 | with_value == "--with-value") { |
349 | list_with_value = true; | |
350 | } | |
351 | ||
352 | MonMap *p = mon->monmap; | |
353 | ||
354 | // list features | |
355 | mon_feature_t supported = ceph::features::mon::get_supported(); | |
356 | mon_feature_t persistent = ceph::features::mon::get_persistent(); | |
357 | mon_feature_t required = p->get_required_features(); | |
358 | ||
359 | stringstream ds; | |
360 | auto print_feature = [&](mon_feature_t& m_features, const char* m_str) { | |
361 | if (f) { | |
362 | if (list_with_value) | |
363 | m_features.dump_with_value(f.get(), m_str); | |
364 | else | |
365 | m_features.dump(f.get(), m_str); | |
366 | } else { | |
367 | if (list_with_value) | |
368 | m_features.print_with_value(ds); | |
369 | else | |
370 | m_features.print(ds); | |
371 | } | |
372 | }; | |
373 | ||
374 | if (f) { | |
375 | f->open_object_section("features"); | |
376 | ||
377 | f->open_object_section("all"); | |
378 | print_feature(supported, "supported"); | |
379 | print_feature(persistent, "persistent"); | |
380 | f->close_section(); // all | |
381 | ||
382 | f->open_object_section("monmap"); | |
383 | print_feature(p->persistent_features, "persistent"); | |
384 | print_feature(p->optional_features, "optional"); | |
385 | print_feature(required, "required"); | |
386 | f->close_section(); // monmap | |
387 | ||
388 | f->close_section(); // features | |
389 | f->flush(ds); | |
390 | ||
391 | } else { | |
392 | ds << "all features" << std::endl | |
393 | << "\tsupported: "; | |
394 | print_feature(supported, nullptr); | |
395 | ds << std::endl | |
396 | << "\tpersistent: "; | |
397 | print_feature(persistent, nullptr); | |
398 | ds << std::endl | |
399 | << std::endl; | |
400 | ||
401 | ds << "on current monmap (epoch " | |
402 | << p->get_epoch() << ")" << std::endl | |
403 | << "\tpersistent: "; | |
404 | print_feature(p->persistent_features, nullptr); | |
405 | ds << std::endl | |
406 | // omit optional features in plain-text | |
407 | // makes it easier to read, and they're, currently, empty. | |
408 | << "\trequired: "; | |
409 | print_feature(required, nullptr); | |
410 | ds << std::endl; | |
411 | } | |
412 | rdata.append(ds); | |
413 | r = 0; | |
414 | } | |
415 | ||
416 | reply: | |
417 | if (r != -1) { | |
418 | string rs; | |
419 | getline(ss, rs); | |
420 | ||
421 | mon->reply_command(op, r, rs, rdata, get_last_committed()); | |
422 | return true; | |
423 | } else | |
424 | return false; | |
425 | } | |
426 | ||
427 | ||
428 | bool MonmapMonitor::prepare_update(MonOpRequestRef op) | |
429 | { | |
9f95a23c | 430 | auto m = op->get_req<PaxosServiceMessage>(); |
224ce89b | 431 | dout(7) << __func__ << " " << *m << " from " << m->get_orig_source_inst() << dendl; |
7c673cae FG |
432 | |
433 | switch (m->get_type()) { | |
434 | case MSG_MON_COMMAND: | |
f64942e4 AA |
435 | try { |
436 | return prepare_command(op); | |
11fdf7f2 | 437 | } catch (const bad_cmd_get& e) { |
f64942e4 AA |
438 | bufferlist bl; |
439 | mon->reply_command(op, -EINVAL, e.what(), bl, get_last_committed()); | |
440 | return true; | |
441 | } | |
7c673cae FG |
442 | case MSG_MON_JOIN: |
443 | return prepare_join(op); | |
444 | default: | |
445 | ceph_abort(); | |
446 | } | |
447 | ||
448 | return false; | |
449 | } | |
450 | ||
451 | bool MonmapMonitor::prepare_command(MonOpRequestRef op) | |
452 | { | |
9f95a23c | 453 | auto m = op->get_req<MMonCommand>(); |
7c673cae FG |
454 | stringstream ss; |
455 | string rs; | |
456 | int err = -EINVAL; | |
457 | ||
11fdf7f2 | 458 | cmdmap_t cmdmap; |
7c673cae FG |
459 | if (!cmdmap_from_json(m->cmd, &cmdmap, ss)) { |
460 | string rs = ss.str(); | |
461 | mon->reply_command(op, -EINVAL, rs, get_last_committed()); | |
462 | return true; | |
463 | } | |
464 | ||
465 | string prefix; | |
9f95a23c | 466 | cmd_getval(cmdmap, "prefix", prefix); |
7c673cae | 467 | |
11fdf7f2 | 468 | MonSession *session = op->get_session(); |
7c673cae FG |
469 | if (!session) { |
470 | mon->reply_command(op, -EACCES, "access denied", get_last_committed()); | |
471 | return true; | |
472 | } | |
473 | ||
474 | /* We should follow the following rules: | |
475 | * | |
476 | * - 'monmap' is the current, consistent version of the monmap | |
477 | * - 'pending_map' is the uncommitted version of the monmap | |
478 | * | |
479 | * All checks for the current state must be made against 'monmap'. | |
480 | * All changes are made against 'pending_map'. | |
481 | * | |
482 | * If there are concurrent operations modifying 'pending_map', please | |
483 | * follow the following rules. | |
484 | * | |
485 | * - if pending_map has already been changed, the second operation must | |
486 | * wait for the proposal to finish and be run again; This is the easiest | |
487 | * path to guarantee correctness but may impact performance (i.e., it | |
488 | * will take longer for the user to get a reply). | |
489 | * | |
490 | * - if the result of the second operation can be guaranteed to be | |
491 | * idempotent, the operation may reply to the user once the proposal | |
492 | * finishes; still needs to wait for the proposal to finish. | |
493 | * | |
494 | * - An operation _NEVER_ returns to the user based on pending state. | |
495 | * | |
496 | * If an operation does not modify current stable monmap, it may be | |
497 | * serialized before current pending map, regardless of any change that | |
498 | * has been made to the pending map -- remember, pending is uncommitted | |
499 | * state, thus we are not bound by it. | |
500 | */ | |
501 | ||
11fdf7f2 | 502 | ceph_assert(mon->monmap); |
7c673cae FG |
503 | MonMap &monmap = *mon->monmap; |
504 | ||
505 | ||
506 | /* Please note: | |
507 | * | |
508 | * Adding or removing monitors may lead to loss of quorum. | |
509 | * | |
510 | * Because quorum may be lost, it's important to reply something | |
511 | * to the user, lest she end up waiting forever for a reply. And | |
512 | * no reply will ever be sent until quorum is formed again. | |
513 | * | |
514 | * On the other hand, this means we're leaking uncommitted state | |
515 | * to the user. As such, please be mindful of the reply message. | |
516 | * | |
517 | * e.g., 'adding monitor mon.foo' is okay ('adding' is an on-going | |
518 | * operation and conveys its not-yet-permanent nature); whereas | |
519 | * 'added monitor mon.foo' presumes the action has successfully | |
520 | * completed and state has been committed, which may not be true. | |
521 | */ | |
522 | ||
523 | ||
524 | bool propose = false; | |
525 | if (prefix == "mon add") { | |
526 | string name; | |
9f95a23c | 527 | cmd_getval(cmdmap, "name", name); |
7c673cae | 528 | string addrstr; |
9f95a23c | 529 | cmd_getval(cmdmap, "addr", addrstr); |
7c673cae FG |
530 | entity_addr_t addr; |
531 | bufferlist rdata; | |
532 | ||
533 | if (!addr.parse(addrstr.c_str())) { | |
534 | err = -EINVAL; | |
535 | ss << "addr " << addrstr << "does not parse"; | |
536 | goto reply; | |
537 | } | |
538 | ||
11fdf7f2 TL |
539 | entity_addrvec_t addrs; |
540 | if (monmap.persistent_features.contains_all( | |
541 | ceph::features::mon::FEATURE_NAUTILUS)) { | |
542 | if (addr.get_port() == CEPH_MON_PORT_IANA) { | |
543 | addr.set_type(entity_addr_t::TYPE_MSGR2); | |
544 | } | |
545 | if (addr.get_port() == CEPH_MON_PORT_LEGACY) { | |
546 | // if they specified the *old* default they probably don't care | |
547 | addr.set_port(0); | |
548 | } | |
549 | if (addr.get_port()) { | |
550 | addrs.v.push_back(addr); | |
551 | } else { | |
552 | addr.set_type(entity_addr_t::TYPE_MSGR2); | |
553 | addr.set_port(CEPH_MON_PORT_IANA); | |
554 | addrs.v.push_back(addr); | |
555 | addr.set_type(entity_addr_t::TYPE_LEGACY); | |
556 | addr.set_port(CEPH_MON_PORT_LEGACY); | |
557 | addrs.v.push_back(addr); | |
558 | } | |
559 | } else { | |
560 | if (addr.get_port() == 0) { | |
561 | addr.set_port(CEPH_MON_PORT_LEGACY); | |
562 | } | |
563 | addr.set_type(entity_addr_t::TYPE_LEGACY); | |
564 | addrs.v.push_back(addr); | |
7c673cae | 565 | } |
11fdf7f2 | 566 | dout(20) << __func__ << " addr " << addr << " -> addrs " << addrs << dendl; |
7c673cae FG |
567 | |
568 | /** | |
569 | * If we have a monitor with the same name and different addr, then EEXIST | |
570 | * If we have a monitor with the same addr and different name, then EEXIST | |
571 | * If we have a monitor with the same addr and same name, then wait for | |
572 | * the proposal to finish and return success. | |
573 | * If we don't have the monitor, add it. | |
574 | */ | |
575 | ||
576 | err = 0; | |
577 | if (!ss.str().empty()) | |
578 | ss << "; "; | |
579 | ||
580 | do { | |
581 | if (monmap.contains(name)) { | |
11fdf7f2 | 582 | if (monmap.get_addrs(name) == addrs) { |
7c673cae FG |
583 | // stable map contains monitor with the same name at the same address. |
584 | // serialize before current pending map. | |
585 | err = 0; // for clarity; this has already been set above. | |
11fdf7f2 | 586 | ss << "mon." << name << " at " << addrs << " already exists"; |
7c673cae FG |
587 | goto reply; |
588 | } else { | |
589 | ss << "mon." << name | |
11fdf7f2 | 590 | << " already exists at address " << monmap.get_addrs(name); |
7c673cae | 591 | } |
11fdf7f2 | 592 | } else if (monmap.contains(addrs)) { |
7c673cae | 593 | // we established on the previous branch that name is different |
11fdf7f2 | 594 | ss << "mon." << monmap.get_name(addrs) |
7c673cae FG |
595 | << " already exists at address " << addr; |
596 | } else { | |
597 | // go ahead and add | |
598 | break; | |
599 | } | |
600 | err = -EEXIST; | |
601 | goto reply; | |
602 | } while (false); | |
603 | ||
604 | /* Given there's no delay between proposals on the MonmapMonitor (see | |
605 | * MonmapMonitor::should_propose()), there is no point in checking for | |
606 | * a mismatch between name and addr on pending_map. | |
607 | * | |
608 | * Once we established the monitor does not exist in the committed state, | |
609 | * we can simply go ahead and add the monitor. | |
610 | */ | |
611 | ||
11fdf7f2 | 612 | pending_map.add(name, addrs); |
7c673cae | 613 | pending_map.last_changed = ceph_clock_now(); |
11fdf7f2 | 614 | ss << "adding mon." << name << " at " << addrs; |
7c673cae FG |
615 | propose = true; |
616 | dout(0) << __func__ << " proposing new mon." << name << dendl; | |
617 | ||
618 | } else if (prefix == "mon remove" || | |
619 | prefix == "mon rm") { | |
620 | string name; | |
9f95a23c | 621 | cmd_getval(cmdmap, "name", name); |
7c673cae FG |
622 | if (!monmap.contains(name)) { |
623 | err = 0; | |
624 | ss << "mon." << name << " does not exist or has already been removed"; | |
625 | goto reply; | |
626 | } | |
627 | ||
628 | if (monmap.size() == 1) { | |
629 | err = -EINVAL; | |
630 | ss << "error: refusing removal of last monitor " << name; | |
631 | goto reply; | |
632 | } | |
633 | ||
634 | /* At the time of writing, there is no risk of races when multiple clients | |
635 | * attempt to use the same name. The reason is simple but may not be | |
636 | * obvious. | |
637 | * | |
638 | * In a nutshell, we do not collate proposals on the MonmapMonitor. As | |
639 | * soon as we return 'true' below, PaxosService::dispatch() will check if | |
640 | * the service should propose, and - if so - the service will be marked as | |
641 | * 'proposing' and a proposal will be triggered. The PaxosService class | |
642 | * guarantees that once a service is marked 'proposing' no further writes | |
643 | * will be handled. | |
644 | * | |
645 | * The decision on whether the service should propose or not is, in this | |
646 | * case, made by MonmapMonitor::should_propose(), which always considers | |
647 | * the proposal delay being 0.0 seconds. This is key for PaxosService to | |
648 | * trigger the proposal immediately. | |
649 | * 0.0 seconds of delay. | |
650 | * | |
651 | * From the above, there's no point in performing further checks on the | |
652 | * pending_map, as we don't ever have multiple proposals in-flight in | |
653 | * this service. As we've established the committed state contains the | |
654 | * monitor, we can simply go ahead and remove it. | |
655 | * | |
656 | * Please note that the code hinges on all of the above to be true. It | |
657 | * has been true since time immemorial and we don't see a good reason | |
658 | * to make it sturdier at this time - mainly because we don't think it's | |
659 | * going to change any time soon, lest for any bug that may be unwillingly | |
660 | * introduced. | |
661 | */ | |
662 | ||
11fdf7f2 | 663 | entity_addrvec_t addrs = pending_map.get_addrs(name); |
7c673cae FG |
664 | pending_map.remove(name); |
665 | pending_map.last_changed = ceph_clock_now(); | |
11fdf7f2 | 666 | ss << "removing mon." << name << " at " << addrs |
7c673cae FG |
667 | << ", there will be " << pending_map.size() << " monitors" ; |
668 | propose = true; | |
669 | err = 0; | |
670 | ||
671 | } else if (prefix == "mon feature set") { | |
672 | ||
673 | /* PLEASE NOTE: | |
674 | * | |
675 | * We currently only support setting/unsetting persistent features. | |
676 | * This is by design, given at the moment we still don't have optional | |
677 | * features, and, as such, there is no point introducing an interface | |
678 | * to manipulate them. This allows us to provide a cleaner, more | |
679 | * intuitive interface to the user, modifying solely persistent | |
680 | * features. | |
681 | * | |
682 | * In the future we should consider adding another interface to handle | |
683 | * optional features/flags; e.g., 'mon feature flag set/unset', or | |
684 | * 'mon flag set/unset'. | |
685 | */ | |
686 | string feature_name; | |
9f95a23c | 687 | if (!cmd_getval(cmdmap, "feature_name", feature_name)) { |
7c673cae FG |
688 | ss << "missing required feature name"; |
689 | err = -EINVAL; | |
690 | goto reply; | |
691 | } | |
692 | ||
693 | mon_feature_t feature; | |
694 | feature = ceph::features::mon::get_feature_by_name(feature_name); | |
695 | if (feature == ceph::features::mon::FEATURE_NONE) { | |
696 | ss << "unknown feature '" << feature_name << "'"; | |
697 | err = -ENOENT; | |
698 | goto reply; | |
699 | } | |
700 | ||
11fdf7f2 | 701 | bool sure = false; |
9f95a23c | 702 | cmd_getval(cmdmap, "yes_i_really_mean_it", sure); |
11fdf7f2 | 703 | if (!sure) { |
7c673cae FG |
704 | ss << "please specify '--yes-i-really-mean-it' if you " |
705 | << "really, **really** want to set feature '" | |
706 | << feature << "' in the monmap."; | |
707 | err = -EPERM; | |
708 | goto reply; | |
709 | } | |
710 | ||
711 | if (!mon->get_quorum_mon_features().contains_all(feature)) { | |
712 | ss << "current quorum does not support feature '" << feature | |
713 | << "'; supported features: " | |
714 | << mon->get_quorum_mon_features(); | |
715 | err = -EINVAL; | |
716 | goto reply; | |
717 | } | |
718 | ||
719 | ss << "setting feature '" << feature << "'"; | |
720 | ||
721 | err = 0; | |
722 | if (monmap.persistent_features.contains_all(feature)) { | |
723 | dout(10) << __func__ << " feature '" << feature | |
724 | << "' already set on monmap; no-op." << dendl; | |
725 | goto reply; | |
726 | } | |
727 | ||
728 | pending_map.persistent_features.set_feature(feature); | |
729 | pending_map.last_changed = ceph_clock_now(); | |
730 | propose = true; | |
731 | ||
11fdf7f2 | 732 | dout(1) << __func__ << " " << ss.str() << "; new features will be: " |
7c673cae FG |
733 | << "persistent = " << pending_map.persistent_features |
734 | // output optional nevertheless, for auditing purposes. | |
735 | << ", optional = " << pending_map.optional_features << dendl; | |
11fdf7f2 TL |
736 | |
737 | } else if (prefix == "mon set-rank") { | |
738 | string name; | |
739 | int64_t rank; | |
9f95a23c TL |
740 | if (!cmd_getval(cmdmap, "name", name) || |
741 | !cmd_getval(cmdmap, "rank", rank)) { | |
11fdf7f2 TL |
742 | err = -EINVAL; |
743 | goto reply; | |
744 | } | |
745 | int oldrank = pending_map.get_rank(name); | |
746 | if (oldrank < 0) { | |
747 | ss << "mon." << name << " does not exist in monmap"; | |
748 | err = -ENOENT; | |
749 | goto reply; | |
750 | } | |
751 | err = 0; | |
752 | pending_map.set_rank(name, rank); | |
753 | pending_map.last_changed = ceph_clock_now(); | |
754 | propose = true; | |
755 | } else if (prefix == "mon set-addrs") { | |
756 | string name; | |
757 | string addrs; | |
9f95a23c TL |
758 | if (!cmd_getval(cmdmap, "name", name) || |
759 | !cmd_getval(cmdmap, "addrs", addrs)) { | |
11fdf7f2 TL |
760 | err = -EINVAL; |
761 | goto reply; | |
762 | } | |
763 | if (!pending_map.contains(name)) { | |
764 | ss << "mon." << name << " does not exist"; | |
765 | err = -ENOENT; | |
766 | goto reply; | |
767 | } | |
768 | entity_addrvec_t av; | |
769 | if (!av.parse(addrs.c_str(), nullptr)) { | |
770 | ss << "failed to parse addrs '" << addrs << "'"; | |
771 | err = -EINVAL; | |
772 | goto reply; | |
773 | } | |
774 | for (auto& a : av.v) { | |
775 | a.set_nonce(0); | |
776 | if (!a.get_port()) { | |
777 | ss << "monitor must bind to a non-zero port, not " << a; | |
778 | err = -EINVAL; | |
779 | goto reply; | |
780 | } | |
781 | } | |
782 | err = 0; | |
783 | pending_map.set_addrvec(name, av); | |
784 | pending_map.last_changed = ceph_clock_now(); | |
785 | propose = true; | |
9f95a23c TL |
786 | } else if (prefix == "mon set-weight") { |
787 | string name; | |
788 | int64_t weight; | |
789 | if (!cmd_getval(cmdmap, "name", name) || | |
790 | !cmd_getval(cmdmap, "weight", weight)) { | |
791 | err = -EINVAL; | |
792 | goto reply; | |
793 | } | |
794 | if (!pending_map.contains(name)) { | |
795 | ss << "mon." << name << " does not exist"; | |
796 | err = -ENOENT; | |
797 | goto reply; | |
798 | } | |
799 | err = 0; | |
800 | pending_map.set_weight(name, weight); | |
801 | pending_map.last_changed = ceph_clock_now(); | |
802 | propose = true; | |
11fdf7f2 TL |
803 | } else if (prefix == "mon enable-msgr2") { |
804 | if (!monmap.get_required_features().contains_all( | |
805 | ceph::features::mon::FEATURE_NAUTILUS)) { | |
806 | err = -EACCES; | |
807 | ss << "all monitors must be running nautilus to enable v2"; | |
808 | goto reply; | |
809 | } | |
810 | for (auto& i : pending_map.mon_info) { | |
811 | if (i.second.public_addrs.v.size() == 1 && | |
812 | i.second.public_addrs.front().is_legacy() && | |
813 | i.second.public_addrs.front().get_port() == CEPH_MON_PORT_LEGACY) { | |
814 | entity_addrvec_t av; | |
815 | entity_addr_t a = i.second.public_addrs.front(); | |
816 | a.set_type(entity_addr_t::TYPE_MSGR2); | |
817 | a.set_port(CEPH_MON_PORT_IANA); | |
818 | av.v.push_back(a); | |
819 | av.v.push_back(i.second.public_addrs.front()); | |
820 | dout(10) << " setting mon." << i.first | |
821 | << " addrs " << i.second.public_addrs | |
822 | << " -> " << av << dendl; | |
823 | pending_map.set_addrvec(i.first, av); | |
824 | propose = true; | |
825 | pending_map.last_changed = ceph_clock_now(); | |
826 | } | |
827 | } | |
828 | err = 0; | |
7c673cae FG |
829 | } else { |
830 | ss << "unknown command " << prefix; | |
831 | err = -EINVAL; | |
832 | } | |
833 | ||
834 | reply: | |
835 | getline(ss, rs); | |
836 | mon->reply_command(op, err, rs, get_last_committed()); | |
837 | // we are returning to the user; do not propose. | |
838 | return propose; | |
839 | } | |
840 | ||
841 | bool MonmapMonitor::preprocess_join(MonOpRequestRef op) | |
842 | { | |
9f95a23c | 843 | auto join = op->get_req<MMonJoin>(); |
11fdf7f2 | 844 | dout(10) << __func__ << " " << join->name << " at " << join->addrs << dendl; |
7c673cae | 845 | |
11fdf7f2 | 846 | MonSession *session = op->get_session(); |
7c673cae FG |
847 | if (!session || |
848 | !session->is_capable("mon", MON_CAP_W | MON_CAP_X)) { | |
849 | dout(10) << " insufficient caps" << dendl; | |
850 | return true; | |
851 | } | |
852 | ||
11fdf7f2 TL |
853 | if (pending_map.contains(join->name) && |
854 | !pending_map.get_addrs(join->name).front().is_blank_ip()) { | |
7c673cae FG |
855 | dout(10) << " already have " << join->name << dendl; |
856 | return true; | |
857 | } | |
11fdf7f2 TL |
858 | if (pending_map.contains(join->addrs) && |
859 | pending_map.get_name(join->addrs) == join->name) { | |
860 | dout(10) << " already have " << join->addrs << dendl; | |
7c673cae FG |
861 | return true; |
862 | } | |
863 | return false; | |
864 | } | |
865 | bool MonmapMonitor::prepare_join(MonOpRequestRef op) | |
866 | { | |
9f95a23c | 867 | auto join = op->get_req<MMonJoin>(); |
11fdf7f2 TL |
868 | dout(0) << "adding/updating " << join->name |
869 | << " at " << join->addrs << " to monitor cluster" << dendl; | |
7c673cae FG |
870 | if (pending_map.contains(join->name)) |
871 | pending_map.remove(join->name); | |
11fdf7f2 TL |
872 | if (pending_map.contains(join->addrs)) |
873 | pending_map.remove(pending_map.get_name(join->addrs)); | |
874 | pending_map.add(join->name, join->addrs); | |
7c673cae FG |
875 | pending_map.last_changed = ceph_clock_now(); |
876 | return true; | |
877 | } | |
878 | ||
879 | bool MonmapMonitor::should_propose(double& delay) | |
880 | { | |
881 | delay = 0.0; | |
882 | return true; | |
883 | } | |
884 | ||
7c673cae FG |
885 | int MonmapMonitor::get_monmap(bufferlist &bl) |
886 | { | |
887 | version_t latest_ver = get_last_committed(); | |
888 | dout(10) << __func__ << " ver " << latest_ver << dendl; | |
889 | ||
890 | if (!mon->store->exists(get_service_name(), stringify(latest_ver))) | |
891 | return -ENOENT; | |
892 | ||
893 | int err = get_version(latest_ver, bl); | |
894 | if (err < 0) { | |
895 | dout(1) << __func__ << " error obtaining monmap: " | |
896 | << cpp_strerror(err) << dendl; | |
897 | return err; | |
898 | } | |
899 | return 0; | |
900 | } | |
901 | ||
902 | void MonmapMonitor::check_subs() | |
903 | { | |
904 | const string type = "monmap"; | |
905 | mon->with_session_map([this, &type](const MonSessionMap& session_map) { | |
906 | auto subs = session_map.subs.find(type); | |
907 | if (subs == session_map.subs.end()) | |
908 | return; | |
909 | for (auto sub : *subs->second) { | |
910 | check_sub(sub); | |
911 | } | |
912 | }); | |
913 | } | |
914 | ||
915 | void MonmapMonitor::check_sub(Subscription *sub) | |
916 | { | |
917 | const auto epoch = mon->monmap->get_epoch(); | |
918 | dout(10) << __func__ | |
919 | << " monmap next " << sub->next | |
920 | << " have " << epoch << dendl; | |
921 | if (sub->next <= epoch) { | |
922 | mon->send_latest_monmap(sub->session->con.get()); | |
923 | if (sub->onetime) { | |
11fdf7f2 | 924 | mon->with_session_map([sub](MonSessionMap& session_map) { |
7c673cae FG |
925 | session_map.remove_sub(sub); |
926 | }); | |
927 | } else { | |
928 | sub->next = epoch + 1; | |
929 | } | |
930 | } | |
931 | } | |
11fdf7f2 TL |
932 | |
933 | void MonmapMonitor::tick() | |
934 | { | |
935 | if (!is_active() || | |
936 | !mon->is_leader()) { | |
937 | return; | |
938 | } | |
939 | ||
940 | if (mon->monmap->created.is_zero()) { | |
941 | dout(10) << __func__ << " detected empty created stamp" << dendl; | |
942 | utime_t ctime; | |
943 | for (version_t v = 1; v <= get_last_committed(); v++) { | |
944 | bufferlist bl; | |
945 | int r = get_version(v, bl); | |
946 | if (r < 0) { | |
947 | continue; | |
948 | } | |
949 | MonMap m; | |
950 | auto p = bl.cbegin(); | |
951 | decode(m, p); | |
952 | if (!m.last_changed.is_zero()) { | |
953 | dout(10) << __func__ << " first monmap with last_changed is " | |
954 | << v << " with " << m.last_changed << dendl; | |
955 | ctime = m.last_changed; | |
956 | break; | |
957 | } | |
958 | } | |
959 | if (ctime.is_zero()) { | |
960 | ctime = ceph_clock_now(); | |
961 | } | |
962 | dout(10) << __func__ << " updating created stamp to " << ctime << dendl; | |
963 | pending_map.created = ctime; | |
964 | propose_pending(); | |
965 | } | |
966 | } |