]>
Commit | Line | Data |
---|---|---|
11fdf7f2 TL |
1 | // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- |
2 | // vim: ts=8 sw=2 smarttab | |
7c673cae FG |
3 | |
4 | #include "MonMap.h" | |
5 | ||
6 | #include <algorithm> | |
7 | #include <sys/types.h> | |
8 | #include <sys/stat.h> | |
9 | #include <fcntl.h> | |
10 | ||
11fdf7f2 TL |
11 | #ifdef WITH_SEASTAR |
12 | #include <seastar/core/fstream.hh> | |
13 | #include <seastar/core/reactor.hh> | |
14 | #include <seastar/net/dns.hh> | |
15 | #include "crimson/common/config_proxy.h" | |
16 | #endif | |
17 | ||
7c673cae FG |
18 | #include "common/Formatter.h" |
19 | ||
20 | #include "include/ceph_features.h" | |
21 | #include "include/addr_parsing.h" | |
22 | #include "common/ceph_argparse.h" | |
23 | #include "common/dns_resolve.h" | |
24 | #include "common/errno.h" | |
7c673cae | 25 | #include "common/dout.h" |
11fdf7f2 | 26 | #include "common/Clock.h" |
f67539c2 | 27 | #include "mon/health_check.h" |
7c673cae | 28 | |
9f95a23c TL |
29 | using std::list; |
30 | using std::map; | |
31 | using std::ostream; | |
32 | using std::set; | |
33 | using std::string; | |
34 | using std::vector; | |
35 | ||
36 | using ceph::DNSResolver; | |
7c673cae FG |
37 | using ceph::Formatter; |
38 | ||
9f95a23c | 39 | void mon_info_t::encode(ceph::buffer::list& bl, uint64_t features) const |
7c673cae | 40 | { |
f67539c2 TL |
41 | uint8_t v = 5; |
42 | uint8_t min_v = 1; | |
43 | if (!crush_loc.empty()) { | |
44 | // we added crush_loc in version 5, but need to let old clients decode it | |
a4b75251 | 45 | // so just leave the min_v at version 1. Monitors are protected |
f67539c2 TL |
46 | // from misunderstandings about location because setting it is blocked |
47 | // on FEATURE_PINGING | |
a4b75251 | 48 | min_v = 1; |
f67539c2 | 49 | } |
11fdf7f2 TL |
50 | if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) { |
51 | v = 2; | |
52 | } | |
f67539c2 | 53 | ENCODE_START(v, min_v, bl); |
11fdf7f2 TL |
54 | encode(name, bl); |
55 | if (v < 3) { | |
f67539c2 | 56 | ceph_assert(min_v == 1); |
eafe8130 TL |
57 | auto a = public_addrs.legacy_addr(); |
58 | if (a != entity_addr_t()) { | |
59 | encode(a, bl, features); | |
60 | } else { | |
61 | // note: we don't have a legacy addr here, so lie so that it looks | |
62 | // like one, just so that old clients get a valid-looking map. | |
63 | // they won't be able to talk to the v2 mons, but that's better | |
64 | // than nothing. | |
65 | encode(public_addrs.as_legacy_addr(), bl, features); | |
66 | } | |
11fdf7f2 TL |
67 | } else { |
68 | encode(public_addrs, bl, features); | |
69 | } | |
70 | encode(priority, bl); | |
9f95a23c | 71 | encode(weight, bl); |
f67539c2 | 72 | encode(crush_loc, bl); |
7c673cae FG |
73 | ENCODE_FINISH(bl); |
74 | } | |
75 | ||
9f95a23c | 76 | void mon_info_t::decode(ceph::buffer::list::const_iterator& p) |
7c673cae | 77 | { |
f67539c2 | 78 | DECODE_START(5, p); |
11fdf7f2 TL |
79 | decode(name, p); |
80 | decode(public_addrs, p); | |
224ce89b | 81 | if (struct_v >= 2) { |
11fdf7f2 | 82 | decode(priority, p); |
224ce89b | 83 | } |
9f95a23c TL |
84 | if (struct_v >= 4) { |
85 | decode(weight, p); | |
86 | } | |
f67539c2 TL |
87 | if (struct_v >= 5) { |
88 | decode(crush_loc, p); | |
89 | } | |
7c673cae FG |
90 | DECODE_FINISH(p); |
91 | } | |
92 | ||
93 | void mon_info_t::print(ostream& out) const | |
94 | { | |
95 | out << "mon." << name | |
11fdf7f2 | 96 | << " addrs " << public_addrs |
9f95a23c | 97 | << " priority " << priority |
f67539c2 TL |
98 | << " weight " << weight |
99 | << " crush location " << crush_loc; | |
7c673cae FG |
100 | } |
101 | ||
7c673cae FG |
102 | namespace { |
103 | struct rank_cmp { | |
104 | bool operator()(const mon_info_t &a, const mon_info_t &b) const { | |
11fdf7f2 | 105 | if (a.public_addrs.legacy_or_front_addr() == b.public_addrs.legacy_or_front_addr()) |
7c673cae | 106 | return a.name < b.name; |
11fdf7f2 | 107 | return a.public_addrs.legacy_or_front_addr() < b.public_addrs.legacy_or_front_addr(); |
7c673cae FG |
108 | } |
109 | }; | |
110 | } | |
111 | ||
11fdf7f2 TL |
112 | void MonMap::calc_legacy_ranks() |
113 | { | |
7c673cae | 114 | ranks.resize(mon_info.size()); |
7c673cae FG |
115 | |
116 | // Used to order entries according to public_addr, because that's | |
117 | // how the ranks are expected to be ordered by. We may expand this | |
118 | // later on, according to some other criteria, by specifying a | |
119 | // different comparator. | |
120 | // | |
121 | // Please note that we use a 'set' here instead of resorting to | |
122 | // std::sort() because we need more info than that's available in | |
123 | // the vector. The vector will thus be ordered by, e.g., public_addr | |
124 | // while only containing the names of each individual monitor. | |
125 | // The only way of achieving this with std::sort() would be to first | |
126 | // insert every mon_info_t entry into a vector 'foo', std::sort() 'foo' | |
127 | // with custom comparison functions, and then copy each invidual entry | |
128 | // to a new vector. Unless there's a simpler way, we don't think the | |
129 | // added complexity makes up for the additional memory usage of a 'set'. | |
130 | set<mon_info_t, rank_cmp> tmp; | |
131 | ||
9f95a23c | 132 | for (auto p = mon_info.begin(); p != mon_info.end(); ++p) { |
7c673cae FG |
133 | mon_info_t &m = p->second; |
134 | tmp.insert(m); | |
7c673cae FG |
135 | } |
136 | ||
137 | // map the set to the actual ranks etc | |
138 | unsigned i = 0; | |
9f95a23c | 139 | for (auto p = tmp.begin(); p != tmp.end(); ++p, ++i) { |
7c673cae FG |
140 | ranks[i] = p->name; |
141 | } | |
142 | } | |
143 | ||
9f95a23c | 144 | void MonMap::encode(ceph::buffer::list& blist, uint64_t con_features) const |
7c673cae | 145 | { |
7c673cae | 146 | if ((con_features & CEPH_FEATURE_MONNAMES) == 0) { |
11fdf7f2 | 147 | using ceph::encode; |
7c673cae | 148 | __u16 v = 1; |
11fdf7f2 | 149 | encode(v, blist); |
9f95a23c | 150 | ceph::encode_raw(fsid, blist); |
11fdf7f2 TL |
151 | encode(epoch, blist); |
152 | vector<entity_inst_t> mon_inst(ranks.size()); | |
153 | for (unsigned n = 0; n < ranks.size(); n++) { | |
154 | mon_inst[n].name = entity_name_t::MON(n); | |
155 | mon_inst[n].addr = get_addrs(n).legacy_addr(); | |
156 | } | |
157 | encode(mon_inst, blist, con_features); | |
158 | encode(last_changed, blist); | |
159 | encode(created, blist); | |
7c673cae FG |
160 | return; |
161 | } | |
162 | ||
11fdf7f2 TL |
163 | map<string,entity_addr_t> legacy_mon_addr; |
164 | if (!HAVE_FEATURE(con_features, MONENC) || | |
165 | !HAVE_FEATURE(con_features, SERVER_NAUTILUS)) { | |
166 | for (auto& [name, info] : mon_info) { | |
167 | legacy_mon_addr[name] = info.public_addrs.legacy_addr(); | |
168 | } | |
169 | } | |
170 | ||
171 | if (!HAVE_FEATURE(con_features, MONENC)) { | |
172 | /* we keep the mon_addr map when encoding to ensure compatibility | |
173 | * with clients and other monitors that do not yet support the 'mons' | |
174 | * map. This map keeps its original behavior, containing a mapping of | |
175 | * monitor id (i.e., 'foo' in 'mon.foo') to the monitor's public | |
176 | * address -- which is obtained from the public address of each entry | |
177 | * in the 'mons' map. | |
178 | */ | |
179 | using ceph::encode; | |
7c673cae | 180 | __u16 v = 2; |
11fdf7f2 | 181 | encode(v, blist); |
9f95a23c | 182 | ceph::encode_raw(fsid, blist); |
11fdf7f2 TL |
183 | encode(epoch, blist); |
184 | encode(legacy_mon_addr, blist, con_features); | |
185 | encode(last_changed, blist); | |
186 | encode(created, blist); | |
187 | return; | |
188 | } | |
189 | ||
190 | if (!HAVE_FEATURE(con_features, SERVER_NAUTILUS)) { | |
191 | ENCODE_START(5, 3, blist); | |
9f95a23c | 192 | ceph::encode_raw(fsid, blist); |
11fdf7f2 TL |
193 | encode(epoch, blist); |
194 | encode(legacy_mon_addr, blist, con_features); | |
195 | encode(last_changed, blist); | |
196 | encode(created, blist); | |
197 | encode(persistent_features, blist); | |
198 | encode(optional_features, blist); | |
199 | encode(mon_info, blist, con_features); | |
200 | ENCODE_FINISH(blist); | |
201 | return; | |
202 | } | |
203 | ||
f67539c2 | 204 | ENCODE_START(9, 6, blist); |
9f95a23c | 205 | ceph::encode_raw(fsid, blist); |
11fdf7f2 TL |
206 | encode(epoch, blist); |
207 | encode(last_changed, blist); | |
208 | encode(created, blist); | |
209 | encode(persistent_features, blist); | |
210 | encode(optional_features, blist); | |
211 | encode(mon_info, blist, con_features); | |
212 | encode(ranks, blist); | |
213 | encode(min_mon_release, blist); | |
f67539c2 TL |
214 | encode(removed_ranks, blist); |
215 | uint8_t t = strategy; | |
216 | encode(t, blist); | |
217 | encode(disallowed_leaders, blist); | |
218 | encode(stretch_mode_enabled, blist); | |
219 | encode(tiebreaker_mon, blist); | |
220 | encode(stretch_marked_down_mons, blist); | |
7c673cae FG |
221 | ENCODE_FINISH(blist); |
222 | } | |
223 | ||
9f95a23c | 224 | void MonMap::decode(ceph::buffer::list::const_iterator& p) |
7c673cae FG |
225 | { |
226 | map<string,entity_addr_t> mon_addr; | |
f67539c2 | 227 | DECODE_START_LEGACY_COMPAT_LEN_16(9, 3, 3, p); |
9f95a23c | 228 | ceph::decode_raw(fsid, p); |
11fdf7f2 | 229 | decode(epoch, p); |
7c673cae FG |
230 | if (struct_v == 1) { |
231 | vector<entity_inst_t> mon_inst; | |
11fdf7f2 | 232 | decode(mon_inst, p); |
7c673cae FG |
233 | for (unsigned i = 0; i < mon_inst.size(); i++) { |
234 | char n[2]; | |
235 | n[0] = '0' + i; | |
236 | n[1] = 0; | |
237 | string name = n; | |
238 | mon_addr[name] = mon_inst[i].addr; | |
239 | } | |
11fdf7f2 TL |
240 | } else if (struct_v < 6) { |
241 | decode(mon_addr, p); | |
7c673cae | 242 | } |
11fdf7f2 TL |
243 | decode(last_changed, p); |
244 | decode(created, p); | |
7c673cae | 245 | if (struct_v >= 4) { |
11fdf7f2 TL |
246 | decode(persistent_features, p); |
247 | decode(optional_features, p); | |
7c673cae | 248 | } |
11fdf7f2 TL |
249 | if (struct_v < 5) { |
250 | // generate mon_info from legacy mon_addr | |
251 | for (auto& [name, addr] : mon_addr) { | |
252 | mon_info_t &m = mon_info[name]; | |
253 | m.name = name; | |
254 | m.public_addrs = entity_addrvec_t(addr); | |
255 | } | |
256 | } else { | |
257 | decode(mon_info, p); | |
258 | } | |
259 | if (struct_v < 6) { | |
260 | calc_legacy_ranks(); | |
261 | } else { | |
262 | decode(ranks, p); | |
263 | } | |
264 | if (struct_v >= 7) { | |
265 | decode(min_mon_release, p); | |
7c673cae | 266 | } else { |
11fdf7f2 | 267 | min_mon_release = infer_ceph_release_from_mon_features(persistent_features); |
7c673cae | 268 | } |
f67539c2 TL |
269 | if (struct_v >= 8) { |
270 | decode(removed_ranks, p); | |
271 | uint8_t t; | |
272 | decode(t, p); | |
273 | strategy = static_cast<election_strategy>(t); | |
274 | decode(disallowed_leaders, p); | |
275 | } | |
276 | if (struct_v >= 9) { | |
277 | decode(stretch_mode_enabled, p); | |
278 | decode(tiebreaker_mon, p); | |
279 | decode(stretch_marked_down_mons, p); | |
280 | } else { | |
281 | stretch_mode_enabled = false; | |
282 | tiebreaker_mon = ""; | |
283 | stretch_marked_down_mons.clear(); | |
284 | } | |
11fdf7f2 | 285 | calc_addr_mons(); |
7c673cae | 286 | DECODE_FINISH(p); |
7c673cae FG |
287 | } |
288 | ||
289 | void MonMap::generate_test_instances(list<MonMap*>& o) | |
290 | { | |
291 | o.push_back(new MonMap); | |
292 | o.push_back(new MonMap); | |
293 | o.back()->epoch = 1; | |
294 | o.back()->last_changed = utime_t(123, 456); | |
295 | o.back()->created = utime_t(789, 101112); | |
11fdf7f2 | 296 | o.back()->add("one", entity_addrvec_t()); |
7c673cae FG |
297 | |
298 | MonMap *m = new MonMap; | |
299 | { | |
300 | m->epoch = 1; | |
301 | m->last_changed = utime_t(123, 456); | |
302 | ||
11fdf7f2 TL |
303 | entity_addrvec_t empty_addr_one = entity_addrvec_t(entity_addr_t()); |
304 | empty_addr_one.v[0].set_nonce(1); | |
7c673cae | 305 | m->add("empty_addr_one", empty_addr_one); |
11fdf7f2 TL |
306 | entity_addrvec_t empty_addr_two = entity_addrvec_t(entity_addr_t()); |
307 | empty_addr_two.v[0].set_nonce(2); | |
308 | m->add("empty_addr_two", empty_addr_two); | |
7c673cae FG |
309 | |
310 | const char *local_pub_addr_s = "127.0.1.2"; | |
311 | ||
312 | const char *end_p = local_pub_addr_s + strlen(local_pub_addr_s); | |
11fdf7f2 | 313 | entity_addrvec_t local_pub_addr; |
7c673cae FG |
314 | local_pub_addr.parse(local_pub_addr_s, &end_p); |
315 | ||
9f95a23c | 316 | m->add(mon_info_t("filled_pub_addr", entity_addrvec_t(local_pub_addr), 1, 1)); |
7c673cae | 317 | |
11fdf7f2 | 318 | m->add("empty_addr_zero", entity_addrvec_t()); |
7c673cae FG |
319 | } |
320 | o.push_back(m); | |
321 | } | |
322 | ||
323 | // read from/write to a file | |
324 | int MonMap::write(const char *fn) | |
325 | { | |
326 | // encode | |
9f95a23c | 327 | ceph::buffer::list bl; |
7c673cae FG |
328 | encode(bl, CEPH_FEATURES_ALL); |
329 | ||
330 | return bl.write_file(fn); | |
331 | } | |
332 | ||
333 | int MonMap::read(const char *fn) | |
334 | { | |
335 | // read | |
9f95a23c | 336 | ceph::buffer::list bl; |
7c673cae FG |
337 | std::string error; |
338 | int r = bl.read_file(fn, &error); | |
339 | if (r < 0) | |
340 | return r; | |
341 | decode(bl); | |
342 | return 0; | |
343 | } | |
344 | ||
345 | void MonMap::print_summary(ostream& out) const | |
346 | { | |
347 | out << "e" << epoch << ": " | |
348 | << mon_info.size() << " mons at {"; | |
349 | // the map that we used to print, as it was, no longer | |
350 | // maps strings to the monitor's public address, but to | |
351 | // mon_info_t instead. As such, print the map in a way | |
352 | // that keeps the expected format. | |
353 | bool has_printed = false; | |
9f95a23c | 354 | for (auto p = mon_info.begin(); p != mon_info.end(); ++p) { |
7c673cae FG |
355 | if (has_printed) |
356 | out << ","; | |
11fdf7f2 | 357 | out << p->first << "=" << p->second.public_addrs; |
7c673cae FG |
358 | has_printed = true; |
359 | } | |
360 | out << "}"; | |
361 | } | |
362 | ||
363 | void MonMap::print(ostream& out) const | |
364 | { | |
365 | out << "epoch " << epoch << "\n"; | |
366 | out << "fsid " << fsid << "\n"; | |
367 | out << "last_changed " << last_changed << "\n"; | |
368 | out << "created " << created << "\n"; | |
f67539c2 | 369 | out << "min_mon_release " << to_integer<unsigned>(min_mon_release) |
9f95a23c | 370 | << " (" << min_mon_release << ")\n"; |
f67539c2 | 371 | out << "election_strategy: " << strategy << "\n"; |
a4b75251 TL |
372 | if (stretch_mode_enabled) { |
373 | out << "stretch_mode_enabled " << stretch_mode_enabled << "\n"; | |
374 | out << "tiebreaker_mon " << tiebreaker_mon << "\n"; | |
375 | } | |
376 | if (stretch_mode_enabled || | |
377 | !disallowed_leaders.empty()) { | |
f67539c2 TL |
378 | out << "disallowed_leaders " << disallowed_leaders << "\n"; |
379 | } | |
7c673cae | 380 | unsigned i = 0; |
9f95a23c | 381 | for (auto p = ranks.begin(); p != ranks.end(); ++p) { |
f67539c2 TL |
382 | const auto &mi = mon_info.find(*p); |
383 | ceph_assert(mi != mon_info.end()); | |
384 | out << i++ << ": " << mi->second.public_addrs << " mon." << *p; | |
385 | if (!mi->second.crush_loc.empty()) { | |
386 | out << "; crush_location " << mi->second.crush_loc; | |
387 | } | |
388 | out << "\n"; | |
7c673cae FG |
389 | } |
390 | } | |
391 | ||
392 | void MonMap::dump(Formatter *f) const | |
393 | { | |
394 | f->dump_unsigned("epoch", epoch); | |
395 | f->dump_stream("fsid") << fsid; | |
9f95a23c TL |
396 | last_changed.gmtime(f->dump_stream("modified")); |
397 | created.gmtime(f->dump_stream("created")); | |
f67539c2 TL |
398 | f->dump_unsigned("min_mon_release", to_integer<unsigned>(min_mon_release)); |
399 | f->dump_string("min_mon_release_name", to_string(min_mon_release)); | |
400 | f->dump_int ("election_strategy", strategy); | |
401 | f->dump_stream("disallowed_leaders: ") << disallowed_leaders; | |
402 | f->dump_bool("stretch_mode", stretch_mode_enabled); | |
a4b75251 | 403 | f->dump_string("tiebreaker_mon", tiebreaker_mon); |
7c673cae FG |
404 | f->open_object_section("features"); |
405 | persistent_features.dump(f, "persistent"); | |
406 | optional_features.dump(f, "optional"); | |
407 | f->close_section(); | |
408 | f->open_array_section("mons"); | |
409 | int i = 0; | |
9f95a23c | 410 | for (auto p = ranks.begin(); p != ranks.end(); ++p, ++i) { |
7c673cae FG |
411 | f->open_object_section("mon"); |
412 | f->dump_int("rank", i); | |
413 | f->dump_string("name", *p); | |
11fdf7f2 TL |
414 | f->dump_object("public_addrs", get_addrs(*p)); |
415 | // compat: make these look like pre-nautilus entity_addr_t | |
416 | f->dump_stream("addr") << get_addrs(*p).get_legacy_str(); | |
417 | f->dump_stream("public_addr") << get_addrs(*p).get_legacy_str(); | |
9f95a23c TL |
418 | f->dump_unsigned("priority", get_priority(*p)); |
419 | f->dump_unsigned("weight", get_weight(*p)); | |
f67539c2 TL |
420 | const auto &mi = mon_info.find(*p); |
421 | // we don't need to assert this validity as all the get_* functions did | |
422 | f->dump_stream("crush_location") << mi->second.crush_loc; | |
7c673cae FG |
423 | f->close_section(); |
424 | } | |
425 | f->close_section(); | |
426 | } | |
427 | ||
9f95a23c TL |
428 | void MonMap::dump_summary(Formatter *f) const |
429 | { | |
430 | f->dump_unsigned("epoch", epoch); | |
f67539c2 | 431 | f->dump_string("min_mon_release_name", to_string(min_mon_release)); |
9f95a23c TL |
432 | f->dump_unsigned("num_mons", ranks.size()); |
433 | } | |
434 | ||
11fdf7f2 TL |
435 | // an ambiguous mon addr may be legacy or may be msgr2--we aren' sure. |
436 | // when that happens we need to try them both (unless we can | |
437 | // reasonably infer from the port number which it is). | |
438 | void MonMap::_add_ambiguous_addr(const string& name, | |
9f95a23c TL |
439 | entity_addr_t addr, |
440 | int priority, | |
441 | int weight, | |
442 | bool for_mkfs) | |
11fdf7f2 TL |
443 | { |
444 | if (addr.get_type() != entity_addr_t::TYPE_ANY) { | |
445 | // a v1: or v2: prefix was specified | |
446 | if (addr.get_port() == 0) { | |
447 | // use default port | |
9f95a23c | 448 | if (addr.get_type() == entity_addr_t::TYPE_LEGACY) { |
11fdf7f2 TL |
449 | addr.set_port(CEPH_MON_PORT_LEGACY); |
450 | } else if (addr.get_type() == entity_addr_t::TYPE_MSGR2) { | |
451 | addr.set_port(CEPH_MON_PORT_IANA); | |
452 | } else { | |
453 | // wth | |
454 | return; | |
455 | } | |
456 | if (!contains(addr)) { | |
9f95a23c | 457 | add(name, entity_addrvec_t(addr), priority, weight); |
11fdf7f2 TL |
458 | } |
459 | } else { | |
460 | if (!contains(addr)) { | |
9f95a23c | 461 | add(name, entity_addrvec_t(addr), priority, weight); |
11fdf7f2 TL |
462 | } |
463 | } | |
464 | } else { | |
465 | // no v1: or v2: prefix specified | |
466 | if (addr.get_port() == CEPH_MON_PORT_LEGACY) { | |
467 | // legacy port implies legacy addr | |
468 | addr.set_type(entity_addr_t::TYPE_LEGACY); | |
469 | if (!contains(addr)) { | |
470 | if (!for_mkfs) { | |
9f95a23c | 471 | add(name + "-legacy", entity_addrvec_t(addr), priority, weight); |
11fdf7f2 | 472 | } else { |
9f95a23c | 473 | add(name, entity_addrvec_t(addr), priority, weight); |
11fdf7f2 TL |
474 | } |
475 | } | |
476 | } else if (addr.get_port() == CEPH_MON_PORT_IANA) { | |
477 | // iana port implies msgr2 addr | |
478 | addr.set_type(entity_addr_t::TYPE_MSGR2); | |
479 | if (!contains(addr)) { | |
9f95a23c | 480 | add(name, entity_addrvec_t(addr), priority, weight); |
11fdf7f2 TL |
481 | } |
482 | } else if (addr.get_port() == 0) { | |
483 | // no port; include both msgr2 and legacy ports | |
484 | if (!for_mkfs) { | |
485 | addr.set_type(entity_addr_t::TYPE_MSGR2); | |
486 | addr.set_port(CEPH_MON_PORT_IANA); | |
487 | if (!contains(addr)) { | |
9f95a23c | 488 | add(name, entity_addrvec_t(addr), priority, weight); |
11fdf7f2 TL |
489 | } |
490 | addr.set_type(entity_addr_t::TYPE_LEGACY); | |
491 | addr.set_port(CEPH_MON_PORT_LEGACY); | |
492 | if (!contains(addr)) { | |
9f95a23c | 493 | add(name + "-legacy", entity_addrvec_t(addr), priority, weight); |
11fdf7f2 TL |
494 | } |
495 | } else { | |
496 | entity_addrvec_t av; | |
497 | addr.set_type(entity_addr_t::TYPE_MSGR2); | |
498 | addr.set_port(CEPH_MON_PORT_IANA); | |
499 | av.v.push_back(addr); | |
500 | addr.set_type(entity_addr_t::TYPE_LEGACY); | |
501 | addr.set_port(CEPH_MON_PORT_LEGACY); | |
502 | av.v.push_back(addr); | |
503 | if (!contains(av)) { | |
9f95a23c | 504 | add(name, av, priority, weight); |
11fdf7f2 TL |
505 | } |
506 | } | |
507 | } else { | |
508 | addr.set_type(entity_addr_t::TYPE_MSGR2); | |
509 | if (!contains(addr)) { | |
9f95a23c | 510 | add(name, entity_addrvec_t(addr), priority, weight); |
11fdf7f2 TL |
511 | } |
512 | if (!for_mkfs) { | |
513 | // try legacy on same port too | |
514 | addr.set_type(entity_addr_t::TYPE_LEGACY); | |
515 | if (!contains(addr)) { | |
9f95a23c | 516 | add(name + "-legacy", entity_addrvec_t(addr), priority, weight); |
11fdf7f2 TL |
517 | } |
518 | } | |
519 | } | |
520 | } | |
521 | } | |
7c673cae | 522 | |
f91f0fd5 TL |
523 | void MonMap::init_with_addrs(const std::vector<entity_addrvec_t>& addrs, |
524 | bool for_mkfs, | |
525 | std::string_view prefix) | |
526 | { | |
527 | char id = 'a'; | |
528 | for (auto& addr : addrs) { | |
529 | string name{prefix}; | |
530 | name += id++; | |
531 | if (addr.v.size() == 1) { | |
532 | _add_ambiguous_addr(name, addr.front(), 0, 0, for_mkfs); | |
533 | } else { | |
534 | // they specified an addrvec, so let's assume they also specified | |
535 | // the addr *type* and *port*. (we could possibly improve this?) | |
536 | add(name, addr, 0); | |
537 | } | |
538 | } | |
539 | } | |
540 | ||
11fdf7f2 TL |
541 | int MonMap::init_with_ips(const std::string& ips, |
542 | bool for_mkfs, | |
f91f0fd5 | 543 | std::string_view prefix) |
7c673cae | 544 | { |
11fdf7f2 TL |
545 | vector<entity_addrvec_t> addrs; |
546 | if (!parse_ip_port_vec( | |
547 | ips.c_str(), addrs, | |
548 | entity_addr_t::TYPE_ANY)) { | |
549 | return -EINVAL; | |
550 | } | |
551 | if (addrs.empty()) | |
552 | return -ENOENT; | |
f91f0fd5 | 553 | init_with_addrs(addrs, for_mkfs, prefix); |
11fdf7f2 TL |
554 | return 0; |
555 | } | |
7c673cae | 556 | |
11fdf7f2 TL |
557 | int MonMap::init_with_hosts(const std::string& hostlist, |
558 | bool for_mkfs, | |
f91f0fd5 | 559 | std::string_view prefix) |
11fdf7f2 | 560 | { |
7c673cae | 561 | // maybe they passed us a DNS-resolvable name |
11fdf7f2 | 562 | char *hosts = resolve_addrs(hostlist.c_str()); |
7c673cae FG |
563 | if (!hosts) |
564 | return -EINVAL; | |
11fdf7f2 TL |
565 | |
566 | vector<entity_addrvec_t> addrs; | |
567 | bool success = parse_ip_port_vec( | |
568 | hosts, addrs, | |
92f5a8d4 | 569 | entity_addr_t::TYPE_ANY); |
7c673cae FG |
570 | free(hosts); |
571 | if (!success) | |
572 | return -EINVAL; | |
7c673cae FG |
573 | if (addrs.empty()) |
574 | return -ENOENT; | |
f91f0fd5 | 575 | init_with_addrs(addrs, for_mkfs, prefix); |
11fdf7f2 | 576 | calc_legacy_ranks(); |
7c673cae FG |
577 | return 0; |
578 | } | |
579 | ||
580 | void MonMap::set_initial_members(CephContext *cct, | |
581 | list<std::string>& initial_members, | |
11fdf7f2 TL |
582 | string my_name, |
583 | const entity_addrvec_t& my_addrs, | |
584 | set<entity_addrvec_t> *removed) | |
7c673cae FG |
585 | { |
586 | // remove non-initial members | |
587 | unsigned i = 0; | |
588 | while (i < size()) { | |
589 | string n = get_name(i); | |
11fdf7f2 TL |
590 | if (std::find(initial_members.begin(), initial_members.end(), n) |
591 | != initial_members.end()) { | |
592 | lgeneric_dout(cct, 1) << " keeping " << n << " " << get_addrs(i) << dendl; | |
7c673cae FG |
593 | i++; |
594 | continue; | |
595 | } | |
596 | ||
11fdf7f2 TL |
597 | lgeneric_dout(cct, 1) << " removing " << get_name(i) << " " << get_addrs(i) |
598 | << dendl; | |
599 | if (removed) { | |
600 | removed->insert(get_addrs(i)); | |
601 | } | |
7c673cae | 602 | remove(n); |
11fdf7f2 | 603 | ceph_assert(!contains(n)); |
7c673cae FG |
604 | } |
605 | ||
606 | // add missing initial members | |
11fdf7f2 TL |
607 | for (auto& p : initial_members) { |
608 | if (!contains(p)) { | |
609 | if (p == my_name) { | |
610 | lgeneric_dout(cct, 1) << " adding self " << p << " " << my_addrs | |
611 | << dendl; | |
612 | add(p, my_addrs); | |
7c673cae FG |
613 | } else { |
614 | entity_addr_t a; | |
615 | a.set_type(entity_addr_t::TYPE_LEGACY); | |
616 | a.set_family(AF_INET); | |
617 | for (int n=1; ; n++) { | |
618 | a.set_nonce(n); | |
619 | if (!contains(a)) | |
620 | break; | |
621 | } | |
11fdf7f2 TL |
622 | lgeneric_dout(cct, 1) << " adding " << p << " " << a << dendl; |
623 | add(p, entity_addrvec_t(a)); | |
7c673cae | 624 | } |
11fdf7f2 | 625 | ceph_assert(contains(p)); |
7c673cae FG |
626 | } |
627 | } | |
11fdf7f2 | 628 | calc_legacy_ranks(); |
7c673cae FG |
629 | } |
630 | ||
11fdf7f2 TL |
631 | int MonMap::init_with_config_file(const ConfigProxy& conf, |
632 | std::ostream& errout) | |
7c673cae | 633 | { |
11fdf7f2 TL |
634 | std::vector<std::string> sections; |
635 | int ret = conf.get_all_sections(sections); | |
7c673cae FG |
636 | if (ret) { |
637 | errout << "Unable to find any monitors in the configuration " | |
638 | << "file, because there was an error listing the sections. error " | |
639 | << ret << std::endl; | |
640 | return -ENOENT; | |
641 | } | |
11fdf7f2 TL |
642 | std::vector<std::string> mon_names; |
643 | for (const auto& section : sections) { | |
644 | if (section.substr(0, 4) == "mon." && section.size() > 4) { | |
645 | mon_names.push_back(section.substr(4)); | |
7c673cae FG |
646 | } |
647 | } | |
648 | ||
649 | // Find an address for each monitor in the config file. | |
11fdf7f2 TL |
650 | for (const auto& mon_name : mon_names) { |
651 | std::vector<std::string> sections; | |
7c673cae FG |
652 | std::string m_name("mon"); |
653 | m_name += "."; | |
11fdf7f2 | 654 | m_name += mon_name; |
7c673cae FG |
655 | sections.push_back(m_name); |
656 | sections.push_back("mon"); | |
657 | sections.push_back("global"); | |
658 | std::string val; | |
11fdf7f2 | 659 | int res = conf.get_val_from_conf_file(sections, "mon addr", val, true); |
7c673cae | 660 | if (res) { |
11fdf7f2 TL |
661 | errout << "failed to get an address for mon." << mon_name |
662 | << ": error " << res << std::endl; | |
7c673cae FG |
663 | continue; |
664 | } | |
11fdf7f2 TL |
665 | // the 'mon addr' field is a legacy field, so assume anything |
666 | // there on a weird port is a v1 address, and do not handle | |
667 | // addrvecs. | |
7c673cae | 668 | entity_addr_t addr; |
11fdf7f2 TL |
669 | if (!addr.parse(val.c_str(), nullptr, entity_addr_t::TYPE_LEGACY)) { |
670 | errout << "unable to parse address for mon." << mon_name | |
671 | << ": addr='" << val << "'" << std::endl; | |
7c673cae FG |
672 | continue; |
673 | } | |
11fdf7f2 TL |
674 | if (addr.get_port() == 0) { |
675 | addr.set_port(CEPH_MON_PORT_LEGACY); | |
676 | } | |
224ce89b | 677 | uint16_t priority = 0; |
11fdf7f2 | 678 | if (!conf.get_val_from_conf_file(sections, "mon priority", val, false)) { |
224ce89b WB |
679 | try { |
680 | priority = std::stoul(val); | |
681 | } catch (std::logic_error&) { | |
11fdf7f2 | 682 | errout << "unable to parse priority for mon." << mon_name |
224ce89b WB |
683 | << ": priority='" << val << "'" << std::endl; |
684 | continue; | |
685 | } | |
686 | } | |
9f95a23c TL |
687 | uint16_t weight = 0; |
688 | if (!conf.get_val_from_conf_file(sections, "mon weight", val, false)) { | |
689 | try { | |
690 | weight = std::stoul(val); | |
691 | } catch (std::logic_error&) { | |
692 | errout << "unable to parse weight for mon." << mon_name | |
693 | << ": weight='" << val << "'" | |
694 | << std::endl; | |
695 | continue; | |
696 | } | |
697 | } | |
11fdf7f2 | 698 | |
9f95a23c | 699 | // make sure this mon isn't already in the map |
7c673cae FG |
700 | if (contains(addr)) |
701 | remove(get_name(addr)); | |
11fdf7f2 TL |
702 | if (contains(mon_name)) |
703 | remove(mon_name); | |
9f95a23c | 704 | _add_ambiguous_addr(mon_name, addr, priority, weight, false); |
11fdf7f2 TL |
705 | } |
706 | return 0; | |
707 | } | |
708 | ||
f67539c2 TL |
709 | void MonMap::check_health(health_check_map_t *checks) const |
710 | { | |
711 | if (stretch_mode_enabled) { | |
712 | list<string> detail; | |
713 | for (auto& p : mon_info) { | |
714 | if (p.second.crush_loc.empty()) { | |
715 | ostringstream ss; | |
716 | ss << "mon " << p.first << " has no location set while in stretch mode"; | |
717 | detail.push_back(ss.str()); | |
718 | } | |
719 | } | |
720 | if (!detail.empty()) { | |
721 | ostringstream ss; | |
722 | ss << detail.size() << " monitor(s) have no location set while in stretch mode" | |
723 | << "; this may cause issues with failover, OSD connections, netsplit handling, etc"; | |
724 | auto& d = checks->add("MON_LOCATION_NOT_SET", HEALTH_WARN, | |
725 | ss.str(), detail.size()); | |
726 | d.detail.swap(detail); | |
727 | } | |
728 | } | |
729 | } | |
730 | ||
11fdf7f2 TL |
731 | #ifdef WITH_SEASTAR |
732 | ||
733 | using namespace seastar; | |
734 | ||
f67539c2 | 735 | seastar::future<> MonMap::read_monmap(const std::string& monmap) |
11fdf7f2 TL |
736 | { |
737 | return open_file_dma(monmap, open_flags::ro).then([this] (file f) { | |
738 | return f.size().then([this, f = std::move(f)](size_t s) { | |
739 | return do_with(make_file_input_stream(f), [this, s](input_stream<char>& in) { | |
740 | return in.read_exactly(s).then([this](temporary_buffer<char> buf) { | |
9f95a23c TL |
741 | ceph::buffer::list bl; |
742 | bl.push_back(ceph::buffer::ptr_node::create( | |
743 | ceph::buffer::create(std::move(buf)))); | |
11fdf7f2 TL |
744 | decode(bl); |
745 | }); | |
746 | }); | |
747 | }); | |
748 | }); | |
749 | } | |
7c673cae | 750 | |
f67539c2 | 751 | seastar::future<> MonMap::init_with_dns_srv(bool for_mkfs, const std::string& name) |
11fdf7f2 TL |
752 | { |
753 | string domain; | |
754 | string service = name; | |
755 | // check if domain is also provided and extract it from srv_name | |
756 | size_t idx = name.find("_"); | |
757 | if (idx != name.npos) { | |
758 | domain = name.substr(idx + 1); | |
759 | service = name.substr(0, idx); | |
7c673cae | 760 | } |
9f95a23c TL |
761 | return seastar::net::dns::get_srv_records( |
762 | seastar::net::dns_resolver::srv_proto::tcp, | |
763 | service, domain).then([this](seastar::net::dns_resolver::srv_records records) { | |
11fdf7f2 | 764 | return parallel_for_each(records, [this](auto record) { |
9f95a23c TL |
765 | return seastar::net::dns::resolve_name(record.target).then( |
766 | [record,this](seastar::net::inet_address a) { | |
11fdf7f2 TL |
767 | // the resolved address does not contain ceph specific info like nonce |
768 | // nonce or msgr proto (legacy, msgr2), so set entity_addr_t manually | |
769 | entity_addr_t addr; | |
770 | addr.set_type(entity_addr_t::TYPE_ANY); | |
771 | addr.set_family(int(a.in_family())); | |
772 | addr.set_port(record.port); | |
773 | switch (a.in_family()) { | |
9f95a23c | 774 | case seastar::net::inet_address::family::INET: |
11fdf7f2 TL |
775 | addr.in4_addr().sin_addr = a; |
776 | break; | |
9f95a23c | 777 | case seastar::net::inet_address::family::INET6: |
11fdf7f2 TL |
778 | addr.in6_addr().sin6_addr = a; |
779 | break; | |
780 | } | |
9f95a23c TL |
781 | _add_ambiguous_addr(record.target, |
782 | addr, | |
783 | record.priority, | |
784 | record.weight, | |
785 | false); | |
11fdf7f2 TL |
786 | }); |
787 | }); | |
788 | }).handle_exception_type([](const std::system_error& e) { | |
789 | // ignore DNS failures | |
790 | return seastar::make_ready_future<>(); | |
791 | }); | |
792 | } | |
7c673cae | 793 | |
9f95a23c | 794 | seastar::future<> MonMap::build_monmap(const crimson::common::ConfigProxy& conf, |
11fdf7f2 TL |
795 | bool for_mkfs) |
796 | { | |
797 | // -m foo? | |
798 | if (const auto mon_host = conf.get_val<std::string>("mon_host"); | |
799 | !mon_host.empty()) { | |
800 | if (auto ret = init_with_ips(mon_host, for_mkfs, "noname-"); ret == 0) { | |
801 | return make_ready_future<>(); | |
7c673cae | 802 | } |
11fdf7f2 TL |
803 | // TODO: resolve_addrs() is a blocking call |
804 | if (auto ret = init_with_hosts(mon_host, for_mkfs, "noname-"); ret == 0) { | |
805 | return make_ready_future<>(); | |
806 | } else { | |
807 | throw std::runtime_error(cpp_strerror(ret)); | |
808 | } | |
809 | } | |
7c673cae | 810 | |
11fdf7f2 TL |
811 | // What monitors are in the config file? |
812 | ostringstream errout; | |
813 | if (auto ret = init_with_config_file(conf, errout); ret < 0) { | |
814 | throw std::runtime_error(errout.str()); | |
815 | } | |
816 | if (size() > 0) { | |
817 | return make_ready_future<>(); | |
818 | } | |
819 | // no info found from conf options lets try use DNS SRV records | |
820 | const string srv_name = conf.get_val<std::string>("mon_dns_srv_name"); | |
821 | return init_with_dns_srv(for_mkfs, srv_name).then([this] { | |
822 | if (size() == 0) { | |
823 | throw std::runtime_error("no monitors specified to connect to."); | |
824 | } | |
825 | }); | |
826 | } | |
7c673cae | 827 | |
f67539c2 | 828 | seastar::future<> MonMap::build_initial(const crimson::common::ConfigProxy& conf, bool for_mkfs) |
11fdf7f2 TL |
829 | { |
830 | // file? | |
831 | if (const auto monmap = conf.get_val<std::string>("monmap"); | |
832 | !monmap.empty()) { | |
833 | return read_monmap(monmap); | |
834 | } else { | |
835 | // fsid from conf? | |
836 | if (const auto new_fsid = conf.get_val<uuid_d>("fsid"); | |
837 | !new_fsid.is_zero()) { | |
838 | fsid = new_fsid; | |
7c673cae | 839 | } |
11fdf7f2 TL |
840 | return build_monmap(conf, for_mkfs).then([this] { |
841 | created = ceph_clock_now(); | |
842 | last_changed = created; | |
843 | calc_legacy_ranks(); | |
844 | }); | |
845 | } | |
846 | } | |
847 | ||
848 | #else // WITH_SEASTAR | |
849 | ||
850 | int MonMap::init_with_monmap(const std::string& monmap, std::ostream& errout) | |
851 | { | |
852 | int r; | |
853 | try { | |
854 | r = read(monmap.c_str()); | |
9f95a23c | 855 | } catch (ceph::buffer::error&) { |
11fdf7f2 TL |
856 | r = -EINVAL; |
857 | } | |
858 | if (r >= 0) | |
859 | return 0; | |
860 | errout << "unable to read/decode monmap from " << monmap | |
861 | << ": " << cpp_strerror(-r) << std::endl; | |
862 | return r; | |
863 | } | |
864 | ||
865 | int MonMap::init_with_dns_srv(CephContext* cct, | |
866 | std::string srv_name, | |
867 | bool for_mkfs, | |
868 | std::ostream& errout) | |
869 | { | |
870 | string domain; | |
871 | // check if domain is also provided and extract it from srv_name | |
872 | size_t idx = srv_name.find("_"); | |
873 | if (idx != string::npos) { | |
874 | domain = srv_name.substr(idx + 1); | |
875 | srv_name = srv_name.substr(0, idx); | |
876 | } | |
877 | ||
878 | map<string, DNSResolver::Record> records; | |
879 | if (DNSResolver::get_instance()->resolve_srv_hosts(cct, srv_name, | |
880 | DNSResolver::SRV_Protocol::TCP, domain, &records) != 0) { | |
881 | ||
882 | errout << "unable to get monitor info from DNS SRV with service name: " | |
883 | << "ceph-mon" << std::endl; | |
884 | return -1; | |
885 | } else { | |
886 | for (auto& record : records) { | |
887 | record.second.addr.set_type(entity_addr_t::TYPE_ANY); | |
9f95a23c TL |
888 | _add_ambiguous_addr(record.first, |
889 | record.second.addr, | |
890 | record.second.priority, | |
891 | record.second.weight, | |
892 | false); | |
7c673cae | 893 | } |
11fdf7f2 TL |
894 | return 0; |
895 | } | |
896 | } | |
897 | ||
898 | int MonMap::build_initial(CephContext *cct, bool for_mkfs, ostream& errout) | |
899 | { | |
900 | const auto& conf = cct->_conf; | |
f91f0fd5 TL |
901 | |
902 | // mon_host_override? | |
903 | auto mon_host_override = conf.get_val<std::string>("mon_host_override"); | |
904 | if (!mon_host_override.empty()) { | |
905 | lgeneric_dout(cct, 1) << "Using mon_host_override " << mon_host_override << dendl; | |
906 | auto ret = init_with_ips(mon_host_override, for_mkfs, "noname-"); | |
907 | if (ret == -EINVAL) { | |
908 | ret = init_with_hosts(mon_host_override, for_mkfs, "noname-"); | |
909 | } | |
910 | if (ret < 0) { | |
911 | errout << "unable to parse addrs in '" << mon_host_override << "'" | |
912 | << std::endl; | |
913 | } | |
914 | return ret; | |
915 | } | |
916 | ||
917 | // cct? | |
918 | auto addrs = cct->get_mon_addrs(); | |
919 | if (addrs != nullptr && (addrs->size() > 0)) { | |
920 | init_with_addrs(*addrs, for_mkfs, "noname-"); | |
921 | return 0; | |
922 | } | |
923 | ||
11fdf7f2 TL |
924 | // file? |
925 | if (const auto monmap = conf.get_val<std::string>("monmap"); | |
926 | !monmap.empty()) { | |
927 | return init_with_monmap(monmap, errout); | |
7c673cae FG |
928 | } |
929 | ||
11fdf7f2 TL |
930 | // fsid from conf? |
931 | if (const auto new_fsid = conf.get_val<uuid_d>("fsid"); | |
932 | !new_fsid.is_zero()) { | |
933 | fsid = new_fsid; | |
934 | } | |
935 | // -m foo? | |
936 | if (const auto mon_host = conf.get_val<std::string>("mon_host"); | |
937 | !mon_host.empty()) { | |
938 | auto ret = init_with_ips(mon_host, for_mkfs, "noname-"); | |
939 | if (ret == -EINVAL) { | |
940 | ret = init_with_hosts(mon_host, for_mkfs, "noname-"); | |
941 | } | |
942 | if (ret < 0) { | |
943 | errout << "unable to parse addrs in '" << mon_host << "'" | |
944 | << std::endl; | |
945 | return ret; | |
946 | } | |
947 | } | |
948 | if (size() == 0) { | |
949 | // What monitors are in the config file? | |
950 | if (auto ret = init_with_config_file(conf, errout); ret < 0) { | |
951 | return ret; | |
952 | } | |
953 | } | |
954 | if (size() == 0) { | |
955 | // no info found from conf options lets try use DNS SRV records | |
956 | string srv_name = conf.get_val<std::string>("mon_dns_srv_name"); | |
957 | if (auto ret = init_with_dns_srv(cct, srv_name, for_mkfs, errout); ret < 0) { | |
958 | return -ENOENT; | |
959 | } | |
960 | } | |
7c673cae FG |
961 | if (size() == 0) { |
962 | errout << "no monitors specified to connect to." << std::endl; | |
963 | return -ENOENT; | |
964 | } | |
f67539c2 | 965 | strategy = static_cast<election_strategy>(conf.get_val<uint64_t>("mon_election_default_strategy")); |
7c673cae FG |
966 | created = ceph_clock_now(); |
967 | last_changed = created; | |
11fdf7f2 | 968 | calc_legacy_ranks(); |
7c673cae FG |
969 | return 0; |
970 | } | |
11fdf7f2 | 971 | #endif // WITH_SEASTAR |