]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | |
2 | #include "MonMap.h" | |
3 | ||
4 | #include <algorithm> | |
5 | #include <sys/types.h> | |
6 | #include <sys/stat.h> | |
7 | #include <fcntl.h> | |
8 | ||
9 | #include "common/Formatter.h" | |
10 | ||
11 | #include "include/ceph_features.h" | |
12 | #include "include/addr_parsing.h" | |
13 | #include "common/ceph_argparse.h" | |
14 | #include "common/dns_resolve.h" | |
15 | #include "common/errno.h" | |
16 | ||
17 | #include "common/dout.h" | |
18 | ||
19 | using ceph::Formatter; | |
20 | ||
21 | void mon_info_t::encode(bufferlist& bl, uint64_t features) const | |
22 | { | |
224ce89b | 23 | ENCODE_START(2, 1, bl); |
7c673cae FG |
24 | ::encode(name, bl); |
25 | ::encode(public_addr, bl, features); | |
224ce89b | 26 | ::encode(priority, bl); |
7c673cae FG |
27 | ENCODE_FINISH(bl); |
28 | } | |
29 | ||
30 | void mon_info_t::decode(bufferlist::iterator& p) | |
31 | { | |
32 | DECODE_START(1, p); | |
33 | ::decode(name, p); | |
34 | ::decode(public_addr, p); | |
224ce89b WB |
35 | if (struct_v >= 2) { |
36 | ::decode(priority, p); | |
37 | } | |
7c673cae FG |
38 | DECODE_FINISH(p); |
39 | } | |
40 | ||
41 | void mon_info_t::print(ostream& out) const | |
42 | { | |
43 | out << "mon." << name | |
224ce89b WB |
44 | << " public " << public_addr |
45 | << " priority " << priority; | |
7c673cae FG |
46 | } |
47 | ||
48 | void MonMap::sanitize_mons(map<string,entity_addr_t>& o) | |
49 | { | |
50 | // if mon_info is populated, it means we decoded a map encoded | |
51 | // by someone who understands the new format (i.e., is able to | |
52 | // encode 'mon_info'). This means they must also have provided | |
53 | // a properly populated 'mon_addr' (which we have dropped with | |
54 | // this patch), 'o' being the contents of said map. In this | |
55 | // case, 'o' must have the same number of entries as 'mon_info'. | |
56 | // | |
57 | // Also, for each entry in 'o', there has to be a matching | |
58 | // 'mon_info' entry, properly populated with a name and a matching | |
59 | // 'public_addr'. | |
60 | // | |
61 | // OTOH, if 'mon_info' is not populated, it means the one that | |
62 | // originally encoded the map does not know the new format, and | |
63 | // 'o' will be our only source of info about the monitors in the | |
64 | // cluster -- and we will use it to populate our 'mon_info' map. | |
65 | ||
66 | bool has_mon_info = false; | |
67 | if (mon_info.size() > 0) { | |
68 | assert(o.size() == mon_info.size()); | |
69 | has_mon_info = true; | |
70 | } | |
71 | ||
72 | for (auto p : o) { | |
73 | if (has_mon_info) { | |
74 | // make sure the info we have is accurate | |
75 | assert(mon_info.count(p.first)); | |
76 | assert(mon_info[p.first].name == p.first); | |
77 | assert(mon_info[p.first].public_addr == p.second); | |
78 | } else { | |
79 | mon_info_t &m = mon_info[p.first]; | |
80 | m.name = p.first; | |
81 | m.public_addr = p.second; | |
82 | } | |
83 | } | |
84 | } | |
85 | ||
86 | namespace { | |
87 | struct rank_cmp { | |
88 | bool operator()(const mon_info_t &a, const mon_info_t &b) const { | |
89 | if (a.public_addr == b.public_addr) | |
90 | return a.name < b.name; | |
91 | return a.public_addr < b.public_addr; | |
92 | } | |
93 | }; | |
94 | } | |
95 | ||
96 | void MonMap::calc_ranks() { | |
97 | ||
98 | ranks.resize(mon_info.size()); | |
99 | addr_mons.clear(); | |
100 | ||
101 | // Used to order entries according to public_addr, because that's | |
102 | // how the ranks are expected to be ordered by. We may expand this | |
103 | // later on, according to some other criteria, by specifying a | |
104 | // different comparator. | |
105 | // | |
106 | // Please note that we use a 'set' here instead of resorting to | |
107 | // std::sort() because we need more info than that's available in | |
108 | // the vector. The vector will thus be ordered by, e.g., public_addr | |
109 | // while only containing the names of each individual monitor. | |
110 | // The only way of achieving this with std::sort() would be to first | |
111 | // insert every mon_info_t entry into a vector 'foo', std::sort() 'foo' | |
112 | // with custom comparison functions, and then copy each invidual entry | |
113 | // to a new vector. Unless there's a simpler way, we don't think the | |
114 | // added complexity makes up for the additional memory usage of a 'set'. | |
115 | set<mon_info_t, rank_cmp> tmp; | |
116 | ||
117 | for (map<string,mon_info_t>::iterator p = mon_info.begin(); | |
118 | p != mon_info.end(); | |
119 | ++p) { | |
120 | mon_info_t &m = p->second; | |
121 | tmp.insert(m); | |
122 | ||
123 | // populate addr_mons | |
124 | assert(addr_mons.count(m.public_addr) == 0); | |
125 | addr_mons[m.public_addr] = m.name; | |
126 | } | |
127 | ||
128 | // map the set to the actual ranks etc | |
129 | unsigned i = 0; | |
130 | for (set<mon_info_t>::iterator p = tmp.begin(); | |
131 | p != tmp.end(); | |
132 | ++p, ++i) { | |
133 | ranks[i] = p->name; | |
134 | } | |
135 | } | |
136 | ||
137 | void MonMap::encode(bufferlist& blist, uint64_t con_features) const | |
138 | { | |
139 | /* we keep the mon_addr map when encoding to ensure compatibility | |
140 | * with clients and other monitors that do not yet support the 'mons' | |
141 | * map. This map keeps its original behavior, containing a mapping of | |
142 | * monitor id (i.e., 'foo' in 'mon.foo') to the monitor's public | |
143 | * address -- which is obtained from the public address of each entry | |
144 | * in the 'mons' map. | |
145 | */ | |
146 | map<string,entity_addr_t> mon_addr; | |
147 | for (map<string,mon_info_t>::const_iterator p = mon_info.begin(); | |
148 | p != mon_info.end(); | |
149 | ++p) { | |
150 | mon_addr[p->first] = p->second.public_addr; | |
151 | } | |
152 | ||
153 | if ((con_features & CEPH_FEATURE_MONNAMES) == 0) { | |
154 | __u16 v = 1; | |
155 | ::encode(v, blist); | |
156 | ::encode_raw(fsid, blist); | |
157 | ::encode(epoch, blist); | |
158 | vector<entity_inst_t> mon_inst(mon_addr.size()); | |
159 | for (unsigned n = 0; n < mon_addr.size(); n++) | |
160 | mon_inst[n] = get_inst(n); | |
161 | ::encode(mon_inst, blist, con_features); | |
162 | ::encode(last_changed, blist); | |
163 | ::encode(created, blist); | |
164 | return; | |
165 | } | |
166 | ||
167 | if ((con_features & CEPH_FEATURE_MONENC) == 0) { | |
168 | __u16 v = 2; | |
169 | ::encode(v, blist); | |
170 | ::encode_raw(fsid, blist); | |
171 | ::encode(epoch, blist); | |
172 | ::encode(mon_addr, blist, con_features); | |
173 | ::encode(last_changed, blist); | |
174 | ::encode(created, blist); | |
175 | } | |
176 | ||
177 | ENCODE_START(5, 3, blist); | |
178 | ::encode_raw(fsid, blist); | |
179 | ::encode(epoch, blist); | |
180 | ::encode(mon_addr, blist, con_features); | |
181 | ::encode(last_changed, blist); | |
182 | ::encode(created, blist); | |
183 | ::encode(persistent_features, blist); | |
184 | ::encode(optional_features, blist); | |
185 | // this superseeds 'mon_addr' | |
186 | ::encode(mon_info, blist, con_features); | |
187 | ENCODE_FINISH(blist); | |
188 | } | |
189 | ||
190 | void MonMap::decode(bufferlist::iterator &p) | |
191 | { | |
192 | map<string,entity_addr_t> mon_addr; | |
193 | DECODE_START_LEGACY_COMPAT_LEN_16(5, 3, 3, p); | |
194 | ::decode_raw(fsid, p); | |
195 | ::decode(epoch, p); | |
196 | if (struct_v == 1) { | |
197 | vector<entity_inst_t> mon_inst; | |
198 | ::decode(mon_inst, p); | |
199 | for (unsigned i = 0; i < mon_inst.size(); i++) { | |
200 | char n[2]; | |
201 | n[0] = '0' + i; | |
202 | n[1] = 0; | |
203 | string name = n; | |
204 | mon_addr[name] = mon_inst[i].addr; | |
205 | } | |
206 | } else { | |
207 | ::decode(mon_addr, p); | |
208 | } | |
209 | ::decode(last_changed, p); | |
210 | ::decode(created, p); | |
211 | if (struct_v >= 4) { | |
212 | ::decode(persistent_features, p); | |
213 | ::decode(optional_features, p); | |
214 | } | |
215 | if (struct_v >= 5) { | |
216 | ::decode(mon_info, p); | |
217 | } else { | |
218 | // we may be decoding to an existing monmap; if we do not | |
219 | // clear the mon_info map now, we will likely incur in problems | |
220 | // later on MonMap::sanitize_mons() | |
221 | mon_info.clear(); | |
222 | } | |
223 | DECODE_FINISH(p); | |
224 | sanitize_mons(mon_addr); | |
225 | calc_ranks(); | |
226 | } | |
227 | ||
228 | void MonMap::generate_test_instances(list<MonMap*>& o) | |
229 | { | |
230 | o.push_back(new MonMap); | |
231 | o.push_back(new MonMap); | |
232 | o.back()->epoch = 1; | |
233 | o.back()->last_changed = utime_t(123, 456); | |
234 | o.back()->created = utime_t(789, 101112); | |
235 | o.back()->add("one", entity_addr_t()); | |
236 | ||
237 | MonMap *m = new MonMap; | |
238 | { | |
239 | m->epoch = 1; | |
240 | m->last_changed = utime_t(123, 456); | |
241 | ||
242 | entity_addr_t empty_addr_one; | |
243 | empty_addr_one.set_nonce(1); | |
244 | m->add("empty_addr_one", empty_addr_one); | |
245 | entity_addr_t empty_addr_two; | |
246 | empty_addr_two.set_nonce(2); | |
247 | m->add("empty_adrr_two", empty_addr_two); | |
248 | ||
249 | const char *local_pub_addr_s = "127.0.1.2"; | |
250 | ||
251 | const char *end_p = local_pub_addr_s + strlen(local_pub_addr_s); | |
252 | entity_addr_t local_pub_addr; | |
253 | local_pub_addr.parse(local_pub_addr_s, &end_p); | |
254 | ||
224ce89b | 255 | m->add(mon_info_t("filled_pub_addr", local_pub_addr, 1)); |
7c673cae FG |
256 | |
257 | m->add("empty_addr_zero", entity_addr_t()); | |
258 | } | |
259 | o.push_back(m); | |
260 | } | |
261 | ||
262 | // read from/write to a file | |
263 | int MonMap::write(const char *fn) | |
264 | { | |
265 | // encode | |
266 | bufferlist bl; | |
267 | encode(bl, CEPH_FEATURES_ALL); | |
268 | ||
269 | return bl.write_file(fn); | |
270 | } | |
271 | ||
272 | int MonMap::read(const char *fn) | |
273 | { | |
274 | // read | |
275 | bufferlist bl; | |
276 | std::string error; | |
277 | int r = bl.read_file(fn, &error); | |
278 | if (r < 0) | |
279 | return r; | |
280 | decode(bl); | |
281 | return 0; | |
282 | } | |
283 | ||
284 | void MonMap::print_summary(ostream& out) const | |
285 | { | |
286 | out << "e" << epoch << ": " | |
287 | << mon_info.size() << " mons at {"; | |
288 | // the map that we used to print, as it was, no longer | |
289 | // maps strings to the monitor's public address, but to | |
290 | // mon_info_t instead. As such, print the map in a way | |
291 | // that keeps the expected format. | |
292 | bool has_printed = false; | |
293 | for (map<string,mon_info_t>::const_iterator p = mon_info.begin(); | |
294 | p != mon_info.end(); | |
295 | ++p) { | |
296 | if (has_printed) | |
297 | out << ","; | |
298 | out << p->first << "=" << p->second.public_addr; | |
299 | has_printed = true; | |
300 | } | |
301 | out << "}"; | |
302 | } | |
303 | ||
304 | void MonMap::print(ostream& out) const | |
305 | { | |
306 | out << "epoch " << epoch << "\n"; | |
307 | out << "fsid " << fsid << "\n"; | |
308 | out << "last_changed " << last_changed << "\n"; | |
309 | out << "created " << created << "\n"; | |
310 | unsigned i = 0; | |
311 | for (vector<string>::const_iterator p = ranks.begin(); | |
312 | p != ranks.end(); | |
313 | ++p) { | |
314 | out << i++ << ": " << get_addr(*p) << " mon." << *p << "\n"; | |
315 | } | |
316 | } | |
317 | ||
318 | void MonMap::dump(Formatter *f) const | |
319 | { | |
320 | f->dump_unsigned("epoch", epoch); | |
321 | f->dump_stream("fsid") << fsid; | |
322 | f->dump_stream("modified") << last_changed; | |
323 | f->dump_stream("created") << created; | |
324 | f->open_object_section("features"); | |
325 | persistent_features.dump(f, "persistent"); | |
326 | optional_features.dump(f, "optional"); | |
327 | f->close_section(); | |
328 | f->open_array_section("mons"); | |
329 | int i = 0; | |
330 | for (vector<string>::const_iterator p = ranks.begin(); | |
331 | p != ranks.end(); | |
332 | ++p, ++i) { | |
333 | f->open_object_section("mon"); | |
334 | f->dump_int("rank", i); | |
335 | f->dump_string("name", *p); | |
336 | f->dump_stream("addr") << get_addr(*p); | |
337 | f->dump_stream("public_addr") << get_addr(*p); | |
338 | f->close_section(); | |
339 | } | |
340 | f->close_section(); | |
341 | } | |
342 | ||
343 | ||
344 | int MonMap::build_from_host_list(std::string hostlist, std::string prefix) | |
345 | { | |
346 | vector<entity_addr_t> addrs; | |
347 | if (parse_ip_port_vec(hostlist.c_str(), addrs)) { | |
348 | if (addrs.empty()) | |
349 | return -ENOENT; | |
350 | for (unsigned i=0; i<addrs.size(); i++) { | |
351 | char n[2]; | |
352 | n[0] = 'a' + i; | |
353 | n[1] = 0; | |
354 | if (addrs[i].get_port() == 0) | |
355 | addrs[i].set_port(CEPH_MON_PORT); | |
356 | string name = prefix; | |
357 | name += n; | |
358 | if (!contains(addrs[i])) | |
359 | add(name, addrs[i]); | |
360 | } | |
361 | return 0; | |
362 | } | |
363 | ||
364 | // maybe they passed us a DNS-resolvable name | |
365 | char *hosts = NULL; | |
366 | hosts = resolve_addrs(hostlist.c_str()); | |
367 | if (!hosts) | |
368 | return -EINVAL; | |
369 | bool success = parse_ip_port_vec(hosts, addrs); | |
370 | free(hosts); | |
371 | if (!success) | |
372 | return -EINVAL; | |
373 | ||
374 | if (addrs.empty()) | |
375 | return -ENOENT; | |
376 | ||
377 | for (unsigned i=0; i<addrs.size(); i++) { | |
378 | char n[2]; | |
379 | n[0] = 'a' + i; | |
380 | n[1] = 0; | |
381 | if (addrs[i].get_port() == 0) | |
382 | addrs[i].set_port(CEPH_MON_PORT); | |
383 | string name = prefix; | |
384 | name += n; | |
385 | if (!contains(addrs[i]) && | |
386 | !contains(name)) | |
387 | add(name, addrs[i]); | |
388 | } | |
389 | return 0; | |
390 | } | |
391 | ||
392 | void MonMap::set_initial_members(CephContext *cct, | |
393 | list<std::string>& initial_members, | |
394 | string my_name, const entity_addr_t& my_addr, | |
395 | set<entity_addr_t> *removed) | |
396 | { | |
397 | // remove non-initial members | |
398 | unsigned i = 0; | |
399 | while (i < size()) { | |
400 | string n = get_name(i); | |
401 | if (std::find(initial_members.begin(), initial_members.end(), n) != initial_members.end()) { | |
402 | lgeneric_dout(cct, 1) << " keeping " << n << " " << get_addr(i) << dendl; | |
403 | i++; | |
404 | continue; | |
405 | } | |
406 | ||
407 | lgeneric_dout(cct, 1) << " removing " << get_name(i) << " " << get_addr(i) << dendl; | |
408 | if (removed) | |
409 | removed->insert(get_addr(i)); | |
410 | remove(n); | |
411 | assert(!contains(n)); | |
412 | } | |
413 | ||
414 | // add missing initial members | |
415 | for (list<string>::iterator p = initial_members.begin(); p != initial_members.end(); ++p) { | |
416 | if (!contains(*p)) { | |
417 | if (*p == my_name) { | |
418 | lgeneric_dout(cct, 1) << " adding self " << *p << " " << my_addr << dendl; | |
419 | add(*p, my_addr); | |
420 | } else { | |
421 | entity_addr_t a; | |
422 | a.set_type(entity_addr_t::TYPE_LEGACY); | |
423 | a.set_family(AF_INET); | |
424 | for (int n=1; ; n++) { | |
425 | a.set_nonce(n); | |
426 | if (!contains(a)) | |
427 | break; | |
428 | } | |
429 | lgeneric_dout(cct, 1) << " adding " << *p << " " << a << dendl; | |
430 | add(*p, a); | |
431 | } | |
432 | assert(contains(*p)); | |
433 | } | |
434 | } | |
435 | } | |
436 | ||
437 | ||
438 | int MonMap::build_initial(CephContext *cct, ostream& errout) | |
439 | { | |
440 | const md_config_t *conf = cct->_conf; | |
441 | // file? | |
3efd9988 FG |
442 | const auto monmap = conf->get_val<std::string>("monmap"); |
443 | if (!monmap.empty()) { | |
7c673cae FG |
444 | int r; |
445 | try { | |
3efd9988 | 446 | r = read(monmap.c_str()); |
7c673cae FG |
447 | } |
448 | catch (const buffer::error &e) { | |
449 | r = -EINVAL; | |
450 | } | |
451 | if (r >= 0) | |
452 | return 0; | |
3efd9988 | 453 | errout << "unable to read/decode monmap from " << monmap |
7c673cae FG |
454 | << ": " << cpp_strerror(-r) << std::endl; |
455 | return r; | |
456 | } | |
457 | ||
458 | // fsid from conf? | |
3efd9988 FG |
459 | const auto new_fsid = conf->get_val<uuid_d>("fsid"); |
460 | if (!new_fsid.is_zero()) { | |
461 | fsid = new_fsid; | |
7c673cae FG |
462 | } |
463 | ||
464 | // -m foo? | |
3efd9988 FG |
465 | const auto mon_host = conf->get_val<std::string>("mon_host"); |
466 | if (!mon_host.empty()) { | |
467 | int r = build_from_host_list(mon_host, "noname-"); | |
7c673cae | 468 | if (r < 0) { |
3efd9988 | 469 | errout << "unable to parse addrs in '" << mon_host << "'" |
7c673cae FG |
470 | << std::endl; |
471 | return r; | |
472 | } | |
473 | created = ceph_clock_now(); | |
474 | last_changed = created; | |
475 | return 0; | |
476 | } | |
477 | ||
478 | // What monitors are in the config file? | |
479 | std::vector <std::string> sections; | |
480 | int ret = conf->get_all_sections(sections); | |
481 | if (ret) { | |
482 | errout << "Unable to find any monitors in the configuration " | |
483 | << "file, because there was an error listing the sections. error " | |
484 | << ret << std::endl; | |
485 | return -ENOENT; | |
486 | } | |
487 | std::vector <std::string> mon_names; | |
488 | for (std::vector <std::string>::const_iterator s = sections.begin(); | |
489 | s != sections.end(); ++s) { | |
490 | if ((s->substr(0, 4) == "mon.") && (s->size() > 4)) { | |
491 | mon_names.push_back(s->substr(4)); | |
492 | } | |
493 | } | |
494 | ||
495 | // Find an address for each monitor in the config file. | |
496 | for (std::vector <std::string>::const_iterator m = mon_names.begin(); | |
497 | m != mon_names.end(); ++m) { | |
498 | std::vector <std::string> sections; | |
499 | std::string m_name("mon"); | |
500 | m_name += "."; | |
501 | m_name += *m; | |
502 | sections.push_back(m_name); | |
503 | sections.push_back("mon"); | |
504 | sections.push_back("global"); | |
505 | std::string val; | |
506 | int res = conf->get_val_from_conf_file(sections, "mon addr", val, true); | |
507 | if (res) { | |
508 | errout << "failed to get an address for mon." << *m << ": error " | |
509 | << res << std::endl; | |
510 | continue; | |
511 | } | |
512 | entity_addr_t addr; | |
513 | if (!addr.parse(val.c_str())) { | |
514 | errout << "unable to parse address for mon." << *m | |
515 | << ": addr='" << val << "'" << std::endl; | |
516 | continue; | |
517 | } | |
518 | if (addr.get_port() == 0) | |
519 | addr.set_port(CEPH_MON_PORT); | |
520 | ||
224ce89b WB |
521 | uint16_t priority = 0; |
522 | if (!conf->get_val_from_conf_file(sections, "mon priority", val, false)) { | |
523 | try { | |
524 | priority = std::stoul(val); | |
525 | } catch (std::logic_error&) { | |
526 | errout << "unable to parse priority for mon." << *m | |
527 | << ": priority='" << val << "'" << std::endl; | |
528 | continue; | |
529 | } | |
530 | } | |
7c673cae FG |
531 | // the make sure this mon isn't already in the map |
532 | if (contains(addr)) | |
533 | remove(get_name(addr)); | |
534 | if (contains(*m)) | |
535 | remove(*m); | |
536 | ||
224ce89b | 537 | add(mon_info_t{*m, addr, priority}); |
7c673cae FG |
538 | } |
539 | ||
540 | if (size() == 0) { | |
541 | // no info found from conf options lets try use DNS SRV records | |
3efd9988 | 542 | string srv_name = conf->get_val<std::string>("mon_dns_srv_name"); |
7c673cae FG |
543 | string domain; |
544 | // check if domain is also provided and extract it from srv_name | |
545 | size_t idx = srv_name.find("_"); | |
546 | if (idx != string::npos) { | |
547 | domain = srv_name.substr(idx + 1); | |
548 | srv_name = srv_name.substr(0, idx); | |
549 | } | |
550 | ||
224ce89b | 551 | map<string, DNSResolver::Record> records; |
7c673cae | 552 | if (DNSResolver::get_instance()->resolve_srv_hosts(cct, srv_name, |
224ce89b | 553 | DNSResolver::SRV_Protocol::TCP, domain, &records) != 0) { |
7c673cae FG |
554 | |
555 | errout << "unable to get monitor info from DNS SRV with service name: " << | |
556 | "ceph-mon" << std::endl; | |
557 | } | |
558 | else { | |
224ce89b WB |
559 | for (const auto& record : records) { |
560 | add(mon_info_t{record.first, | |
561 | record.second.addr, | |
562 | record.second.priority}); | |
7c673cae FG |
563 | } |
564 | } | |
565 | } | |
566 | ||
567 | if (size() == 0) { | |
568 | errout << "no monitors specified to connect to." << std::endl; | |
569 | return -ENOENT; | |
570 | } | |
571 | created = ceph_clock_now(); | |
572 | last_changed = created; | |
573 | return 0; | |
574 | } |