]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/MonMap.cc
836c6901c9cb64a5169726ea935308fe23880e8b
[ceph.git] / ceph / src / mon / MonMap.cc
1
2 #include "MonMap.h"
3
4 #include <algorithm>
5 #include <sys/types.h>
6 #include <sys/stat.h>
7 #include <fcntl.h>
8
9 #include "common/Formatter.h"
10
11 #include "include/ceph_features.h"
12 #include "include/addr_parsing.h"
13 #include "common/ceph_argparse.h"
14 #include "common/dns_resolve.h"
15 #include "common/errno.h"
16
17 #include "common/dout.h"
18
19 using ceph::Formatter;
20
// Serialize this monitor's info into 'bl'.
//
// The payload is wrapped by ENCODE_START(1, 1, ...)/ENCODE_FINISH and
// consists of, in order: 'name', then 'public_addr'. The field order is
// the on-wire layout and must stay in sync with mon_info_t::decode().
//
// 'features' is forwarded only to the encoder of 'public_addr'.
// NOTE(review): the two numeric macro arguments are presumably the
// struct version and the compat version -- confirm against the macro
// definition.
21 void mon_info_t::encode(bufferlist& bl, uint64_t features) const
22 {
23 ENCODE_START(1, 1, bl);
24 ::encode(name, bl);
25 ::encode(public_addr, bl, features);
26 ENCODE_FINISH(bl);
27 }
28
// Deserialize a monitor's info from the buffer iterator 'p'.
//
// Reads, in order, 'name' and then 'public_addr' between
// DECODE_START(1, ...) and DECODE_FINISH. The field order mirrors
// mon_info_t::encode() and must not be changed independently of it.
29 void mon_info_t::decode(bufferlist::iterator& p)
30 {
31 DECODE_START(1, p);
32 ::decode(name, p);
33 ::decode(public_addr, p);
34 DECODE_FINISH(p);
35 }
36
37 void mon_info_t::print(ostream& out) const
38 {
39 out << "mon." << name
40 << " public " << public_addr;
41 }
42
43 void MonMap::sanitize_mons(map<string,entity_addr_t>& o)
44 {
45 // if mon_info is populated, it means we decoded a map encoded
46 // by someone who understands the new format (i.e., is able to
47 // encode 'mon_info'). This means they must also have provided
48 // a properly populated 'mon_addr' (which we have dropped with
49 // this patch), 'o' being the contents of said map. In this
50 // case, 'o' must have the same number of entries as 'mon_info'.
51 //
52 // Also, for each entry in 'o', there has to be a matching
53 // 'mon_info' entry, properly populated with a name and a matching
54 // 'public_addr'.
55 //
56 // OTOH, if 'mon_info' is not populated, it means the one that
57 // originally encoded the map does not know the new format, and
58 // 'o' will be our only source of info about the monitors in the
59 // cluster -- and we will use it to populate our 'mon_info' map.
60
61 bool has_mon_info = false;
62 if (mon_info.size() > 0) {
63 assert(o.size() == mon_info.size());
64 has_mon_info = true;
65 }
66
67 for (auto p : o) {
68 if (has_mon_info) {
69 // make sure the info we have is accurate
70 assert(mon_info.count(p.first));
71 assert(mon_info[p.first].name == p.first);
72 assert(mon_info[p.first].public_addr == p.second);
73 } else {
74 mon_info_t &m = mon_info[p.first];
75 m.name = p.first;
76 m.public_addr = p.second;
77 }
78 }
79 }
80
81 namespace {
82 struct rank_cmp {
83 bool operator()(const mon_info_t &a, const mon_info_t &b) const {
84 if (a.public_addr == b.public_addr)
85 return a.name < b.name;
86 return a.public_addr < b.public_addr;
87 }
88 };
89 }
90
91 void MonMap::calc_ranks() {
92
93 ranks.resize(mon_info.size());
94 addr_mons.clear();
95
96 // Used to order entries according to public_addr, because that's
97 // how the ranks are expected to be ordered by. We may expand this
98 // later on, according to some other criteria, by specifying a
99 // different comparator.
100 //
101 // Please note that we use a 'set' here instead of resorting to
102 // std::sort() because we need more info than that's available in
103 // the vector. The vector will thus be ordered by, e.g., public_addr
104 // while only containing the names of each individual monitor.
105 // The only way of achieving this with std::sort() would be to first
106 // insert every mon_info_t entry into a vector 'foo', std::sort() 'foo'
107 // with custom comparison functions, and then copy each invidual entry
108 // to a new vector. Unless there's a simpler way, we don't think the
109 // added complexity makes up for the additional memory usage of a 'set'.
110 set<mon_info_t, rank_cmp> tmp;
111
112 for (map<string,mon_info_t>::iterator p = mon_info.begin();
113 p != mon_info.end();
114 ++p) {
115 mon_info_t &m = p->second;
116 tmp.insert(m);
117
118 // populate addr_mons
119 assert(addr_mons.count(m.public_addr) == 0);
120 addr_mons[m.public_addr] = m.name;
121 }
122
123 // map the set to the actual ranks etc
124 unsigned i = 0;
125 for (set<mon_info_t>::iterator p = tmp.begin();
126 p != tmp.end();
127 ++p, ++i) {
128 ranks[i] = p->name;
129 }
130 }
131
132 void MonMap::encode(bufferlist& blist, uint64_t con_features) const
133 {
134 /* we keep the mon_addr map when encoding to ensure compatibility
135 * with clients and other monitors that do not yet support the 'mons'
136 * map. This map keeps its original behavior, containing a mapping of
137 * monitor id (i.e., 'foo' in 'mon.foo') to the monitor's public
138 * address -- which is obtained from the public address of each entry
139 * in the 'mons' map.
140 */
141 map<string,entity_addr_t> mon_addr;
142 for (map<string,mon_info_t>::const_iterator p = mon_info.begin();
143 p != mon_info.end();
144 ++p) {
145 mon_addr[p->first] = p->second.public_addr;
146 }
147
148 if ((con_features & CEPH_FEATURE_MONNAMES) == 0) {
149 __u16 v = 1;
150 ::encode(v, blist);
151 ::encode_raw(fsid, blist);
152 ::encode(epoch, blist);
153 vector<entity_inst_t> mon_inst(mon_addr.size());
154 for (unsigned n = 0; n < mon_addr.size(); n++)
155 mon_inst[n] = get_inst(n);
156 ::encode(mon_inst, blist, con_features);
157 ::encode(last_changed, blist);
158 ::encode(created, blist);
159 return;
160 }
161
162 if ((con_features & CEPH_FEATURE_MONENC) == 0) {
163 __u16 v = 2;
164 ::encode(v, blist);
165 ::encode_raw(fsid, blist);
166 ::encode(epoch, blist);
167 ::encode(mon_addr, blist, con_features);
168 ::encode(last_changed, blist);
169 ::encode(created, blist);
170 }
171
172 ENCODE_START(5, 3, blist);
173 ::encode_raw(fsid, blist);
174 ::encode(epoch, blist);
175 ::encode(mon_addr, blist, con_features);
176 ::encode(last_changed, blist);
177 ::encode(created, blist);
178 ::encode(persistent_features, blist);
179 ::encode(optional_features, blist);
180 // this superseeds 'mon_addr'
181 ::encode(mon_info, blist, con_features);
182 ENCODE_FINISH(blist);
183 }
184
185 void MonMap::decode(bufferlist::iterator &p)
186 {
187 map<string,entity_addr_t> mon_addr;
188 DECODE_START_LEGACY_COMPAT_LEN_16(5, 3, 3, p);
189 ::decode_raw(fsid, p);
190 ::decode(epoch, p);
191 if (struct_v == 1) {
192 vector<entity_inst_t> mon_inst;
193 ::decode(mon_inst, p);
194 for (unsigned i = 0; i < mon_inst.size(); i++) {
195 char n[2];
196 n[0] = '0' + i;
197 n[1] = 0;
198 string name = n;
199 mon_addr[name] = mon_inst[i].addr;
200 }
201 } else {
202 ::decode(mon_addr, p);
203 }
204 ::decode(last_changed, p);
205 ::decode(created, p);
206 if (struct_v >= 4) {
207 ::decode(persistent_features, p);
208 ::decode(optional_features, p);
209 }
210 if (struct_v >= 5) {
211 ::decode(mon_info, p);
212 } else {
213 // we may be decoding to an existing monmap; if we do not
214 // clear the mon_info map now, we will likely incur in problems
215 // later on MonMap::sanitize_mons()
216 mon_info.clear();
217 }
218 DECODE_FINISH(p);
219 sanitize_mons(mon_addr);
220 calc_ranks();
221 }
222
223 void MonMap::generate_test_instances(list<MonMap*>& o)
224 {
225 o.push_back(new MonMap);
226 o.push_back(new MonMap);
227 o.back()->epoch = 1;
228 o.back()->last_changed = utime_t(123, 456);
229 o.back()->created = utime_t(789, 101112);
230 o.back()->add("one", entity_addr_t());
231
232 MonMap *m = new MonMap;
233 {
234 m->epoch = 1;
235 m->last_changed = utime_t(123, 456);
236
237 entity_addr_t empty_addr_one;
238 empty_addr_one.set_nonce(1);
239 m->add("empty_addr_one", empty_addr_one);
240 entity_addr_t empty_addr_two;
241 empty_addr_two.set_nonce(2);
242 m->add("empty_adrr_two", empty_addr_two);
243
244 const char *local_pub_addr_s = "127.0.1.2";
245
246 const char *end_p = local_pub_addr_s + strlen(local_pub_addr_s);
247 entity_addr_t local_pub_addr;
248 local_pub_addr.parse(local_pub_addr_s, &end_p);
249
250 m->add("filled_pub_addr", local_pub_addr);
251
252 m->add("empty_addr_zero", entity_addr_t());
253 }
254 o.push_back(m);
255 }
256
257 // read from/write to a file
258 int MonMap::write(const char *fn)
259 {
260 // encode
261 bufferlist bl;
262 encode(bl, CEPH_FEATURES_ALL);
263
264 return bl.write_file(fn);
265 }
266
267 int MonMap::read(const char *fn)
268 {
269 // read
270 bufferlist bl;
271 std::string error;
272 int r = bl.read_file(fn, &error);
273 if (r < 0)
274 return r;
275 decode(bl);
276 return 0;
277 }
278
279 void MonMap::print_summary(ostream& out) const
280 {
281 out << "e" << epoch << ": "
282 << mon_info.size() << " mons at {";
283 // the map that we used to print, as it was, no longer
284 // maps strings to the monitor's public address, but to
285 // mon_info_t instead. As such, print the map in a way
286 // that keeps the expected format.
287 bool has_printed = false;
288 for (map<string,mon_info_t>::const_iterator p = mon_info.begin();
289 p != mon_info.end();
290 ++p) {
291 if (has_printed)
292 out << ",";
293 out << p->first << "=" << p->second.public_addr;
294 has_printed = true;
295 }
296 out << "}";
297 }
298
299 void MonMap::print(ostream& out) const
300 {
301 out << "epoch " << epoch << "\n";
302 out << "fsid " << fsid << "\n";
303 out << "last_changed " << last_changed << "\n";
304 out << "created " << created << "\n";
305 unsigned i = 0;
306 for (vector<string>::const_iterator p = ranks.begin();
307 p != ranks.end();
308 ++p) {
309 out << i++ << ": " << get_addr(*p) << " mon." << *p << "\n";
310 }
311 }
312
313 void MonMap::dump(Formatter *f) const
314 {
315 f->dump_unsigned("epoch", epoch);
316 f->dump_stream("fsid") << fsid;
317 f->dump_stream("modified") << last_changed;
318 f->dump_stream("created") << created;
319 f->open_object_section("features");
320 persistent_features.dump(f, "persistent");
321 optional_features.dump(f, "optional");
322 f->close_section();
323 f->open_array_section("mons");
324 int i = 0;
325 for (vector<string>::const_iterator p = ranks.begin();
326 p != ranks.end();
327 ++p, ++i) {
328 f->open_object_section("mon");
329 f->dump_int("rank", i);
330 f->dump_string("name", *p);
331 f->dump_stream("addr") << get_addr(*p);
332 f->dump_stream("public_addr") << get_addr(*p);
333 f->close_section();
334 }
335 f->close_section();
336 }
337
338
339 int MonMap::build_from_host_list(std::string hostlist, std::string prefix)
340 {
341 vector<entity_addr_t> addrs;
342 if (parse_ip_port_vec(hostlist.c_str(), addrs)) {
343 if (addrs.empty())
344 return -ENOENT;
345 for (unsigned i=0; i<addrs.size(); i++) {
346 char n[2];
347 n[0] = 'a' + i;
348 n[1] = 0;
349 if (addrs[i].get_port() == 0)
350 addrs[i].set_port(CEPH_MON_PORT);
351 string name = prefix;
352 name += n;
353 if (!contains(addrs[i]))
354 add(name, addrs[i]);
355 }
356 return 0;
357 }
358
359 // maybe they passed us a DNS-resolvable name
360 char *hosts = NULL;
361 hosts = resolve_addrs(hostlist.c_str());
362 if (!hosts)
363 return -EINVAL;
364 bool success = parse_ip_port_vec(hosts, addrs);
365 free(hosts);
366 if (!success)
367 return -EINVAL;
368
369 if (addrs.empty())
370 return -ENOENT;
371
372 for (unsigned i=0; i<addrs.size(); i++) {
373 char n[2];
374 n[0] = 'a' + i;
375 n[1] = 0;
376 if (addrs[i].get_port() == 0)
377 addrs[i].set_port(CEPH_MON_PORT);
378 string name = prefix;
379 name += n;
380 if (!contains(addrs[i]) &&
381 !contains(name))
382 add(name, addrs[i]);
383 }
384 return 0;
385 }
386
387 void MonMap::set_initial_members(CephContext *cct,
388 list<std::string>& initial_members,
389 string my_name, const entity_addr_t& my_addr,
390 set<entity_addr_t> *removed)
391 {
392 // remove non-initial members
393 unsigned i = 0;
394 while (i < size()) {
395 string n = get_name(i);
396 if (std::find(initial_members.begin(), initial_members.end(), n) != initial_members.end()) {
397 lgeneric_dout(cct, 1) << " keeping " << n << " " << get_addr(i) << dendl;
398 i++;
399 continue;
400 }
401
402 lgeneric_dout(cct, 1) << " removing " << get_name(i) << " " << get_addr(i) << dendl;
403 if (removed)
404 removed->insert(get_addr(i));
405 remove(n);
406 assert(!contains(n));
407 }
408
409 // add missing initial members
410 for (list<string>::iterator p = initial_members.begin(); p != initial_members.end(); ++p) {
411 if (!contains(*p)) {
412 if (*p == my_name) {
413 lgeneric_dout(cct, 1) << " adding self " << *p << " " << my_addr << dendl;
414 add(*p, my_addr);
415 } else {
416 entity_addr_t a;
417 a.set_type(entity_addr_t::TYPE_LEGACY);
418 a.set_family(AF_INET);
419 for (int n=1; ; n++) {
420 a.set_nonce(n);
421 if (!contains(a))
422 break;
423 }
424 lgeneric_dout(cct, 1) << " adding " << *p << " " << a << dendl;
425 add(*p, a);
426 }
427 assert(contains(*p));
428 }
429 }
430 }
431
432
433 int MonMap::build_initial(CephContext *cct, ostream& errout)
434 {
435 const md_config_t *conf = cct->_conf;
436 // file?
437 if (!conf->monmap.empty()) {
438 int r;
439 try {
440 r = read(conf->monmap.c_str());
441 }
442 catch (const buffer::error &e) {
443 r = -EINVAL;
444 }
445 if (r >= 0)
446 return 0;
447 errout << "unable to read/decode monmap from " << conf->monmap
448 << ": " << cpp_strerror(-r) << std::endl;
449 return r;
450 }
451
452 // fsid from conf?
453 if (!cct->_conf->fsid.is_zero()) {
454 fsid = cct->_conf->fsid;
455 }
456
457 // -m foo?
458 if (!conf->mon_host.empty()) {
459 int r = build_from_host_list(conf->mon_host, "noname-");
460 if (r < 0) {
461 errout << "unable to parse addrs in '" << conf->mon_host << "'"
462 << std::endl;
463 return r;
464 }
465 created = ceph_clock_now();
466 last_changed = created;
467 return 0;
468 }
469
470 // What monitors are in the config file?
471 std::vector <std::string> sections;
472 int ret = conf->get_all_sections(sections);
473 if (ret) {
474 errout << "Unable to find any monitors in the configuration "
475 << "file, because there was an error listing the sections. error "
476 << ret << std::endl;
477 return -ENOENT;
478 }
479 std::vector <std::string> mon_names;
480 for (std::vector <std::string>::const_iterator s = sections.begin();
481 s != sections.end(); ++s) {
482 if ((s->substr(0, 4) == "mon.") && (s->size() > 4)) {
483 mon_names.push_back(s->substr(4));
484 }
485 }
486
487 // Find an address for each monitor in the config file.
488 for (std::vector <std::string>::const_iterator m = mon_names.begin();
489 m != mon_names.end(); ++m) {
490 std::vector <std::string> sections;
491 std::string m_name("mon");
492 m_name += ".";
493 m_name += *m;
494 sections.push_back(m_name);
495 sections.push_back("mon");
496 sections.push_back("global");
497 std::string val;
498 int res = conf->get_val_from_conf_file(sections, "mon addr", val, true);
499 if (res) {
500 errout << "failed to get an address for mon." << *m << ": error "
501 << res << std::endl;
502 continue;
503 }
504 entity_addr_t addr;
505 if (!addr.parse(val.c_str())) {
506 errout << "unable to parse address for mon." << *m
507 << ": addr='" << val << "'" << std::endl;
508 continue;
509 }
510 if (addr.get_port() == 0)
511 addr.set_port(CEPH_MON_PORT);
512
513 // the make sure this mon isn't already in the map
514 if (contains(addr))
515 remove(get_name(addr));
516 if (contains(*m))
517 remove(*m);
518
519 add(m->c_str(), addr);
520 }
521
522 if (size() == 0) {
523 // no info found from conf options lets try use DNS SRV records
524 string srv_name = conf->mon_dns_srv_name;
525 string domain;
526 // check if domain is also provided and extract it from srv_name
527 size_t idx = srv_name.find("_");
528 if (idx != string::npos) {
529 domain = srv_name.substr(idx + 1);
530 srv_name = srv_name.substr(0, idx);
531 }
532
533 map<string, entity_addr_t> addrs;
534 if (DNSResolver::get_instance()->resolve_srv_hosts(cct, srv_name,
535 DNSResolver::SRV_Protocol::TCP, domain, &addrs) != 0) {
536
537 errout << "unable to get monitor info from DNS SRV with service name: " <<
538 "ceph-mon" << std::endl;
539 }
540 else {
541 for (const auto& addr : addrs) {
542 add(addr.first, addr.second);
543 }
544 }
545 }
546
547 if (size() == 0) {
548 errout << "no monitors specified to connect to." << std::endl;
549 return -ENOENT;
550 }
551 created = ceph_clock_now();
552 last_changed = created;
553 return 0;
554 }