]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDSMap.cc
9dfce950f5a8d46fa2d410b198122e0d965e11c5
[ceph.git] / ceph / src / mds / MDSMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "common/debug.h"
16 #include "mon/health_check.h"
17
18 #include "MDSMap.h"
19
20 #include <sstream>
21 using std::stringstream;
22
23 #define dout_context g_ceph_context
24 #define dout_subsys ceph_subsys_
25
26 // features
27 CompatSet get_mdsmap_compat_set_all() {
28 CompatSet::FeatureSet feature_compat;
29 CompatSet::FeatureSet feature_ro_compat;
30 CompatSet::FeatureSet feature_incompat;
31 feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE);
32 feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
33 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
34 feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
35 feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING);
36 feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
37 feature_incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
38 feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
39 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
40
41 return CompatSet(feature_compat, feature_ro_compat, feature_incompat);
42 }
43
44 CompatSet get_mdsmap_compat_set_default() {
45 CompatSet::FeatureSet feature_compat;
46 CompatSet::FeatureSet feature_ro_compat;
47 CompatSet::FeatureSet feature_incompat;
48 feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE);
49 feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
50 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
51 feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
52 feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING);
53 feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
54 feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
55 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
56
57 return CompatSet(feature_compat, feature_ro_compat, feature_incompat);
58 }
59
60 // base (pre v0.20)
61 CompatSet get_mdsmap_compat_set_base() {
62 CompatSet::FeatureSet feature_compat_base;
63 CompatSet::FeatureSet feature_incompat_base;
64 feature_incompat_base.insert(MDS_FEATURE_INCOMPAT_BASE);
65 CompatSet::FeatureSet feature_ro_compat_base;
66
67 return CompatSet(feature_compat_base, feature_ro_compat_base, feature_incompat_base);
68 }
69
70 void MDSMap::mds_info_t::dump(Formatter *f) const
71 {
72 f->dump_unsigned("gid", global_id);
73 f->dump_string("name", name);
74 f->dump_int("rank", rank);
75 f->dump_int("incarnation", inc);
76 f->dump_stream("state") << ceph_mds_state_name(state);
77 f->dump_int("state_seq", state_seq);
78 f->dump_stream("addr") << addr;
79 if (laggy_since != utime_t())
80 f->dump_stream("laggy_since") << laggy_since;
81
82 f->dump_int("standby_for_rank", standby_for_rank);
83 f->dump_int("standby_for_fscid", standby_for_fscid);
84 f->dump_string("standby_for_name", standby_for_name);
85 f->dump_bool("standby_replay", standby_replay);
86 f->open_array_section("export_targets");
87 for (set<mds_rank_t>::iterator p = export_targets.begin();
88 p != export_targets.end(); ++p) {
89 f->dump_int("mds", *p);
90 }
91 f->close_section();
92 f->dump_unsigned("features", mds_features);
93 }
94
95 void MDSMap::mds_info_t::print_summary(ostream &out) const
96 {
97 out << global_id << ":\t"
98 << addr
99 << " '" << name << "'"
100 << " mds." << rank
101 << "." << inc
102 << " " << ceph_mds_state_name(state)
103 << " seq " << state_seq;
104 if (laggy()) {
105 out << " laggy since " << laggy_since;
106 }
107 if (standby_for_rank != -1 ||
108 !standby_for_name.empty()) {
109 out << " (standby for";
110 //if (standby_for_rank >= 0)
111 out << " rank " << standby_for_rank;
112 if (!standby_for_name.empty()) {
113 out << " '" << standby_for_name << "'";
114 }
115 out << ")";
116 }
117 if (!export_targets.empty()) {
118 out << " export_targets=" << export_targets;
119 }
120 }
121
122 void MDSMap::mds_info_t::generate_test_instances(list<mds_info_t*>& ls)
123 {
124 mds_info_t *sample = new mds_info_t();
125 ls.push_back(sample);
126 sample = new mds_info_t();
127 sample->global_id = 1;
128 sample->name = "test_instance";
129 sample->rank = 0;
130 ls.push_back(sample);
131 }
132
133 void MDSMap::dump(Formatter *f) const
134 {
135 f->dump_int("epoch", epoch);
136 f->dump_unsigned("flags", flags);
137 f->dump_unsigned("ever_allowed_features", ever_allowed_features);
138 f->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features);
139 f->dump_stream("created") << created;
140 f->dump_stream("modified") << modified;
141 f->dump_int("tableserver", tableserver);
142 f->dump_int("root", root);
143 f->dump_int("session_timeout", session_timeout);
144 f->dump_int("session_autoclose", session_autoclose);
145 f->dump_int("max_file_size", max_file_size);
146 f->dump_int("last_failure", last_failure);
147 f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
148 f->open_object_section("compat");
149 compat.dump(f);
150 f->close_section();
151 f->dump_int("max_mds", max_mds);
152 f->open_array_section("in");
153 for (set<mds_rank_t>::const_iterator p = in.begin(); p != in.end(); ++p)
154 f->dump_int("mds", *p);
155 f->close_section();
156 f->open_object_section("up");
157 for (map<mds_rank_t,mds_gid_t>::const_iterator p = up.begin(); p != up.end(); ++p) {
158 char s[14];
159 sprintf(s, "mds_%d", int(p->first));
160 f->dump_int(s, p->second);
161 }
162 f->close_section();
163 f->open_array_section("failed");
164 for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p)
165 f->dump_int("mds", *p);
166 f->close_section();
167 f->open_array_section("damaged");
168 for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p)
169 f->dump_int("mds", *p);
170 f->close_section();
171 f->open_array_section("stopped");
172 for (set<mds_rank_t>::const_iterator p = stopped.begin(); p != stopped.end(); ++p)
173 f->dump_int("mds", *p);
174 f->close_section();
175 f->open_object_section("info");
176 for (map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin(); p != mds_info.end(); ++p) {
177 char s[25]; // 'gid_' + len(str(ULLONG_MAX)) + '\0'
178 sprintf(s, "gid_%llu", (long long unsigned)p->first);
179 f->open_object_section(s);
180 p->second.dump(f);
181 f->close_section();
182 }
183 f->close_section();
184 f->open_array_section("data_pools");
185 for (const auto p: data_pools)
186 f->dump_int("pool", p);
187 f->close_section();
188 f->dump_int("metadata_pool", metadata_pool);
189 f->dump_bool("enabled", enabled);
190 f->dump_string("fs_name", fs_name);
191 f->dump_string("balancer", balancer);
192 f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted));
193 }
194
195 void MDSMap::generate_test_instances(list<MDSMap*>& ls)
196 {
197 MDSMap *m = new MDSMap();
198 m->max_mds = 1;
199 m->data_pools.push_back(0);
200 m->metadata_pool = 1;
201 m->cas_pool = 2;
202 m->compat = get_mdsmap_compat_set_all();
203
204 // these aren't the defaults, just in case anybody gets confused
205 m->session_timeout = 61;
206 m->session_autoclose = 301;
207 m->max_file_size = 1<<24;
208 ls.push_back(m);
209 }
210
211 void MDSMap::print(ostream& out) const
212 {
213 out << "fs_name\t" << fs_name << "\n";
214 out << "epoch\t" << epoch << "\n";
215 out << "flags\t" << hex << flags << dec << "\n";
216 out << "created\t" << created << "\n";
217 out << "modified\t" << modified << "\n";
218 out << "tableserver\t" << tableserver << "\n";
219 out << "root\t" << root << "\n";
220 out << "session_timeout\t" << session_timeout << "\n"
221 << "session_autoclose\t" << session_autoclose << "\n";
222 out << "max_file_size\t" << max_file_size << "\n";
223 out << "last_failure\t" << last_failure << "\n"
224 << "last_failure_osd_epoch\t" << last_failure_osd_epoch << "\n";
225 out << "compat\t" << compat << "\n";
226 out << "max_mds\t" << max_mds << "\n";
227 out << "in\t" << in << "\n"
228 << "up\t" << up << "\n"
229 << "failed\t" << failed << "\n"
230 << "damaged\t" << damaged << "\n"
231 << "stopped\t" << stopped << "\n";
232 out << "data_pools\t" << data_pools << "\n";
233 out << "metadata_pool\t" << metadata_pool << "\n";
234 out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n";
235 out << "balancer\t" << balancer << "\n";
236 out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n";
237
238 multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo;
239 for (const auto &p : mds_info) {
240 foo.insert(std::make_pair(
241 std::make_pair(p.second.rank, p.second.inc-1), p.first));
242 }
243
244 for (const auto &p : foo) {
245 const mds_info_t& info = mds_info.at(p.second);
246 info.print_summary(out);
247 out << "\n";
248 }
249 }
250
251
252
253 void MDSMap::print_summary(Formatter *f, ostream *out) const
254 {
255 map<mds_rank_t,string> by_rank;
256 map<string,int> by_state;
257
258 if (f) {
259 f->dump_unsigned("epoch", get_epoch());
260 f->dump_unsigned("up", up.size());
261 f->dump_unsigned("in", in.size());
262 f->dump_unsigned("max", max_mds);
263 } else {
264 *out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up";
265 }
266
267 if (f)
268 f->open_array_section("by_rank");
269 for (const auto &p : mds_info) {
270 string s = ceph_mds_state_name(p.second.state);
271 if (p.second.laggy())
272 s += "(laggy or crashed)";
273
274 if (p.second.rank >= 0 && p.second.state != MDSMap::STATE_STANDBY_REPLAY) {
275 if (f) {
276 f->open_object_section("mds");
277 f->dump_unsigned("rank", p.second.rank);
278 f->dump_string("name", p.second.name);
279 f->dump_string("status", s);
280 f->close_section();
281 } else {
282 by_rank[p.second.rank] = p.second.name + "=" + s;
283 }
284 } else {
285 by_state[s]++;
286 }
287 }
288 if (f) {
289 f->close_section();
290 } else {
291 if (!by_rank.empty())
292 *out << " " << by_rank;
293 }
294
295 for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) {
296 if (f) {
297 f->dump_unsigned(p->first.c_str(), p->second);
298 } else {
299 *out << ", " << p->second << " " << p->first;
300 }
301 }
302
303 if (!failed.empty()) {
304 if (f) {
305 f->dump_unsigned("failed", failed.size());
306 } else {
307 *out << ", " << failed.size() << " failed";
308 }
309 }
310
311 if (!damaged.empty()) {
312 if (f) {
313 f->dump_unsigned("damaged", damaged.size());
314 } else {
315 *out << ", " << damaged.size() << " damaged";
316 }
317 }
318 //if (stopped.size())
319 //out << ", " << stopped.size() << " stopped";
320 }
321
322 void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
323 list<pair<health_status_t,string> > *detail) const
324 {
325 if (!failed.empty()) {
326 std::ostringstream oss;
327 oss << "mds rank"
328 << ((failed.size() > 1) ? "s ":" ")
329 << failed
330 << ((failed.size() > 1) ? " have":" has")
331 << " failed";
332 summary.push_back(make_pair(HEALTH_ERR, oss.str()));
333 if (detail) {
334 for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p) {
335 std::ostringstream oss;
336 oss << "mds." << *p << " has failed";
337 detail->push_back(make_pair(HEALTH_ERR, oss.str()));
338 }
339 }
340 }
341
342 if (!damaged.empty()) {
343 std::ostringstream oss;
344 oss << "mds rank"
345 << ((damaged.size() > 1) ? "s ":" ")
346 << damaged
347 << ((damaged.size() > 1) ? " are":" is")
348 << " damaged";
349 summary.push_back(make_pair(HEALTH_ERR, oss.str()));
350 if (detail) {
351 for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p) {
352 std::ostringstream oss;
353 oss << "mds." << *p << " is damaged";
354 detail->push_back(make_pair(HEALTH_ERR, oss.str()));
355 }
356 }
357 }
358
359 if (is_degraded()) {
360 summary.push_back(make_pair(HEALTH_WARN, "mds cluster is degraded"));
361 if (detail) {
362 detail->push_back(make_pair(HEALTH_WARN, "mds cluster is degraded"));
363 for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) {
364 if (!is_up(i))
365 continue;
366 mds_gid_t gid = up.find(i)->second;
367 map<mds_gid_t,mds_info_t>::const_iterator info = mds_info.find(gid);
368 stringstream ss;
369 if (is_resolve(i))
370 ss << "mds." << info->second.name << " at " << info->second.addr << " rank " << i << " is resolving";
371 if (is_replay(i))
372 ss << "mds." << info->second.name << " at " << info->second.addr << " rank " << i << " is replaying journal";
373 if (is_rejoin(i))
374 ss << "mds." << info->second.name << " at " << info->second.addr << " rank " << i << " is rejoining";
375 if (is_reconnect(i))
376 ss << "mds." << info->second.name << " at " << info->second.addr << " rank " << i << " is reconnecting to clients";
377 if (ss.str().length())
378 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
379 }
380 }
381 }
382
383 map<mds_gid_t, mds_info_t>::const_iterator m_end = mds_info.end();
384 set<string> laggy;
385 for (const auto &u : up) {
386 map<mds_gid_t, mds_info_t>::const_iterator m = mds_info.find(u.second);
387 if (m == m_end) {
388 std::cerr << "Up rank " << u.first << " GID " << u.second << " not found!" << std::endl;
389 }
390 assert(m != m_end);
391 const mds_info_t &mds_info(m->second);
392 if (mds_info.laggy()) {
393 laggy.insert(mds_info.name);
394 if (detail) {
395 std::ostringstream oss;
396 oss << "mds." << mds_info.name << " at " << mds_info.addr << " is laggy/unresponsive";
397 detail->push_back(make_pair(HEALTH_WARN, oss.str()));
398 }
399 }
400 }
401
402 if (!laggy.empty()) {
403 std::ostringstream oss;
404 oss << "mds " << laggy
405 << ((laggy.size() > 1) ? " are":" is")
406 << " laggy";
407 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
408 }
409 }
410
411 void MDSMap::get_health_checks(health_check_map_t *checks) const
412 {
413 // MDS_DAMAGE
414 if (!damaged.empty()) {
415 health_check_t& check = checks->get_or_add("MDS_DAMAGE", HEALTH_ERR,
416 "%num% mds daemon%plurals% damaged");
417 for (auto p : damaged) {
418 std::ostringstream oss;
419 oss << "fs " << fs_name << " mds." << p << " is damaged";
420 check.detail.push_back(oss.str());
421 }
422 }
423
424 // FS_DEGRADED
425 if (is_degraded()) {
426 health_check_t& fscheck = checks->get_or_add(
427 "FS_DEGRADED", HEALTH_WARN,
428 "%num% filesystem%plurals% %isorare% degraded");
429 ostringstream ss;
430 ss << "fs " << fs_name << " is degraded";
431 fscheck.detail.push_back(ss.str());
432
433 list<string> detail;
434 for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) {
435 if (!is_up(i))
436 continue;
437 mds_gid_t gid = up.find(i)->second;
438 map<mds_gid_t,mds_info_t>::const_iterator info = mds_info.find(gid);
439 stringstream ss;
440 ss << "fs " << fs_name << " mds." << info->second.name << " at "
441 << info->second.addr << " rank " << i;
442 if (is_resolve(i))
443 ss << " is resolving";
444 if (is_replay(i))
445 ss << " is replaying journal";
446 if (is_rejoin(i))
447 ss << " is rejoining";
448 if (is_reconnect(i))
449 ss << " is reconnecting to clients";
450 if (ss.str().length())
451 detail.push_back(ss.str());
452 }
453 }
454 }
455
456 void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const
457 {
458 ENCODE_START(7, 4, bl);
459 ::encode(global_id, bl);
460 ::encode(name, bl);
461 ::encode(rank, bl);
462 ::encode(inc, bl);
463 ::encode((int32_t)state, bl);
464 ::encode(state_seq, bl);
465 ::encode(addr, bl, features);
466 ::encode(laggy_since, bl);
467 ::encode(standby_for_rank, bl);
468 ::encode(standby_for_name, bl);
469 ::encode(export_targets, bl);
470 ::encode(mds_features, bl);
471 ::encode(standby_for_fscid, bl);
472 ::encode(standby_replay, bl);
473 ENCODE_FINISH(bl);
474 }
475
476 void MDSMap::mds_info_t::encode_unversioned(bufferlist& bl) const
477 {
478 __u8 struct_v = 3;
479 ::encode(struct_v, bl);
480 ::encode(global_id, bl);
481 ::encode(name, bl);
482 ::encode(rank, bl);
483 ::encode(inc, bl);
484 ::encode((int32_t)state, bl);
485 ::encode(state_seq, bl);
486 ::encode(addr, bl, 0);
487 ::encode(laggy_since, bl);
488 ::encode(standby_for_rank, bl);
489 ::encode(standby_for_name, bl);
490 ::encode(export_targets, bl);
491 }
492
493 void MDSMap::mds_info_t::decode(bufferlist::iterator& bl)
494 {
495 DECODE_START_LEGACY_COMPAT_LEN(7, 4, 4, bl);
496 ::decode(global_id, bl);
497 ::decode(name, bl);
498 ::decode(rank, bl);
499 ::decode(inc, bl);
500 ::decode((int32_t&)(state), bl);
501 ::decode(state_seq, bl);
502 ::decode(addr, bl);
503 ::decode(laggy_since, bl);
504 ::decode(standby_for_rank, bl);
505 ::decode(standby_for_name, bl);
506 if (struct_v >= 2)
507 ::decode(export_targets, bl);
508 if (struct_v >= 5)
509 ::decode(mds_features, bl);
510 if (struct_v >= 6) {
511 ::decode(standby_for_fscid, bl);
512 }
513 if (struct_v >= 7) {
514 ::decode(standby_replay, bl);
515 }
516 DECODE_FINISH(bl);
517 }
518
519 std::string MDSMap::mds_info_t::human_name() const
520 {
521 // Like "daemon mds.myhost restarted", "Activating daemon mds.myhost"
522 std::ostringstream out;
523 out << "daemon mds." << name;
524 return out.str();
525 }
526
527 void MDSMap::encode(bufferlist& bl, uint64_t features) const
528 {
529 std::map<mds_rank_t,int32_t> inc; // Legacy field, fake it so that
530 // old-mon peers have something sane
531 // during upgrade
532 for (const auto rank : in) {
533 inc.insert(std::make_pair(rank, epoch));
534 }
535
536 if ((features & CEPH_FEATURE_PGID64) == 0) {
537 __u16 v = 2;
538 ::encode(v, bl);
539 ::encode(epoch, bl);
540 ::encode(flags, bl);
541 ::encode(last_failure, bl);
542 ::encode(root, bl);
543 ::encode(session_timeout, bl);
544 ::encode(session_autoclose, bl);
545 ::encode(max_file_size, bl);
546 ::encode(max_mds, bl);
547 __u32 n = mds_info.size();
548 ::encode(n, bl);
549 for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
550 i != mds_info.end(); ++i) {
551 ::encode(i->first, bl);
552 ::encode(i->second, bl, features);
553 }
554 n = data_pools.size();
555 ::encode(n, bl);
556 for (const auto p: data_pools) {
557 n = p;
558 ::encode(n, bl);
559 }
560
561 int32_t m = cas_pool;
562 ::encode(m, bl);
563 return;
564 } else if ((features & CEPH_FEATURE_MDSENC) == 0) {
565 __u16 v = 3;
566 ::encode(v, bl);
567 ::encode(epoch, bl);
568 ::encode(flags, bl);
569 ::encode(last_failure, bl);
570 ::encode(root, bl);
571 ::encode(session_timeout, bl);
572 ::encode(session_autoclose, bl);
573 ::encode(max_file_size, bl);
574 ::encode(max_mds, bl);
575 __u32 n = mds_info.size();
576 ::encode(n, bl);
577 for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
578 i != mds_info.end(); ++i) {
579 ::encode(i->first, bl);
580 ::encode(i->second, bl, features);
581 }
582 ::encode(data_pools, bl);
583 ::encode(cas_pool, bl);
584
585 // kclient ignores everything from here
586 __u16 ev = 5;
587 ::encode(ev, bl);
588 ::encode(compat, bl);
589 ::encode(metadata_pool, bl);
590 ::encode(created, bl);
591 ::encode(modified, bl);
592 ::encode(tableserver, bl);
593 ::encode(in, bl);
594 ::encode(inc, bl);
595 ::encode(up, bl);
596 ::encode(failed, bl);
597 ::encode(stopped, bl);
598 ::encode(last_failure_osd_epoch, bl);
599 return;
600 }
601
602 ENCODE_START(5, 4, bl);
603 ::encode(epoch, bl);
604 ::encode(flags, bl);
605 ::encode(last_failure, bl);
606 ::encode(root, bl);
607 ::encode(session_timeout, bl);
608 ::encode(session_autoclose, bl);
609 ::encode(max_file_size, bl);
610 ::encode(max_mds, bl);
611 ::encode(mds_info, bl, features);
612 ::encode(data_pools, bl);
613 ::encode(cas_pool, bl);
614
615 // kclient ignores everything from here
616 __u16 ev = 12;
617 ::encode(ev, bl);
618 ::encode(compat, bl);
619 ::encode(metadata_pool, bl);
620 ::encode(created, bl);
621 ::encode(modified, bl);
622 ::encode(tableserver, bl);
623 ::encode(in, bl);
624 ::encode(inc, bl);
625 ::encode(up, bl);
626 ::encode(failed, bl);
627 ::encode(stopped, bl);
628 ::encode(last_failure_osd_epoch, bl);
629 ::encode(ever_allowed_features, bl);
630 ::encode(explicitly_allowed_features, bl);
631 ::encode(inline_data_enabled, bl);
632 ::encode(enabled, bl);
633 ::encode(fs_name, bl);
634 ::encode(damaged, bl);
635 ::encode(balancer, bl);
636 ::encode(standby_count_wanted, bl);
637 ENCODE_FINISH(bl);
638 }
639
640 void MDSMap::sanitize(std::function<bool(int64_t pool)> pool_exists)
641 {
642 /* Before we did stricter checking, it was possible to remove a data pool
643 * without also deleting it from the MDSMap. Check for that here after
644 * decoding the data pools.
645 */
646
647 for (auto it = data_pools.begin(); it != data_pools.end();) {
648 if (!pool_exists(*it)) {
649 dout(0) << "removed non-existant data pool " << *it << " from MDSMap" << dendl;
650 it = data_pools.erase(it);
651 } else {
652 it++;
653 }
654 }
655 }
656
657 void MDSMap::decode(bufferlist::iterator& p)
658 {
659 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
660
661 cached_up_features = 0;
662 DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p);
663 ::decode(epoch, p);
664 ::decode(flags, p);
665 ::decode(last_failure, p);
666 ::decode(root, p);
667 ::decode(session_timeout, p);
668 ::decode(session_autoclose, p);
669 ::decode(max_file_size, p);
670 ::decode(max_mds, p);
671 ::decode(mds_info, p);
672 if (struct_v < 3) {
673 __u32 n;
674 ::decode(n, p);
675 while (n--) {
676 __u32 m;
677 ::decode(m, p);
678 data_pools.push_back(m);
679 }
680 __s32 s;
681 ::decode(s, p);
682 cas_pool = s;
683 } else {
684 ::decode(data_pools, p);
685 ::decode(cas_pool, p);
686 }
687
688 // kclient ignores everything from here
689 __u16 ev = 1;
690 if (struct_v >= 2)
691 ::decode(ev, p);
692 if (ev >= 3)
693 ::decode(compat, p);
694 else
695 compat = get_mdsmap_compat_set_base();
696 if (ev < 5) {
697 __u32 n;
698 ::decode(n, p);
699 metadata_pool = n;
700 } else {
701 ::decode(metadata_pool, p);
702 }
703 ::decode(created, p);
704 ::decode(modified, p);
705 ::decode(tableserver, p);
706 ::decode(in, p);
707 ::decode(inc, p);
708 ::decode(up, p);
709 ::decode(failed, p);
710 ::decode(stopped, p);
711 if (ev >= 4)
712 ::decode(last_failure_osd_epoch, p);
713 if (ev >= 6) {
714 if (ev < 10) {
715 // previously this was a bool about snaps, not a flag map
716 bool flag;
717 ::decode(flag, p);
718 ever_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
719 ever_allowed_features |= CEPH_MDSMAP_ALLOW_MULTIMDS|CEPH_MDSMAP_ALLOW_DIRFRAGS;
720 ::decode(flag, p);
721 explicitly_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
722 if (max_mds > 1) {
723 set_multimds_allowed();
724 }
725 } else {
726 ::decode(ever_allowed_features, p);
727 ::decode(explicitly_allowed_features, p);
728 }
729 } else {
730 ever_allowed_features = CEPH_MDSMAP_ALLOW_CLASSICS;
731 explicitly_allowed_features = 0;
732 if (max_mds > 1) {
733 set_multimds_allowed();
734 }
735 }
736 if (ev >= 7)
737 ::decode(inline_data_enabled, p);
738
739 if (ev >= 8) {
740 assert(struct_v >= 5);
741 ::decode(enabled, p);
742 ::decode(fs_name, p);
743 } else {
744 if (epoch > 1) {
745 // If an MDS has ever been started, epoch will be greater than 1,
746 // assume filesystem is enabled.
747 enabled = true;
748 } else {
749 // Upgrading from a cluster that never used an MDS, switch off
750 // filesystem until it's explicitly enabled.
751 enabled = false;
752 }
753 }
754
755 if (ev >= 9) {
756 ::decode(damaged, p);
757 }
758
759 if (ev >= 11) {
760 ::decode(balancer, p);
761 }
762
763 if (ev >= 12) {
764 ::decode(standby_count_wanted, p);
765 }
766
767 DECODE_FINISH(p);
768 }
769
770 MDSMap::availability_t MDSMap::is_cluster_available() const
771 {
772 if (epoch == 0) {
773 // If I'm a client, this means I'm looking at an MDSMap instance
774 // that was never actually initialized from the mons. Client should
775 // wait.
776 return TRANSIENT_UNAVAILABLE;
777 }
778
779 // If a rank is marked damage (unavailable until operator intervenes)
780 if (damaged.size()) {
781 return STUCK_UNAVAILABLE;
782 }
783
784 // If no ranks are created (filesystem not initialized)
785 if (in.empty()) {
786 return STUCK_UNAVAILABLE;
787 }
788
789 for (const auto rank : in) {
790 if (up.count(rank) && mds_info.at(up.at(rank)).laggy()) {
791 // This might only be transient, but because we can't see
792 // standbys, we have no way of knowing whether there is a
793 // standby available to replace the laggy guy.
794 return STUCK_UNAVAILABLE;
795 }
796 }
797
798 if (get_num_mds(CEPH_MDS_STATE_ACTIVE) > 0) {
799 // Nobody looks stuck, so indicate to client they should go ahead
800 // and try mounting if anybody is active. This may include e.g.
801 // one MDS failing over and another active: the client should
802 // proceed to start talking to the active one and let the
803 // transiently-unavailable guy catch up later.
804 return AVAILABLE;
805 } else {
806 // Nothing indicating we were stuck, but nobody active (yet)
807 //return TRANSIENT_UNAVAILABLE;
808
809 // Because we don't have standbys in the MDSMap any more, we can't
810 // reliably indicate transient vs. stuck, so always say stuck so
811 // that the client doesn't block.
812 return STUCK_UNAVAILABLE;
813 }
814 }
815
816 bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next)
817 {
818 bool state_valid = true;
819 if (next != prev) {
820 if (prev == MDSMap::STATE_REPLAY) {
821 if (next != MDSMap::STATE_RESOLVE && next != MDSMap::STATE_RECONNECT) {
822 state_valid = false;
823 }
824 } else if (prev == MDSMap::STATE_REJOIN) {
825 if (next != MDSMap::STATE_ACTIVE
826 && next != MDSMap::STATE_CLIENTREPLAY
827 && next != MDSMap::STATE_STOPPED) {
828 state_valid = false;
829 }
830 } else if (prev >= MDSMap::STATE_RECONNECT && prev < MDSMap::STATE_ACTIVE) {
831 // Once I have entered replay, the only allowable transitions are to
832 // the next next along in the sequence.
833 if (next != prev + 1) {
834 state_valid = false;
835 }
836 }
837 }
838
839 return state_valid;
840 }
841
842 bool MDSMap::check_health(mds_rank_t standby_daemon_count)
843 {
844 std::set<mds_rank_t> standbys;
845 get_standby_replay_mds_set(standbys);
846 std::set<mds_rank_t> actives;
847 get_active_mds_set(actives);
848 mds_rank_t standbys_avail = (mds_rank_t)standbys.size()+standby_daemon_count;
849
850 /* If there are standby daemons available/replaying and
851 * standby_count_wanted is unset (default), then we set it to 1. This will
852 * happen during health checks by the mons. Also, during initial creation
853 * of the FS we will have no actives so we don't want to change the default
854 * yet.
855 */
856 if (standby_count_wanted == -1 && actives.size() > 0 && standbys_avail > 0) {
857 set_standby_count_wanted(1);
858 return true;
859 }
860 return false;
861 }