]> git.proxmox.com Git - ceph.git/blob - ceph/src/mds/MDSMap.cc
Import ceph 15.2.8
[ceph.git] / ceph / src / mds / MDSMap.cc
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 #include "common/debug.h"
16 #include "mon/health_check.h"
17
18 #include "MDSMap.h"
19
20 #include <sstream>
21 using std::stringstream;
22
23 #define dout_context g_ceph_context
24 #define dout_subsys ceph_subsys_
25
26 // features
27 CompatSet MDSMap::get_compat_set_all() {
28 CompatSet::FeatureSet feature_compat;
29 CompatSet::FeatureSet feature_ro_compat;
30 CompatSet::FeatureSet feature_incompat;
31 feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE);
32 feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
33 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
34 feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
35 feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING);
36 feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
37 feature_incompat.insert(MDS_FEATURE_INCOMPAT_INLINE);
38 feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
39 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
40 feature_incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2);
41
42 return CompatSet(feature_compat, feature_ro_compat, feature_incompat);
43 }
44
45 CompatSet MDSMap::get_compat_set_default() {
46 CompatSet::FeatureSet feature_compat;
47 CompatSet::FeatureSet feature_ro_compat;
48 CompatSet::FeatureSet feature_incompat;
49 feature_incompat.insert(MDS_FEATURE_INCOMPAT_BASE);
50 feature_incompat.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES);
51 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT);
52 feature_incompat.insert(MDS_FEATURE_INCOMPAT_DIRINODE);
53 feature_incompat.insert(MDS_FEATURE_INCOMPAT_ENCODING);
54 feature_incompat.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG);
55 feature_incompat.insert(MDS_FEATURE_INCOMPAT_NOANCHOR);
56 feature_incompat.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2);
57 feature_incompat.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2);
58
59 return CompatSet(feature_compat, feature_ro_compat, feature_incompat);
60 }
61
62 // base (pre v0.20)
63 CompatSet MDSMap::get_compat_set_base() {
64 CompatSet::FeatureSet feature_compat_base;
65 CompatSet::FeatureSet feature_incompat_base;
66 feature_incompat_base.insert(MDS_FEATURE_INCOMPAT_BASE);
67 CompatSet::FeatureSet feature_ro_compat_base;
68
69 return CompatSet(feature_compat_base, feature_ro_compat_base, feature_incompat_base);
70 }
71
72 void MDSMap::mds_info_t::dump(Formatter *f) const
73 {
74 f->dump_unsigned("gid", global_id);
75 f->dump_string("name", name);
76 f->dump_int("rank", rank);
77 f->dump_int("incarnation", inc);
78 f->dump_stream("state") << ceph_mds_state_name(state);
79 f->dump_int("state_seq", state_seq);
80 f->dump_stream("addr") << addrs.get_legacy_str();
81 f->dump_object("addrs", addrs);
82 f->dump_int("join_fscid", join_fscid);
83 if (laggy_since != utime_t())
84 f->dump_stream("laggy_since") << laggy_since;
85
86 f->open_array_section("export_targets");
87 for (set<mds_rank_t>::iterator p = export_targets.begin();
88 p != export_targets.end(); ++p) {
89 f->dump_int("mds", *p);
90 }
91 f->close_section();
92 f->dump_unsigned("features", mds_features);
93 f->dump_unsigned("flags", flags);
94 }
95
96 void MDSMap::mds_info_t::dump(std::ostream& o) const
97 {
98 o << "[mds." << name << "{" << rank << ":" << global_id << "}"
99 << " state " << ceph_mds_state_name(state)
100 << " seq " << state_seq;
101 if (laggy()) {
102 o << " laggy since " << laggy_since;
103 }
104 if (!export_targets.empty()) {
105 o << " export targets " << export_targets;
106 }
107 if (is_frozen()) {
108 o << " frozen";
109 }
110 if (join_fscid != FS_CLUSTER_ID_NONE) {
111 o << " join_fscid=" << join_fscid;
112 }
113 o << " addr " << addrs << "]";
114 }
115
116 void MDSMap::mds_info_t::generate_test_instances(std::list<mds_info_t*>& ls)
117 {
118 mds_info_t *sample = new mds_info_t();
119 ls.push_back(sample);
120 sample = new mds_info_t();
121 sample->global_id = 1;
122 sample->name = "test_instance";
123 sample->rank = 0;
124 ls.push_back(sample);
125 }
126
127 void MDSMap::dump(Formatter *f) const
128 {
129 f->dump_int("epoch", epoch);
130 f->dump_unsigned("flags", flags);
131 f->dump_unsigned("ever_allowed_features", ever_allowed_features);
132 f->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features);
133 f->dump_stream("created") << created;
134 f->dump_stream("modified") << modified;
135 f->dump_int("tableserver", tableserver);
136 f->dump_int("root", root);
137 f->dump_int("session_timeout", session_timeout);
138 f->dump_int("session_autoclose", session_autoclose);
139 f->dump_stream("min_compat_client") << ceph::to_integer<int>(min_compat_client) << " ("
140 << min_compat_client << ")";
141 f->dump_int("max_file_size", max_file_size);
142 f->dump_int("last_failure", last_failure);
143 f->dump_int("last_failure_osd_epoch", last_failure_osd_epoch);
144 f->open_object_section("compat");
145 compat.dump(f);
146 f->close_section();
147 f->dump_int("max_mds", max_mds);
148 f->open_array_section("in");
149 for (set<mds_rank_t>::const_iterator p = in.begin(); p != in.end(); ++p)
150 f->dump_int("mds", *p);
151 f->close_section();
152 f->open_object_section("up");
153 for (map<mds_rank_t,mds_gid_t>::const_iterator p = up.begin(); p != up.end(); ++p) {
154 char s[14];
155 sprintf(s, "mds_%d", int(p->first));
156 f->dump_int(s, p->second);
157 }
158 f->close_section();
159 f->open_array_section("failed");
160 for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p)
161 f->dump_int("mds", *p);
162 f->close_section();
163 f->open_array_section("damaged");
164 for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p)
165 f->dump_int("mds", *p);
166 f->close_section();
167 f->open_array_section("stopped");
168 for (set<mds_rank_t>::const_iterator p = stopped.begin(); p != stopped.end(); ++p)
169 f->dump_int("mds", *p);
170 f->close_section();
171 f->open_object_section("info");
172 for (const auto& [gid, info] : mds_info) {
173 char s[25]; // 'gid_' + len(str(ULLONG_MAX)) + '\0'
174 sprintf(s, "gid_%llu", (long long unsigned)gid);
175 f->open_object_section(s);
176 info.dump(f);
177 f->close_section();
178 }
179 f->close_section();
180 f->open_array_section("data_pools");
181 for (const auto& p: data_pools)
182 f->dump_int("pool", p);
183 f->close_section();
184 f->dump_int("metadata_pool", metadata_pool);
185 f->dump_bool("enabled", enabled);
186 f->dump_string("fs_name", fs_name);
187 f->dump_string("balancer", balancer);
188 f->dump_int("standby_count_wanted", std::max(0, standby_count_wanted));
189 }
190
191 void MDSMap::generate_test_instances(std::list<MDSMap*>& ls)
192 {
193 MDSMap *m = new MDSMap();
194 m->max_mds = 1;
195 m->data_pools.push_back(0);
196 m->metadata_pool = 1;
197 m->cas_pool = 2;
198 m->compat = get_compat_set_all();
199
200 // these aren't the defaults, just in case anybody gets confused
201 m->session_timeout = 61;
202 m->session_autoclose = 301;
203 m->max_file_size = 1<<24;
204 ls.push_back(m);
205 }
206
207 void MDSMap::print(ostream& out) const
208 {
209 out << "fs_name\t" << fs_name << "\n";
210 out << "epoch\t" << epoch << "\n";
211 out << "flags\t" << hex << flags << dec << "\n";
212 out << "created\t" << created << "\n";
213 out << "modified\t" << modified << "\n";
214 out << "tableserver\t" << tableserver << "\n";
215 out << "root\t" << root << "\n";
216 out << "session_timeout\t" << session_timeout << "\n"
217 << "session_autoclose\t" << session_autoclose << "\n";
218 out << "max_file_size\t" << max_file_size << "\n";
219 out << "min_compat_client\t" << ceph::to_integer<int>(min_compat_client) << " ("
220 << min_compat_client << ")\n";
221 out << "last_failure\t" << last_failure << "\n"
222 << "last_failure_osd_epoch\t" << last_failure_osd_epoch << "\n";
223 out << "compat\t" << compat << "\n";
224 out << "max_mds\t" << max_mds << "\n";
225 out << "in\t" << in << "\n"
226 << "up\t" << up << "\n"
227 << "failed\t" << failed << "\n"
228 << "damaged\t" << damaged << "\n"
229 << "stopped\t" << stopped << "\n";
230 out << "data_pools\t" << data_pools << "\n";
231 out << "metadata_pool\t" << metadata_pool << "\n";
232 out << "inline_data\t" << (inline_data_enabled ? "enabled" : "disabled") << "\n";
233 out << "balancer\t" << balancer << "\n";
234 out << "standby_count_wanted\t" << std::max(0, standby_count_wanted) << "\n";
235
236 multimap< pair<mds_rank_t, unsigned>, mds_gid_t > foo;
237 for (const auto &p : mds_info) {
238 foo.insert(std::make_pair(
239 std::make_pair(p.second.rank, p.second.inc-1), p.first));
240 }
241
242 for (const auto &p : foo) {
243 out << mds_info.at(p.second) << "\n";
244 }
245 }
246
247 void MDSMap::print_summary(Formatter *f, ostream *out) const
248 {
249 map<mds_rank_t,string> by_rank;
250 map<string,int> by_state;
251
252 if (f) {
253 f->dump_unsigned("epoch", get_epoch());
254 f->dump_unsigned("up", up.size());
255 f->dump_unsigned("in", in.size());
256 f->dump_unsigned("max", max_mds);
257 } else {
258 *out << "e" << get_epoch() << ": " << up.size() << "/" << in.size() << "/" << max_mds << " up";
259 }
260
261 if (f)
262 f->open_array_section("by_rank");
263 for (const auto &p : mds_info) {
264 string s = ceph_mds_state_name(p.second.state);
265 if (p.second.laggy())
266 s += "(laggy or crashed)";
267
268 if (p.second.rank >= 0 && p.second.state != MDSMap::STATE_STANDBY_REPLAY) {
269 if (f) {
270 f->open_object_section("mds");
271 f->dump_unsigned("rank", p.second.rank);
272 f->dump_string("name", p.second.name);
273 f->dump_string("status", s);
274 f->close_section();
275 } else {
276 by_rank[p.second.rank] = p.second.name + "=" + s;
277 }
278 } else {
279 by_state[s]++;
280 }
281 }
282 if (f) {
283 f->close_section();
284 } else {
285 if (!by_rank.empty())
286 *out << " " << by_rank;
287 }
288
289 for (map<string,int>::reverse_iterator p = by_state.rbegin(); p != by_state.rend(); ++p) {
290 if (f) {
291 f->dump_unsigned(p->first.c_str(), p->second);
292 } else {
293 *out << ", " << p->second << " " << p->first;
294 }
295 }
296
297 if (!failed.empty()) {
298 if (f) {
299 f->dump_unsigned("failed", failed.size());
300 } else {
301 *out << ", " << failed.size() << " failed";
302 }
303 }
304
305 if (!damaged.empty()) {
306 if (f) {
307 f->dump_unsigned("damaged", damaged.size());
308 } else {
309 *out << ", " << damaged.size() << " damaged";
310 }
311 }
312 //if (stopped.size())
313 //out << ", " << stopped.size() << " stopped";
314 }
315
316 void MDSMap::get_health(list<pair<health_status_t,string> >& summary,
317 list<pair<health_status_t,string> > *detail) const
318 {
319 if (!failed.empty()) {
320 std::ostringstream oss;
321 oss << "mds rank"
322 << ((failed.size() > 1) ? "s ":" ")
323 << failed
324 << ((failed.size() > 1) ? " have":" has")
325 << " failed";
326 summary.push_back(make_pair(HEALTH_ERR, oss.str()));
327 if (detail) {
328 for (set<mds_rank_t>::const_iterator p = failed.begin(); p != failed.end(); ++p) {
329 std::ostringstream oss;
330 oss << "mds." << *p << " has failed";
331 detail->push_back(make_pair(HEALTH_ERR, oss.str()));
332 }
333 }
334 }
335
336 if (!damaged.empty()) {
337 std::ostringstream oss;
338 oss << "mds rank"
339 << ((damaged.size() > 1) ? "s ":" ")
340 << damaged
341 << ((damaged.size() > 1) ? " are":" is")
342 << " damaged";
343 summary.push_back(make_pair(HEALTH_ERR, oss.str()));
344 if (detail) {
345 for (set<mds_rank_t>::const_iterator p = damaged.begin(); p != damaged.end(); ++p) {
346 std::ostringstream oss;
347 oss << "mds." << *p << " is damaged";
348 detail->push_back(make_pair(HEALTH_ERR, oss.str()));
349 }
350 }
351 }
352
353 if (is_degraded()) {
354 summary.push_back(make_pair(HEALTH_WARN, "mds cluster is degraded"));
355 if (detail) {
356 detail->push_back(make_pair(HEALTH_WARN, "mds cluster is degraded"));
357 for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) {
358 if (!is_up(i))
359 continue;
360 mds_gid_t gid = up.find(i)->second;
361 const auto& info = mds_info.at(gid);
362 stringstream ss;
363 if (is_resolve(i))
364 ss << "mds." << info.name << " at " << info.addrs
365 << " rank " << i << " is resolving";
366 if (is_replay(i))
367 ss << "mds." << info.name << " at " << info.addrs
368 << " rank " << i << " is replaying journal";
369 if (is_rejoin(i))
370 ss << "mds." << info.name << " at " << info.addrs
371 << " rank " << i << " is rejoining";
372 if (is_reconnect(i))
373 ss << "mds." << info.name << " at " << info.addrs
374 << " rank " << i << " is reconnecting to clients";
375 if (ss.str().length())
376 detail->push_back(make_pair(HEALTH_WARN, ss.str()));
377 }
378 }
379 }
380
381 {
382 stringstream ss;
383 ss << fs_name << " max_mds " << max_mds;
384 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
385 }
386
387 if ((mds_rank_t)up.size() < max_mds) {
388 stringstream ss;
389 ss << fs_name << " has " << up.size()
390 << " active MDS(s), but has max_mds of " << max_mds;
391 summary.push_back(make_pair(HEALTH_WARN, ss.str()));
392 }
393
394 set<string> laggy;
395 for (const auto &u : up) {
396 const auto& info = mds_info.at(u.second);
397 if (info.laggy()) {
398 laggy.insert(info.name);
399 if (detail) {
400 std::ostringstream oss;
401 oss << "mds." << info.name << " at " << info.addrs
402 << " is laggy/unresponsive";
403 detail->push_back(make_pair(HEALTH_WARN, oss.str()));
404 }
405 }
406 }
407
408 if (!laggy.empty()) {
409 std::ostringstream oss;
410 oss << "mds " << laggy
411 << ((laggy.size() > 1) ? " are":" is")
412 << " laggy";
413 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
414 }
415
416 if (get_max_mds() > 1 &&
417 was_snaps_ever_allowed() && !allows_multimds_snaps()) {
418 std::ostringstream oss;
419 oss << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
420 summary.push_back(make_pair(HEALTH_WARN, oss.str()));
421 }
422 }
423
424 void MDSMap::get_health_checks(health_check_map_t *checks) const
425 {
426 // MDS_DAMAGE
427 if (!damaged.empty()) {
428 health_check_t& check = checks->get_or_add("MDS_DAMAGE", HEALTH_ERR,
429 "%num% mds daemon%plurals% damaged",
430 damaged.size());
431 for (auto p : damaged) {
432 std::ostringstream oss;
433 oss << "fs " << fs_name << " mds." << p << " is damaged";
434 check.detail.push_back(oss.str());
435 }
436 }
437
438 // FS_DEGRADED
439 if (is_degraded()) {
440 health_check_t& fscheck = checks->get_or_add(
441 "FS_DEGRADED", HEALTH_WARN,
442 "%num% filesystem%plurals% %isorare% degraded", 1);
443 ostringstream ss;
444 ss << "fs " << fs_name << " is degraded";
445 fscheck.detail.push_back(ss.str());
446
447 list<string> detail;
448 for (mds_rank_t i = mds_rank_t(0); i< get_max_mds(); i++) {
449 if (!is_up(i))
450 continue;
451 mds_gid_t gid = up.find(i)->second;
452 const auto& info = mds_info.at(gid);
453 stringstream ss;
454 ss << "fs " << fs_name << " mds." << info.name << " at "
455 << info.addrs << " rank " << i;
456 if (is_resolve(i))
457 ss << " is resolving";
458 if (is_replay(i))
459 ss << " is replaying journal";
460 if (is_rejoin(i))
461 ss << " is rejoining";
462 if (is_reconnect(i))
463 ss << " is reconnecting to clients";
464 if (ss.str().length())
465 detail.push_back(ss.str());
466 }
467 }
468
469 // MDS_UP_LESS_THAN_MAX
470 if ((mds_rank_t)get_num_in_mds() < get_max_mds()) {
471 health_check_t& check = checks->add(
472 "MDS_UP_LESS_THAN_MAX", HEALTH_WARN,
473 "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds", 1);
474 stringstream ss;
475 ss << "fs " << fs_name << " has " << get_num_in_mds()
476 << " MDS online, but wants " << get_max_mds();
477 check.detail.push_back(ss.str());
478 }
479
480 // MDS_ALL_DOWN
481 if ((mds_rank_t)get_num_up_mds() == 0 && get_max_mds() > 0) {
482 health_check_t &check = checks->add(
483 "MDS_ALL_DOWN", HEALTH_ERR,
484 "%num% filesystem%plurals% %isorare% offline", 1);
485 stringstream ss;
486 ss << "fs " << fs_name << " is offline because no MDS is active for it.";
487 check.detail.push_back(ss.str());
488 }
489
490 if (get_max_mds() > 1 &&
491 was_snaps_ever_allowed() && !allows_multimds_snaps()) {
492 health_check_t &check = checks->add(
493 "MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR,
494 "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots", 1);
495 stringstream ss;
496 ss << "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
497 check.detail.push_back(ss.str());
498 }
499
500 if (get_inline_data_enabled()) {
501 health_check_t &check = checks->add(
502 "FS_INLINE_DATA_DEPRECATED", HEALTH_WARN,
503 "%num% filesystem%plurals% with deprecated feature inline_data", 1);
504 stringstream ss;
505 ss << "fs " << fs_name << " has deprecated feature inline_data enabled.";
506 check.detail.push_back(ss.str());
507 }
508 }
509
510 void MDSMap::mds_info_t::encode_versioned(bufferlist& bl, uint64_t features) const
511 {
512 __u8 v = 9;
513 if (!HAVE_FEATURE(features, SERVER_NAUTILUS)) {
514 v = 7;
515 }
516 ENCODE_START(v, 4, bl);
517 encode(global_id, bl);
518 encode(name, bl);
519 encode(rank, bl);
520 encode(inc, bl);
521 encode((int32_t)state, bl);
522 encode(state_seq, bl);
523 if (v < 8) {
524 encode(addrs.legacy_addr(), bl, features);
525 } else {
526 encode(addrs, bl, features);
527 }
528 encode(laggy_since, bl);
529 encode(MDS_RANK_NONE, bl); /* standby_for_rank */
530 encode(std::string(), bl); /* standby_for_name */
531 encode(export_targets, bl);
532 encode(mds_features, bl);
533 encode(join_fscid, bl); /* formerly: standby_for_fscid */
534 encode(false, bl);
535 if (v >= 9) {
536 encode(flags, bl);
537 }
538 ENCODE_FINISH(bl);
539 }
540
541 void MDSMap::mds_info_t::encode_unversioned(bufferlist& bl) const
542 {
543 __u8 struct_v = 3;
544 using ceph::encode;
545 encode(struct_v, bl);
546 encode(global_id, bl);
547 encode(name, bl);
548 encode(rank, bl);
549 encode(inc, bl);
550 encode((int32_t)state, bl);
551 encode(state_seq, bl);
552 encode(addrs.legacy_addr(), bl, 0);
553 encode(laggy_since, bl);
554 encode(MDS_RANK_NONE, bl);
555 encode(std::string(), bl);
556 encode(export_targets, bl);
557 }
558
559 void MDSMap::mds_info_t::decode(bufferlist::const_iterator& bl)
560 {
561 DECODE_START_LEGACY_COMPAT_LEN(9, 4, 4, bl);
562 decode(global_id, bl);
563 decode(name, bl);
564 decode(rank, bl);
565 decode(inc, bl);
566 int32_t raw_state;
567 decode(raw_state, bl);
568 state = (MDSMap::DaemonState)raw_state;
569 decode(state_seq, bl);
570 decode(addrs, bl);
571 decode(laggy_since, bl);
572 {
573 mds_rank_t standby_for_rank;
574 decode(standby_for_rank, bl);
575 }
576 {
577 std::string standby_for_name;
578 decode(standby_for_name, bl);
579 }
580 if (struct_v >= 2)
581 decode(export_targets, bl);
582 if (struct_v >= 5)
583 decode(mds_features, bl);
584 if (struct_v >= 6) {
585 decode(join_fscid, bl);
586 }
587 if (struct_v >= 7) {
588 bool standby_replay;
589 decode(standby_replay, bl);
590 }
591 if (struct_v >= 9) {
592 decode(flags, bl);
593 }
594 DECODE_FINISH(bl);
595 }
596
597 std::string MDSMap::mds_info_t::human_name() const
598 {
599 // Like "daemon mds.myhost restarted", "Activating daemon mds.myhost"
600 std::ostringstream out;
601 out << "daemon mds." << name;
602 return out.str();
603 }
604
605 void MDSMap::encode(bufferlist& bl, uint64_t features) const
606 {
607 std::map<mds_rank_t,int32_t> inc; // Legacy field, fake it so that
608 // old-mon peers have something sane
609 // during upgrade
610 for (const auto rank : in) {
611 inc.insert(std::make_pair(rank, epoch));
612 }
613
614 using ceph::encode;
615 if ((features & CEPH_FEATURE_PGID64) == 0) {
616 __u16 v = 2;
617 encode(v, bl);
618 encode(epoch, bl);
619 encode(flags, bl);
620 encode(last_failure, bl);
621 encode(root, bl);
622 encode(session_timeout, bl);
623 encode(session_autoclose, bl);
624 encode(max_file_size, bl);
625 encode(max_mds, bl);
626 __u32 n = mds_info.size();
627 encode(n, bl);
628 for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
629 i != mds_info.end(); ++i) {
630 encode(i->first, bl);
631 encode(i->second, bl, features);
632 }
633 n = data_pools.size();
634 encode(n, bl);
635 for (const auto p: data_pools) {
636 n = p;
637 encode(n, bl);
638 }
639
640 int32_t m = cas_pool;
641 encode(m, bl);
642 return;
643 } else if ((features & CEPH_FEATURE_MDSENC) == 0) {
644 __u16 v = 3;
645 encode(v, bl);
646 encode(epoch, bl);
647 encode(flags, bl);
648 encode(last_failure, bl);
649 encode(root, bl);
650 encode(session_timeout, bl);
651 encode(session_autoclose, bl);
652 encode(max_file_size, bl);
653 encode(max_mds, bl);
654 __u32 n = mds_info.size();
655 encode(n, bl);
656 for (map<mds_gid_t, mds_info_t>::const_iterator i = mds_info.begin();
657 i != mds_info.end(); ++i) {
658 encode(i->first, bl);
659 encode(i->second, bl, features);
660 }
661 encode(data_pools, bl);
662 encode(cas_pool, bl);
663
664 __u16 ev = 5;
665 encode(ev, bl);
666 encode(compat, bl);
667 encode(metadata_pool, bl);
668 encode(created, bl);
669 encode(modified, bl);
670 encode(tableserver, bl);
671 encode(in, bl);
672 encode(inc, bl);
673 encode(up, bl);
674 encode(failed, bl);
675 encode(stopped, bl);
676 encode(last_failure_osd_epoch, bl);
677 return;
678 }
679
680 ENCODE_START(5, 4, bl);
681 encode(epoch, bl);
682 encode(flags, bl);
683 encode(last_failure, bl);
684 encode(root, bl);
685 encode(session_timeout, bl);
686 encode(session_autoclose, bl);
687 encode(max_file_size, bl);
688 encode(max_mds, bl);
689 encode(mds_info, bl, features);
690 encode(data_pools, bl);
691 encode(cas_pool, bl);
692
693 __u16 ev = 15;
694 encode(ev, bl);
695 encode(compat, bl);
696 encode(metadata_pool, bl);
697 encode(created, bl);
698 encode(modified, bl);
699 encode(tableserver, bl);
700 encode(in, bl);
701 encode(inc, bl);
702 encode(up, bl);
703 encode(failed, bl);
704 encode(stopped, bl);
705 encode(last_failure_osd_epoch, bl);
706 encode(ever_allowed_features, bl);
707 encode(explicitly_allowed_features, bl);
708 encode(inline_data_enabled, bl);
709 encode(enabled, bl);
710 encode(fs_name, bl);
711 encode(damaged, bl);
712 encode(balancer, bl);
713 encode(standby_count_wanted, bl);
714 encode(old_max_mds, bl);
715 encode(min_compat_client, bl);
716 ENCODE_FINISH(bl);
717 }
718
719 void MDSMap::sanitize(const std::function<bool(int64_t pool)>& pool_exists)
720 {
721 /* Before we did stricter checking, it was possible to remove a data pool
722 * without also deleting it from the MDSMap. Check for that here after
723 * decoding the data pools.
724 */
725
726 for (auto it = data_pools.begin(); it != data_pools.end();) {
727 if (!pool_exists(*it)) {
728 dout(0) << "removed non-existant data pool " << *it << " from MDSMap" << dendl;
729 it = data_pools.erase(it);
730 } else {
731 it++;
732 }
733 }
734 }
735
736 void MDSMap::decode(bufferlist::const_iterator& p)
737 {
738 std::map<mds_rank_t,int32_t> inc; // Legacy field, parse and drop
739
740 cached_up_features = 0;
741 DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p);
742 decode(epoch, p);
743 decode(flags, p);
744 decode(last_failure, p);
745 decode(root, p);
746 decode(session_timeout, p);
747 decode(session_autoclose, p);
748 decode(max_file_size, p);
749 decode(max_mds, p);
750 decode(mds_info, p);
751 if (struct_v < 3) {
752 __u32 n;
753 decode(n, p);
754 while (n--) {
755 __u32 m;
756 decode(m, p);
757 data_pools.push_back(m);
758 }
759 __s32 s;
760 decode(s, p);
761 cas_pool = s;
762 } else {
763 decode(data_pools, p);
764 decode(cas_pool, p);
765 }
766
767 // kclient ignores everything from here
768 __u16 ev = 1;
769 if (struct_v >= 2)
770 decode(ev, p);
771 if (ev >= 3)
772 decode(compat, p);
773 else
774 compat = get_compat_set_base();
775 if (ev < 5) {
776 __u32 n;
777 decode(n, p);
778 metadata_pool = n;
779 } else {
780 decode(metadata_pool, p);
781 }
782 decode(created, p);
783 decode(modified, p);
784 decode(tableserver, p);
785 decode(in, p);
786 decode(inc, p);
787 decode(up, p);
788 decode(failed, p);
789 decode(stopped, p);
790 if (ev >= 4)
791 decode(last_failure_osd_epoch, p);
792 if (ev >= 6) {
793 if (ev < 10) {
794 // previously this was a bool about snaps, not a flag map
795 bool flag;
796 decode(flag, p);
797 ever_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
798 decode(flag, p);
799 explicitly_allowed_features = flag ? CEPH_MDSMAP_ALLOW_SNAPS : 0;
800 } else {
801 decode(ever_allowed_features, p);
802 decode(explicitly_allowed_features, p);
803 }
804 } else {
805 ever_allowed_features = 0;
806 explicitly_allowed_features = 0;
807 }
808 if (ev >= 7)
809 decode(inline_data_enabled, p);
810
811 if (ev >= 8) {
812 ceph_assert(struct_v >= 5);
813 decode(enabled, p);
814 decode(fs_name, p);
815 } else {
816 if (epoch > 1) {
817 // If an MDS has ever been started, epoch will be greater than 1,
818 // assume filesystem is enabled.
819 enabled = true;
820 } else {
821 // Upgrading from a cluster that never used an MDS, switch off
822 // filesystem until it's explicitly enabled.
823 enabled = false;
824 }
825 }
826
827 if (ev >= 9) {
828 decode(damaged, p);
829 }
830
831 if (ev >= 11) {
832 decode(balancer, p);
833 }
834
835 if (ev >= 12) {
836 decode(standby_count_wanted, p);
837 }
838
839 if (ev >= 13) {
840 decode(old_max_mds, p);
841 }
842
843 if (ev == 14) {
844 int8_t r;
845 decode(r, p);
846 if (r < 0) {
847 min_compat_client = ceph_release_t::unknown;
848 } else {
849 min_compat_client = ceph_release_t{static_cast<uint8_t>(r)};
850 }
851 } else if (ev > 14) {
852 decode(min_compat_client, p);
853 }
854
855 DECODE_FINISH(p);
856 }
857
858 MDSMap::availability_t MDSMap::is_cluster_available() const
859 {
860 if (epoch == 0) {
861 // If I'm a client, this means I'm looking at an MDSMap instance
862 // that was never actually initialized from the mons. Client should
863 // wait.
864 return TRANSIENT_UNAVAILABLE;
865 }
866
867 // If a rank is marked damage (unavailable until operator intervenes)
868 if (damaged.size()) {
869 return STUCK_UNAVAILABLE;
870 }
871
872 // If no ranks are created (filesystem not initialized)
873 if (in.empty()) {
874 return STUCK_UNAVAILABLE;
875 }
876
877 for (const auto rank : in) {
878 if (up.count(rank) && mds_info.at(up.at(rank)).laggy()) {
879 // This might only be transient, but because we can't see
880 // standbys, we have no way of knowing whether there is a
881 // standby available to replace the laggy guy.
882 return STUCK_UNAVAILABLE;
883 }
884 }
885
886 if (get_num_mds(CEPH_MDS_STATE_ACTIVE) > 0) {
887 // Nobody looks stuck, so indicate to client they should go ahead
888 // and try mounting if anybody is active. This may include e.g.
889 // one MDS failing over and another active: the client should
890 // proceed to start talking to the active one and let the
891 // transiently-unavailable guy catch up later.
892 return AVAILABLE;
893 } else {
894 // Nothing indicating we were stuck, but nobody active (yet)
895 //return TRANSIENT_UNAVAILABLE;
896
897 // Because we don't have standbys in the MDSMap any more, we can't
898 // reliably indicate transient vs. stuck, so always say stuck so
899 // that the client doesn't block.
900 return STUCK_UNAVAILABLE;
901 }
902 }
903
904 bool MDSMap::state_transition_valid(DaemonState prev, DaemonState next)
905 {
906 bool state_valid = true;
907 if (next != prev) {
908 if (prev == MDSMap::STATE_REPLAY) {
909 if (next != MDSMap::STATE_RESOLVE && next != MDSMap::STATE_RECONNECT) {
910 state_valid = false;
911 }
912 } else if (prev == MDSMap::STATE_REJOIN) {
913 if (next != MDSMap::STATE_ACTIVE &&
914 next != MDSMap::STATE_CLIENTREPLAY &&
915 next != MDSMap::STATE_STOPPED) {
916 state_valid = false;
917 }
918 } else if (prev >= MDSMap::STATE_RESOLVE && prev < MDSMap::STATE_ACTIVE) {
919 // Once I have entered replay, the only allowable transitions are to
920 // the next next along in the sequence.
921 if (next != prev + 1) {
922 state_valid = false;
923 }
924 }
925 }
926
927 return state_valid;
928 }
929
930 bool MDSMap::check_health(mds_rank_t standby_daemon_count)
931 {
932 std::set<mds_rank_t> standbys;
933 get_standby_replay_mds_set(standbys);
934 std::set<mds_rank_t> actives;
935 get_active_mds_set(actives);
936 mds_rank_t standbys_avail = (mds_rank_t)standbys.size()+standby_daemon_count;
937
938 /* If there are standby daemons available/replaying and
939 * standby_count_wanted is unset (default), then we set it to 1. This will
940 * happen during health checks by the mons. Also, during initial creation
941 * of the FS we will have no actives so we don't want to change the default
942 * yet.
943 */
944 if (standby_count_wanted == -1 && actives.size() > 0 && standbys_avail > 0) {
945 set_standby_count_wanted(1);
946 return true;
947 }
948 return false;
949 }
950
951 mds_gid_t MDSMap::find_mds_gid_by_name(std::string_view s) const {
952 for (const auto& [gid, info] : mds_info) {
953 if (info.name == s) {
954 return gid;
955 }
956 }
957 return MDS_GID_NONE;
958 }
959
960 unsigned MDSMap::get_num_mds(int state) const {
961 unsigned n = 0;
962 for (std::map<mds_gid_t,mds_info_t>::const_iterator p = mds_info.begin();
963 p != mds_info.end();
964 ++p)
965 if (p->second.state == state) ++n;
966 return n;
967 }
968
969 void MDSMap::get_up_mds_set(std::set<mds_rank_t>& s) const {
970 for (std::map<mds_rank_t, mds_gid_t>::const_iterator p = up.begin();
971 p != up.end();
972 ++p)
973 s.insert(p->first);
974 }
975
976 uint64_t MDSMap::get_up_features() {
977 if (!cached_up_features) {
978 bool first = true;
979 for (std::map<mds_rank_t, mds_gid_t>::const_iterator p = up.begin();
980 p != up.end();
981 ++p) {
982 std::map<mds_gid_t, mds_info_t>::const_iterator q =
983 mds_info.find(p->second);
984 ceph_assert(q != mds_info.end());
985 if (first) {
986 cached_up_features = q->second.mds_features;
987 first = false;
988 } else {
989 cached_up_features &= q->second.mds_features;
990 }
991 }
992 }
993 return cached_up_features;
994 }
995
996 void MDSMap::get_recovery_mds_set(std::set<mds_rank_t>& s) const {
997 s = failed;
998 for (const auto& p : damaged)
999 s.insert(p);
1000 for (const auto& p : mds_info)
1001 if (p.second.state >= STATE_REPLAY && p.second.state <= STATE_STOPPING)
1002 s.insert(p.second.rank);
1003 }
1004
1005 void MDSMap::get_mds_set_lower_bound(std::set<mds_rank_t>& s, DaemonState first) const {
1006 for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin();
1007 p != mds_info.end();
1008 ++p)
1009 if (p->second.state >= first && p->second.state <= STATE_STOPPING)
1010 s.insert(p->second.rank);
1011 }
1012
1013 void MDSMap::get_mds_set(std::set<mds_rank_t>& s, DaemonState state) const {
1014 for (std::map<mds_gid_t, mds_info_t>::const_iterator p = mds_info.begin();
1015 p != mds_info.end();
1016 ++p)
1017 if (p->second.state == state)
1018 s.insert(p->second.rank);
1019 }
1020
1021 mds_gid_t MDSMap::get_standby_replay(mds_rank_t r) const {
1022 for (auto& [gid,info] : mds_info) {
1023 if (info.rank == r && info.state == STATE_STANDBY_REPLAY) {
1024 return gid;
1025 }
1026 }
1027 return MDS_GID_NONE;
1028 }
1029
1030 bool MDSMap::is_degraded() const {
1031 if (!failed.empty() || !damaged.empty())
1032 return true;
1033 for (const auto& p : mds_info) {
1034 if (p.second.is_degraded())
1035 return true;
1036 }
1037 return false;
1038 }