1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
17 #include "common/debug.h"
18 #include "mon/health_check.h"
34 using ceph::bufferlist
;
35 using ceph::Formatter
;
37 #define dout_context g_ceph_context
38 #define dout_subsys ceph_subsys_
41 CompatSet
MDSMap::get_compat_set_all() {
42 CompatSet::FeatureSet feature_compat
;
43 CompatSet::FeatureSet feature_ro_compat
;
44 CompatSet::FeatureSet feature_incompat
;
45 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_BASE
);
46 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES
);
47 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT
);
48 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_DIRINODE
);
49 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_ENCODING
);
50 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG
);
51 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_INLINE
);
52 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_NOANCHOR
);
53 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2
);
54 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2
);
56 return CompatSet(feature_compat
, feature_ro_compat
, feature_incompat
);
59 CompatSet
MDSMap::get_compat_set_default() {
60 CompatSet::FeatureSet feature_compat
;
61 CompatSet::FeatureSet feature_ro_compat
;
62 CompatSet::FeatureSet feature_incompat
;
63 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_BASE
);
64 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES
);
65 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT
);
66 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_DIRINODE
);
67 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_ENCODING
);
68 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG
);
69 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_NOANCHOR
);
70 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2
);
71 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2
);
73 return CompatSet(feature_compat
, feature_ro_compat
, feature_incompat
);
77 CompatSet
MDSMap::get_compat_set_base() {
78 CompatSet::FeatureSet feature_compat_base
;
79 CompatSet::FeatureSet feature_incompat_base
;
80 feature_incompat_base
.insert(MDS_FEATURE_INCOMPAT_BASE
);
81 CompatSet::FeatureSet feature_ro_compat_base
;
83 return CompatSet(feature_compat_base
, feature_ro_compat_base
, feature_incompat_base
);
86 // pre-v16.2.5 CompatSet in MDS beacon
87 CompatSet
MDSMap::get_compat_set_v16_2_4() {
88 CompatSet::FeatureSet feature_compat
;
89 CompatSet::FeatureSet feature_ro_compat
;
90 CompatSet::FeatureSet feature_incompat
;
91 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_BASE
);
92 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES
);
93 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT
);
94 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_DIRINODE
);
95 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_ENCODING
);
96 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG
);
97 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_INLINE
);
98 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_NOANCHOR
);
99 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2
);
100 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2
);
101 return CompatSet(feature_compat
, feature_ro_compat
, feature_incompat
);
104 void MDSMap::mds_info_t::dump(Formatter
*f
) const
106 f
->dump_unsigned("gid", global_id
);
107 f
->dump_string("name", name
);
108 f
->dump_int("rank", rank
);
109 f
->dump_int("incarnation", inc
);
110 f
->dump_stream("state") << ceph_mds_state_name(state
);
111 f
->dump_int("state_seq", state_seq
);
112 f
->dump_stream("addr") << addrs
.get_legacy_str();
113 f
->dump_object("addrs", addrs
);
114 f
->dump_int("join_fscid", join_fscid
);
115 if (laggy_since
!= utime_t())
116 f
->dump_stream("laggy_since") << laggy_since
;
118 f
->open_array_section("export_targets");
119 for (set
<mds_rank_t
>::iterator p
= export_targets
.begin();
120 p
!= export_targets
.end(); ++p
) {
121 f
->dump_int("mds", *p
);
124 f
->dump_unsigned("features", mds_features
);
125 f
->dump_unsigned("flags", flags
);
126 f
->dump_object("compat", compat
);
129 void MDSMap::mds_info_t::dump(std::ostream
& o
) const
131 o
<< "[mds." << name
<< "{" << rank
<< ":" << global_id
<< "}"
132 << " state " << ceph_mds_state_name(state
)
133 << " seq " << state_seq
;
135 o
<< " laggy since " << laggy_since
;
137 if (!export_targets
.empty()) {
138 o
<< " export targets " << export_targets
;
143 if (join_fscid
!= FS_CLUSTER_ID_NONE
) {
144 o
<< " join_fscid=" << join_fscid
;
146 o
<< " addr " << addrs
;
152 void MDSMap::mds_info_t::generate_test_instances(std::list
<mds_info_t
*>& ls
)
154 mds_info_t
*sample
= new mds_info_t();
155 ls
.push_back(sample
);
156 sample
= new mds_info_t();
157 sample
->global_id
= 1;
158 sample
->name
= "test_instance";
160 ls
.push_back(sample
);
163 void MDSMap::dump(Formatter
*f
) const
165 f
->dump_int("epoch", epoch
);
166 f
->dump_unsigned("flags", flags
);
168 f
->dump_unsigned("ever_allowed_features", ever_allowed_features
);
169 f
->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features
);
170 f
->dump_stream("created") << created
;
171 f
->dump_stream("modified") << modified
;
172 f
->dump_int("tableserver", tableserver
);
173 f
->dump_int("root", root
);
174 f
->dump_int("session_timeout", session_timeout
);
175 f
->dump_int("session_autoclose", session_autoclose
);
176 f
->open_object_section("required_client_features");
177 cephfs_dump_features(f
, required_client_features
);
179 f
->dump_int("max_file_size", max_file_size
);
180 f
->dump_int("last_failure", last_failure
);
181 f
->dump_int("last_failure_osd_epoch", last_failure_osd_epoch
);
182 f
->open_object_section("compat");
185 f
->dump_int("max_mds", max_mds
);
186 f
->open_array_section("in");
187 for (set
<mds_rank_t
>::const_iterator p
= in
.begin(); p
!= in
.end(); ++p
)
188 f
->dump_int("mds", *p
);
190 f
->open_object_section("up");
191 for (map
<mds_rank_t
,mds_gid_t
>::const_iterator p
= up
.begin(); p
!= up
.end(); ++p
) {
193 sprintf(s
, "mds_%d", int(p
->first
));
194 f
->dump_int(s
, p
->second
);
197 f
->open_array_section("failed");
198 for (set
<mds_rank_t
>::const_iterator p
= failed
.begin(); p
!= failed
.end(); ++p
)
199 f
->dump_int("mds", *p
);
201 f
->open_array_section("damaged");
202 for (set
<mds_rank_t
>::const_iterator p
= damaged
.begin(); p
!= damaged
.end(); ++p
)
203 f
->dump_int("mds", *p
);
205 f
->open_array_section("stopped");
206 for (set
<mds_rank_t
>::const_iterator p
= stopped
.begin(); p
!= stopped
.end(); ++p
)
207 f
->dump_int("mds", *p
);
209 f
->open_object_section("info");
210 for (const auto& [gid
, info
] : mds_info
) {
211 char s
[25]; // 'gid_' + len(str(ULLONG_MAX)) + '\0'
212 sprintf(s
, "gid_%llu", (long long unsigned)gid
);
213 f
->open_object_section(s
);
218 f
->open_array_section("data_pools");
219 for (const auto& p
: data_pools
)
220 f
->dump_int("pool", p
);
222 f
->dump_int("metadata_pool", metadata_pool
);
223 f
->dump_bool("enabled", enabled
);
224 f
->dump_string("fs_name", fs_name
);
225 f
->dump_string("balancer", balancer
);
226 f
->dump_string("bal_rank_mask", bal_rank_mask
);
227 f
->dump_int("standby_count_wanted", std::max(0, standby_count_wanted
));
230 void MDSMap::dump_flags_state(Formatter
*f
) const
232 f
->open_object_section("flags_state");
233 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_NOT_JOINABLE
), joinable());
234 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_ALLOW_SNAPS
), allows_snaps());
235 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS
), allows_multimds_snaps());
236 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
), allows_standby_replay());
237 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
), test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
));
241 void MDSMap::generate_test_instances(std::list
<MDSMap
*>& ls
)
243 MDSMap
*m
= new MDSMap();
245 m
->data_pools
.push_back(0);
246 m
->metadata_pool
= 1;
248 m
->compat
= get_compat_set_all();
250 // these aren't the defaults, just in case anybody gets confused
251 m
->session_timeout
= 61;
252 m
->session_autoclose
= 301;
253 m
->max_file_size
= 1<<24;
257 void MDSMap::print(ostream
& out
) const
259 out
<< "fs_name\t" << fs_name
<< "\n";
260 out
<< "epoch\t" << epoch
<< "\n";
261 out
<< "flags\t" << hex
<< flags
<< dec
;
264 out
<< "created\t" << created
<< "\n";
265 out
<< "modified\t" << modified
<< "\n";
266 out
<< "tableserver\t" << tableserver
<< "\n";
267 out
<< "root\t" << root
<< "\n";
268 out
<< "session_timeout\t" << session_timeout
<< "\n"
269 << "session_autoclose\t" << session_autoclose
<< "\n";
270 out
<< "max_file_size\t" << max_file_size
<< "\n";
271 out
<< "required_client_features\t" << cephfs_stringify_features(required_client_features
) << "\n";
272 out
<< "last_failure\t" << last_failure
<< "\n"
273 << "last_failure_osd_epoch\t" << last_failure_osd_epoch
<< "\n";
274 out
<< "compat\t" << compat
<< "\n";
275 out
<< "max_mds\t" << max_mds
<< "\n";
276 out
<< "in\t" << in
<< "\n"
277 << "up\t" << up
<< "\n"
278 << "failed\t" << failed
<< "\n"
279 << "damaged\t" << damaged
<< "\n"
280 << "stopped\t" << stopped
<< "\n";
281 out
<< "data_pools\t" << data_pools
<< "\n";
282 out
<< "metadata_pool\t" << metadata_pool
<< "\n";
283 out
<< "inline_data\t" << (inline_data_enabled
? "enabled" : "disabled") << "\n";
284 out
<< "balancer\t" << balancer
<< "\n";
285 out
<< "bal_rank_mask\t" << bal_rank_mask
<< "\n";
286 out
<< "standby_count_wanted\t" << std::max(0, standby_count_wanted
) << "\n";
288 multimap
< pair
<mds_rank_t
, unsigned>, mds_gid_t
> foo
;
289 for (const auto &p
: mds_info
) {
290 foo
.insert(std::make_pair(
291 std::make_pair(p
.second
.rank
, p
.second
.inc
-1), p
.first
));
294 for (const auto &p
: foo
) {
295 out
<< mds_info
.at(p
.second
) << "\n";
299 void MDSMap::print_summary(Formatter
*f
, ostream
*out
) const
301 map
<mds_rank_t
,string
> by_rank
;
302 map
<string
,int> by_state
;
305 f
->dump_unsigned("epoch", get_epoch());
306 f
->dump_unsigned("up", up
.size());
307 f
->dump_unsigned("in", in
.size());
308 f
->dump_unsigned("max", max_mds
);
310 *out
<< "e" << get_epoch() << ": " << up
.size() << "/" << in
.size() << "/" << max_mds
<< " up";
314 f
->open_array_section("by_rank");
315 for (const auto &p
: mds_info
) {
316 string s
= ceph_mds_state_name(p
.second
.state
);
317 if (p
.second
.laggy())
318 s
+= "(laggy or crashed)";
320 if (p
.second
.rank
>= 0 && p
.second
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
322 f
->open_object_section("mds");
323 f
->dump_unsigned("rank", p
.second
.rank
);
324 f
->dump_string("name", p
.second
.name
);
325 f
->dump_string("status", s
);
328 by_rank
[p
.second
.rank
] = p
.second
.name
+ "=" + s
;
337 if (!by_rank
.empty())
338 *out
<< " " << by_rank
;
341 for (map
<string
,int>::reverse_iterator p
= by_state
.rbegin(); p
!= by_state
.rend(); ++p
) {
343 f
->dump_unsigned(p
->first
.c_str(), p
->second
);
345 *out
<< ", " << p
->second
<< " " << p
->first
;
349 if (!failed
.empty()) {
351 f
->dump_unsigned("failed", failed
.size());
353 *out
<< ", " << failed
.size() << " failed";
357 if (!damaged
.empty()) {
359 f
->dump_unsigned("damaged", damaged
.size());
361 *out
<< ", " << damaged
.size() << " damaged";
364 //if (stopped.size())
365 //out << ", " << stopped.size() << " stopped";
368 void MDSMap::print_flags(std::ostream
& out
) const {
370 out
<< " " << flag_display
.at(CEPH_MDSMAP_NOT_JOINABLE
);
372 out
<< " " << flag_display
.at(CEPH_MDSMAP_ALLOW_SNAPS
);
373 if (allows_multimds_snaps())
374 out
<< " " << flag_display
.at(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS
);
375 if (allows_standby_replay())
376 out
<< " " << flag_display
.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
);
377 if (test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
))
378 out
<< " " << flag_display
.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
);
381 void MDSMap::get_health(list
<pair
<health_status_t
,string
> >& summary
,
382 list
<pair
<health_status_t
,string
> > *detail
) const
384 if (!failed
.empty()) {
385 CachedStackStringStream css
;
387 << ((failed
.size() > 1) ? "s ":" ")
389 << ((failed
.size() > 1) ? " have":" has")
391 summary
.push_back(make_pair(HEALTH_ERR
, css
->str()));
393 for (const auto& r
: failed
) {
394 CachedStackStringStream css
;
395 *css
<< "mds." << r
<< " has failed";
396 detail
->push_back(make_pair(HEALTH_ERR
, css
->str()));
401 if (!damaged
.empty()) {
402 CachedStackStringStream css
;
404 << ((damaged
.size() > 1) ? "s ":" ")
406 << ((damaged
.size() > 1) ? " are":" is")
408 summary
.push_back(make_pair(HEALTH_ERR
, css
->str()));
410 for (const auto& r
: damaged
) {
411 CachedStackStringStream css
;
412 *css
<< "mds." << r
<< " is damaged";
413 detail
->push_back(make_pair(HEALTH_ERR
, css
->str()));
419 summary
.push_back(make_pair(HEALTH_WARN
, "mds cluster is degraded"));
421 detail
->push_back(make_pair(HEALTH_WARN
, "mds cluster is degraded"));
422 for (mds_rank_t i
= mds_rank_t(0); i
< get_max_mds(); i
++) {
425 mds_gid_t gid
= up
.find(i
)->second
;
426 const auto& info
= mds_info
.at(gid
);
427 CachedStackStringStream css
;
429 *css
<< "mds." << info
.name
<< " at " << info
.addrs
430 << " rank " << i
<< " is resolving";
432 *css
<< "mds." << info
.name
<< " at " << info
.addrs
433 << " rank " << i
<< " is replaying journal";
435 *css
<< "mds." << info
.name
<< " at " << info
.addrs
436 << " rank " << i
<< " is rejoining";
438 *css
<< "mds." << info
.name
<< " at " << info
.addrs
439 << " rank " << i
<< " is reconnecting to clients";
440 if (css
->strv().length())
441 detail
->push_back(make_pair(HEALTH_WARN
, css
->str()));
447 CachedStackStringStream css
;
448 *css
<< fs_name
<< " max_mds " << max_mds
;
449 summary
.push_back(make_pair(HEALTH_WARN
, css
->str()));
452 if ((mds_rank_t
)up
.size() < max_mds
) {
453 CachedStackStringStream css
;
454 *css
<< fs_name
<< " has " << up
.size()
455 << " active MDS(s), but has max_mds of " << max_mds
;
456 summary
.push_back(make_pair(HEALTH_WARN
, css
->str()));
460 for (const auto &u
: up
) {
461 const auto& info
= mds_info
.at(u
.second
);
463 laggy
.insert(info
.name
);
465 CachedStackStringStream css
;
466 *css
<< "mds." << info
.name
<< " at " << info
.addrs
467 << " is laggy/unresponsive";
468 detail
->push_back(make_pair(HEALTH_WARN
, css
->str()));
473 if (!laggy
.empty()) {
474 CachedStackStringStream css
;
475 *css
<< "mds " << laggy
476 << ((laggy
.size() > 1) ? " are":" is")
478 summary
.push_back(make_pair(HEALTH_WARN
, css
->str()));
481 if (get_max_mds() > 1 &&
482 was_snaps_ever_allowed() && !allows_multimds_snaps()) {
483 CachedStackStringStream css
;
484 *css
<< "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
485 summary
.push_back(make_pair(HEALTH_WARN
, css
->str()));
489 void MDSMap::get_health_checks(health_check_map_t
*checks
) const
492 if (!damaged
.empty()) {
493 health_check_t
& check
= checks
->get_or_add("MDS_DAMAGE", HEALTH_ERR
,
494 "%num% mds daemon%plurals% damaged",
496 for (const auto& p
: damaged
) {
497 CachedStackStringStream css
;
498 *css
<< "fs " << fs_name
<< " mds." << p
<< " is damaged";
499 check
.detail
.push_back(css
->str());
505 health_check_t
& fscheck
= checks
->get_or_add(
506 "FS_DEGRADED", HEALTH_WARN
,
507 "%num% filesystem%plurals% %isorare% degraded", 1);
508 CachedStackStringStream css
;
509 *css
<< "fs " << fs_name
<< " is degraded";
510 fscheck
.detail
.push_back(css
->str());
513 for (mds_rank_t i
= mds_rank_t(0); i
< get_max_mds(); i
++) {
516 mds_gid_t gid
= up
.find(i
)->second
;
517 const auto& info
= mds_info
.at(gid
);
518 CachedStackStringStream css
;
519 *css
<< "fs " << fs_name
<< " mds." << info
.name
<< " at "
520 << info
.addrs
<< " rank " << i
;
522 *css
<< " is resolving";
524 *css
<< " is replaying journal";
526 *css
<< " is rejoining";
528 *css
<< " is reconnecting to clients";
529 if (css
->strv().length())
530 detail
.push_back(css
->str());
534 // MDS_UP_LESS_THAN_MAX
535 if ((mds_rank_t
)get_num_in_mds() < get_max_mds()) {
536 health_check_t
& check
= checks
->add(
537 "MDS_UP_LESS_THAN_MAX", HEALTH_WARN
,
538 "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds", 1);
539 CachedStackStringStream css
;
540 *css
<< "fs " << fs_name
<< " has " << get_num_in_mds()
541 << " MDS online, but wants " << get_max_mds();
542 check
.detail
.push_back(css
->str());
546 if ((mds_rank_t
)get_num_up_mds() == 0 && get_max_mds() > 0) {
547 health_check_t
&check
= checks
->add(
548 "MDS_ALL_DOWN", HEALTH_ERR
,
549 "%num% filesystem%plurals% %isorare% offline", 1);
550 CachedStackStringStream css
;
551 *css
<< "fs " << fs_name
<< " is offline because no MDS is active for it.";
552 check
.detail
.push_back(css
->str());
555 if (get_max_mds() > 1 &&
556 was_snaps_ever_allowed() && !allows_multimds_snaps()) {
557 health_check_t
&check
= checks
->add(
558 "MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR
,
559 "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots", 1);
560 CachedStackStringStream css
;
561 *css
<< "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
562 check
.detail
.push_back(css
->str());
565 if (get_inline_data_enabled()) {
566 health_check_t
&check
= checks
->add(
567 "FS_INLINE_DATA_DEPRECATED", HEALTH_WARN
,
568 "%num% filesystem%plurals% with deprecated feature inline_data", 1);
569 CachedStackStringStream css
;
570 *css
<< "fs " << fs_name
<< " has deprecated feature inline_data enabled.";
571 check
.detail
.push_back(css
->str());
575 void MDSMap::mds_info_t::encode_versioned(bufferlist
& bl
, uint64_t features
) const
578 if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
581 ENCODE_START(v
, 4, bl
);
582 encode(global_id
, bl
);
586 encode((int32_t)state
, bl
);
587 encode(state_seq
, bl
);
589 encode(addrs
.legacy_addr(), bl
, features
);
591 encode(addrs
, bl
, features
);
593 encode(laggy_since
, bl
);
594 encode(MDS_RANK_NONE
, bl
); /* standby_for_rank */
595 encode(std::string(), bl
); /* standby_for_name */
596 encode(export_targets
, bl
);
597 encode(mds_features
, bl
);
598 encode(join_fscid
, bl
); /* formerly: standby_for_fscid */
609 void MDSMap::mds_info_t::encode_unversioned(bufferlist
& bl
) const
613 encode(struct_v
, bl
);
614 encode(global_id
, bl
);
618 encode((int32_t)state
, bl
);
619 encode(state_seq
, bl
);
620 encode(addrs
.legacy_addr(), bl
, 0);
621 encode(laggy_since
, bl
);
622 encode(MDS_RANK_NONE
, bl
);
623 encode(std::string(), bl
);
624 encode(export_targets
, bl
);
627 void MDSMap::mds_info_t::decode(bufferlist::const_iterator
& bl
)
629 DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl
);
630 decode(global_id
, bl
);
635 decode(raw_state
, bl
);
636 state
= (MDSMap::DaemonState
)raw_state
;
637 decode(state_seq
, bl
);
639 decode(laggy_since
, bl
);
641 mds_rank_t standby_for_rank
;
642 decode(standby_for_rank
, bl
);
645 std::string standby_for_name
;
646 decode(standby_for_name
, bl
);
649 decode(export_targets
, bl
);
651 decode(mds_features
, bl
);
653 decode(join_fscid
, bl
);
657 decode(standby_replay
, bl
);
662 if (struct_v
>= 10) {
665 compat
= MDSMap::get_compat_set_v16_2_4();
670 std::string
MDSMap::mds_info_t::human_name() const
672 // Like "daemon mds.myhost restarted", "Activating daemon mds.myhost"
673 CachedStackStringStream css
;
674 *css
<< "daemon mds." << name
;
678 void MDSMap::encode(bufferlist
& bl
, uint64_t features
) const
680 std::map
<mds_rank_t
,int32_t> inc
; // Legacy field, fake it so that
681 // old-mon peers have something sane
683 for (const auto rank
: in
) {
684 inc
.insert(std::make_pair(rank
, epoch
));
688 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
693 encode(last_failure
, bl
);
695 encode(session_timeout
, bl
);
696 encode(session_autoclose
, bl
);
697 encode(max_file_size
, bl
);
699 __u32 n
= mds_info
.size();
701 for (map
<mds_gid_t
, mds_info_t
>::const_iterator i
= mds_info
.begin();
702 i
!= mds_info
.end(); ++i
) {
703 encode(i
->first
, bl
);
704 encode(i
->second
, bl
, features
);
706 n
= data_pools
.size();
708 for (const auto p
: data_pools
) {
713 int32_t m
= cas_pool
;
716 } else if ((features
& CEPH_FEATURE_MDSENC
) == 0) {
721 encode(last_failure
, bl
);
723 encode(session_timeout
, bl
);
724 encode(session_autoclose
, bl
);
725 encode(max_file_size
, bl
);
727 __u32 n
= mds_info
.size();
729 for (map
<mds_gid_t
, mds_info_t
>::const_iterator i
= mds_info
.begin();
730 i
!= mds_info
.end(); ++i
) {
731 encode(i
->first
, bl
);
732 encode(i
->second
, bl
, features
);
734 encode(data_pools
, bl
);
735 encode(cas_pool
, bl
);
740 encode(metadata_pool
, bl
);
742 encode(modified
, bl
);
743 encode(tableserver
, bl
);
749 encode(last_failure_osd_epoch
, bl
);
753 ENCODE_START(5, 4, bl
);
756 encode(last_failure
, bl
);
758 encode(session_timeout
, bl
);
759 encode(session_autoclose
, bl
);
760 encode(max_file_size
, bl
);
762 encode(mds_info
, bl
, features
);
763 encode(data_pools
, bl
);
764 encode(cas_pool
, bl
);
769 encode(metadata_pool
, bl
);
771 encode(modified
, bl
);
772 encode(tableserver
, bl
);
778 encode(last_failure_osd_epoch
, bl
);
779 encode(ever_allowed_features
, bl
);
780 encode(explicitly_allowed_features
, bl
);
781 encode(inline_data_enabled
, bl
);
785 encode(balancer
, bl
);
786 encode(standby_count_wanted
, bl
);
787 encode(old_max_mds
, bl
);
789 ceph_release_t min_compat_client
= ceph_release_t::unknown
;
790 encode(min_compat_client
, bl
);
792 encode(required_client_features
, bl
);
793 encode(bal_rank_mask
, bl
);
797 void MDSMap::sanitize(const std::function
<bool(int64_t pool
)>& pool_exists
)
799 /* Before we did stricter checking, it was possible to remove a data pool
800 * without also deleting it from the MDSMap. Check for that here after
801 * decoding the data pools.
804 for (auto it
= data_pools
.begin(); it
!= data_pools
.end();) {
805 if (!pool_exists(*it
)) {
806 dout(0) << "removed non-existant data pool " << *it
<< " from MDSMap" << dendl
;
807 it
= data_pools
.erase(it
);
814 void MDSMap::decode(bufferlist::const_iterator
& p
)
816 std::map
<mds_rank_t
,int32_t> inc
; // Legacy field, parse and drop
818 cached_up_features
= 0;
819 DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p
);
822 decode(last_failure
, p
);
824 decode(session_timeout
, p
);
825 decode(session_autoclose
, p
);
826 decode(max_file_size
, p
);
835 data_pools
.push_back(m
);
841 decode(data_pools
, p
);
845 // kclient ignores everything from here
852 compat
= get_compat_set_base();
858 decode(metadata_pool
, p
);
862 decode(tableserver
, p
);
869 decode(last_failure_osd_epoch
, p
);
872 // previously this was a bool about snaps, not a flag map
875 ever_allowed_features
= flag
? CEPH_MDSMAP_ALLOW_SNAPS
: 0;
877 explicitly_allowed_features
= flag
? CEPH_MDSMAP_ALLOW_SNAPS
: 0;
879 decode(ever_allowed_features
, p
);
880 decode(explicitly_allowed_features
, p
);
883 ever_allowed_features
= 0;
884 explicitly_allowed_features
= 0;
887 decode(inline_data_enabled
, p
);
890 ceph_assert(struct_v
>= 5);
895 // If an MDS has ever been started, epoch will be greater than 1,
896 // assume filesystem is enabled.
899 // Upgrading from a cluster that never used an MDS, switch off
900 // filesystem until it's explicitly enabled.
914 decode(standby_count_wanted
, p
);
918 decode(old_max_mds
, p
);
922 ceph_release_t min_compat_client
;
927 min_compat_client
= ceph_release_t::unknown
;
929 min_compat_client
= ceph_release_t
{static_cast<uint8_t>(r
)};
931 } else if (ev
>= 15) {
932 decode(min_compat_client
, p
);
935 decode(required_client_features
, p
);
937 set_min_compat_client(min_compat_client
);
942 decode(bal_rank_mask
, p
);
945 /* All MDS since at least v14.0.0 understand INLINE */
946 /* TODO: remove after R is released */
947 compat
.incompat
.insert(MDS_FEATURE_INCOMPAT_INLINE
);
949 for (auto& p
: mds_info
) {
950 static const CompatSet empty
;
951 auto& info
= p
.second
;
952 if (empty
.compare(info
.compat
) == 0) {
953 /* bootstrap old compat; mds_info_t::decode does not have access to MDSMap */
954 info
.compat
= compat
;
956 /* All MDS since at least v14.0.0 understand INLINE */
957 /* TODO: remove after R is released */
958 info
.compat
.incompat
.insert(MDS_FEATURE_INCOMPAT_INLINE
);
964 MDSMap::availability_t
MDSMap::is_cluster_available() const
967 // If I'm a client, this means I'm looking at an MDSMap instance
968 // that was never actually initialized from the mons. Client should
970 return TRANSIENT_UNAVAILABLE
;
973 // If a rank is marked damage (unavailable until operator intervenes)
974 if (damaged
.size()) {
975 return STUCK_UNAVAILABLE
;
978 // If no ranks are created (filesystem not initialized)
980 return STUCK_UNAVAILABLE
;
983 for (const auto rank
: in
) {
984 if (up
.count(rank
) && mds_info
.at(up
.at(rank
)).laggy()) {
985 // This might only be transient, but because we can't see
986 // standbys, we have no way of knowing whether there is a
987 // standby available to replace the laggy guy.
988 return STUCK_UNAVAILABLE
;
992 if (get_num_mds(CEPH_MDS_STATE_ACTIVE
) > 0) {
993 // Nobody looks stuck, so indicate to client they should go ahead
994 // and try mounting if anybody is active. This may include e.g.
995 // one MDS failing over and another active: the client should
996 // proceed to start talking to the active one and let the
997 // transiently-unavailable guy catch up later.
1000 // Nothing indicating we were stuck, but nobody active (yet)
1001 //return TRANSIENT_UNAVAILABLE;
1003 // Because we don't have standbys in the MDSMap any more, we can't
1004 // reliably indicate transient vs. stuck, so always say stuck so
1005 // that the client doesn't block.
1006 return STUCK_UNAVAILABLE
;
1010 bool MDSMap::state_transition_valid(DaemonState prev
, DaemonState next
)
1014 if (next
== MDSMap::STATE_DAMAGED
)
1017 if (prev
== MDSMap::STATE_BOOT
) {
1018 return next
== MDSMap::STATE_STANDBY
;
1019 } else if (prev
== MDSMap::STATE_STANDBY
) {
1020 return next
== MDSMap::STATE_STANDBY_REPLAY
||
1021 next
== MDSMap::STATE_REPLAY
||
1022 next
== MDSMap::STATE_CREATING
||
1023 next
== MDSMap::STATE_STARTING
;
1024 } else if (prev
== MDSMap::STATE_CREATING
|| prev
== MDSMap::STATE_STARTING
) {
1025 return next
== MDSMap::STATE_ACTIVE
;
1026 } else if (prev
== MDSMap::STATE_STANDBY_REPLAY
) {
1027 return next
== MDSMap::STATE_REPLAY
;
1028 } else if (prev
== MDSMap::STATE_REPLAY
) {
1029 return next
== MDSMap::STATE_RESOLVE
||
1030 next
== MDSMap::STATE_RECONNECT
;
1031 } else if (prev
>= MDSMap::STATE_RESOLVE
&& prev
< MDSMap::STATE_ACTIVE
) {
1032 // Once I have entered replay, the only allowable transitions are to
1033 // the next next along in the sequence.
1035 if (prev
== MDSMap::STATE_REJOIN
&&
1036 (next
== MDSMap::STATE_ACTIVE
|| // No need to do client replay
1037 next
== MDSMap::STATE_STOPPED
)) { // no subtrees
1040 return next
== prev
+ 1;
1041 } else if (prev
== MDSMap::STATE_ACTIVE
) {
1042 return next
== MDSMap::STATE_STOPPING
;
1043 } else if (prev
== MDSMap::STATE_STOPPING
) {
1044 return next
== MDSMap::STATE_STOPPED
;
1046 derr
<< __func__
<< ": Unknown prev state "
1047 << ceph_mds_state_name(prev
) << "(" << prev
<< ")" << dendl
;
1052 bool MDSMap::check_health(mds_rank_t standby_daemon_count
)
1054 std::set
<mds_rank_t
> standbys
;
1055 get_standby_replay_mds_set(standbys
);
1056 std::set
<mds_rank_t
> actives
;
1057 get_active_mds_set(actives
);
1058 mds_rank_t standbys_avail
= (mds_rank_t
)standbys
.size()+standby_daemon_count
;
1060 /* If there are standby daemons available/replaying and
1061 * standby_count_wanted is unset (default), then we set it to 1. This will
1062 * happen during health checks by the mons. Also, during initial creation
1063 * of the FS we will have no actives so we don't want to change the default
1066 if (standby_count_wanted
== -1 && actives
.size() > 0 && standbys_avail
> 0) {
1067 set_standby_count_wanted(1);
1073 mds_gid_t
MDSMap::find_mds_gid_by_name(std::string_view s
) const {
1074 for (const auto& [gid
, info
] : mds_info
) {
1075 if (info
.name
== s
) {
1079 return MDS_GID_NONE
;
1082 unsigned MDSMap::get_num_mds(int state
) const {
1084 for (std::map
<mds_gid_t
,mds_info_t
>::const_iterator p
= mds_info
.begin();
1085 p
!= mds_info
.end();
1087 if (p
->second
.state
== state
) ++n
;
1091 void MDSMap::get_up_mds_set(std::set
<mds_rank_t
>& s
) const {
1092 for (std::map
<mds_rank_t
, mds_gid_t
>::const_iterator p
= up
.begin();
1098 uint64_t MDSMap::get_up_features() {
1099 if (!cached_up_features
) {
1101 for (std::map
<mds_rank_t
, mds_gid_t
>::const_iterator p
= up
.begin();
1104 std::map
<mds_gid_t
, mds_info_t
>::const_iterator q
=
1105 mds_info
.find(p
->second
);
1106 ceph_assert(q
!= mds_info
.end());
1108 cached_up_features
= q
->second
.mds_features
;
1111 cached_up_features
&= q
->second
.mds_features
;
1115 return cached_up_features
;
1118 void MDSMap::get_recovery_mds_set(std::set
<mds_rank_t
>& s
) const {
1120 for (const auto& p
: damaged
)
1122 for (const auto& p
: mds_info
)
1123 if (p
.second
.state
>= STATE_REPLAY
&& p
.second
.state
<= STATE_STOPPING
)
1124 s
.insert(p
.second
.rank
);
1127 void MDSMap::get_mds_set_lower_bound(std::set
<mds_rank_t
>& s
, DaemonState first
) const {
1128 for (std::map
<mds_gid_t
, mds_info_t
>::const_iterator p
= mds_info
.begin();
1129 p
!= mds_info
.end();
1131 if (p
->second
.state
>= first
&& p
->second
.state
<= STATE_STOPPING
)
1132 s
.insert(p
->second
.rank
);
1135 void MDSMap::get_mds_set(std::set
<mds_rank_t
>& s
, DaemonState state
) const {
1136 for (std::map
<mds_gid_t
, mds_info_t
>::const_iterator p
= mds_info
.begin();
1137 p
!= mds_info
.end();
1139 if (p
->second
.state
== state
)
1140 s
.insert(p
->second
.rank
);
1143 mds_gid_t
MDSMap::get_standby_replay(mds_rank_t r
) const {
1144 for (auto& [gid
,info
] : mds_info
) {
1145 if (info
.rank
== r
&& info
.state
== STATE_STANDBY_REPLAY
) {
1149 return MDS_GID_NONE
;
1152 bool MDSMap::is_degraded() const {
1153 if (!failed
.empty() || !damaged
.empty())
1155 for (const auto& p
: mds_info
) {
1156 if (p
.second
.is_degraded())
1162 void MDSMap::set_min_compat_client(ceph_release_t version
)
1164 vector
<size_t> bits
;
1166 if (version
>= ceph_release_t::octopus
)
1167 bits
.push_back(CEPHFS_FEATURE_OCTOPUS
);
1168 else if (version
>= ceph_release_t::nautilus
)
1169 bits
.push_back(CEPHFS_FEATURE_NAUTILUS
);
1170 else if (version
>= ceph_release_t::mimic
)
1171 bits
.push_back(CEPHFS_FEATURE_MIMIC
);
1172 else if (version
>= ceph_release_t::luminous
)
1173 bits
.push_back(CEPHFS_FEATURE_LUMINOUS
);
1174 else if (version
>= ceph_release_t::kraken
)
1175 bits
.push_back(CEPHFS_FEATURE_KRAKEN
);
1176 else if (version
>= ceph_release_t::jewel
)
1177 bits
.push_back(CEPHFS_FEATURE_JEWEL
);
1179 std::sort(bits
.begin(), bits
.end());
1180 required_client_features
= feature_bitset_t(bits
);
1183 const std::bitset
<MAX_MDS
>& MDSMap::get_bal_rank_mask_bitset() const {
1184 return bal_rank_mask_bitset
;
1187 void MDSMap::set_bal_rank_mask(std::string val
)
1189 bal_rank_mask
= val
;
1190 dout(10) << "set bal_rank_mask to \"" << bal_rank_mask
<< "\""<< dendl
;
1193 const bool MDSMap::check_special_bal_rank_mask(std::string val
, bal_rank_mask_type_t type
) const
1195 if ((type
== BAL_RANK_MASK_TYPE_ANY
|| type
== BAL_RANK_MASK_TYPE_ALL
) && (val
== "-1" || val
== "all")) {
1198 if ((type
== BAL_RANK_MASK_TYPE_ANY
|| type
== BAL_RANK_MASK_TYPE_NONE
) && (val
== "0x0" || val
== "0")) {
1204 void MDSMap::update_num_mdss_in_rank_mask_bitset()
1208 if (bal_rank_mask
.length() && !check_special_bal_rank_mask(bal_rank_mask
, BAL_RANK_MASK_TYPE_ANY
)) {
1209 std::string bin_string
;
1210 CachedStackStringStream css
;
1212 r
= hex2bin(bal_rank_mask
, bin_string
, MAX_MDS
, *css
);
1214 auto _mds_bal_mask_bitset
= std::bitset
<MAX_MDS
>(bin_string
);
1215 bal_rank_mask_bitset
= _mds_bal_mask_bitset
;
1216 num_mdss_in_rank_mask_bitset
= _mds_bal_mask_bitset
.count();
1218 dout(10) << css
->str() << dendl
;
1223 if (check_special_bal_rank_mask(bal_rank_mask
, BAL_RANK_MASK_TYPE_NONE
)) {
1224 dout(10) << "Balancer is disabled with bal_rank_mask " << bal_rank_mask
<< dendl
;
1225 bal_rank_mask_bitset
.reset();
1226 num_mdss_in_rank_mask_bitset
= 0;
1228 dout(10) << "Balancer distributes mds workloads to all ranks as bal_rank_mask is empty or invalid" << dendl
;
1229 bal_rank_mask_bitset
.set();
1230 num_mdss_in_rank_mask_bitset
= get_max_mds();
1234 dout(10) << "update num_mdss_in_rank_mask_bitset to " << num_mdss_in_rank_mask_bitset
<< dendl
;
1237 int MDSMap::hex2bin(std::string hex_string
, std::string
&bin_string
, unsigned int max_bits
, std::ostream
& ss
) const
1239 static const unsigned int BITS_PER_QUARTET
= CHAR_BIT
/ 2;
1240 static const unsigned int BITS_PER_ULLONG
= sizeof(unsigned long long) * CHAR_BIT
;
1241 static const unsigned int QUARTETS_PER_ULLONG
= BITS_PER_ULLONG
/BITS_PER_QUARTET
;
1242 unsigned int offset
= 0;
1244 std::transform(hex_string
.begin(), hex_string
.end(), hex_string
.begin(), ::tolower
);
1246 if (hex_string
.substr(0, 2) == "0x") {
1250 for (unsigned int i
= offset
; i
< hex_string
.size(); i
+= QUARTETS_PER_ULLONG
) {
1251 unsigned long long value
;
1253 value
= stoull(hex_string
.substr(i
, QUARTETS_PER_ULLONG
), nullptr, 16);
1254 } catch (std::invalid_argument
const& ex
) {
1255 ss
<< "invalid hex value ";
1258 auto bit_str
= std::bitset
<BITS_PER_ULLONG
>(value
);
1259 bin_string
+= bit_str
.to_string();
1262 if (bin_string
.length() > max_bits
) {
1263 ss
<< "a value exceeds max_mds " << max_bits
;
1267 if (bin_string
.find('1') == std::string::npos
) {
1268 ss
<< "at least one rank must be set";
1272 if (bin_string
.length() < max_bits
) {
1273 bin_string
.insert(0, max_bits
- bin_string
.length(), '0');