1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
17 #include "common/debug.h"
18 #include "mon/health_check.h"
34 using ceph::bufferlist
;
35 using ceph::Formatter
;
37 #define dout_context g_ceph_context
38 #define dout_subsys ceph_subsys_
41 CompatSet
MDSMap::get_compat_set_all() {
42 CompatSet::FeatureSet feature_compat
;
43 CompatSet::FeatureSet feature_ro_compat
;
44 CompatSet::FeatureSet feature_incompat
;
45 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_BASE
);
46 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES
);
47 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT
);
48 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_DIRINODE
);
49 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_ENCODING
);
50 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG
);
51 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_INLINE
);
52 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_NOANCHOR
);
53 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2
);
54 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2
);
56 return CompatSet(feature_compat
, feature_ro_compat
, feature_incompat
);
59 CompatSet
MDSMap::get_compat_set_default() {
60 CompatSet::FeatureSet feature_compat
;
61 CompatSet::FeatureSet feature_ro_compat
;
62 CompatSet::FeatureSet feature_incompat
;
63 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_BASE
);
64 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES
);
65 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT
);
66 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_DIRINODE
);
67 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_ENCODING
);
68 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG
);
69 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_NOANCHOR
);
70 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2
);
71 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2
);
73 return CompatSet(feature_compat
, feature_ro_compat
, feature_incompat
);
77 CompatSet
MDSMap::get_compat_set_base() {
78 CompatSet::FeatureSet feature_compat_base
;
79 CompatSet::FeatureSet feature_incompat_base
;
80 feature_incompat_base
.insert(MDS_FEATURE_INCOMPAT_BASE
);
81 CompatSet::FeatureSet feature_ro_compat_base
;
83 return CompatSet(feature_compat_base
, feature_ro_compat_base
, feature_incompat_base
);
86 // pre-v16.2.5 CompatSet in MDS beacon
87 CompatSet
MDSMap::get_compat_set_v16_2_4() {
88 CompatSet::FeatureSet feature_compat
;
89 CompatSet::FeatureSet feature_ro_compat
;
90 CompatSet::FeatureSet feature_incompat
;
91 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_BASE
);
92 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_CLIENTRANGES
);
93 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILELAYOUT
);
94 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_DIRINODE
);
95 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_ENCODING
);
96 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_OMAPDIRFRAG
);
97 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_INLINE
);
98 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_NOANCHOR
);
99 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2
);
100 feature_incompat
.insert(MDS_FEATURE_INCOMPAT_SNAPREALM_V2
);
101 return CompatSet(feature_compat
, feature_ro_compat
, feature_incompat
);
104 void MDSMap::mds_info_t::dump(Formatter
*f
) const
106 f
->dump_unsigned("gid", global_id
);
107 f
->dump_string("name", name
);
108 f
->dump_int("rank", rank
);
109 f
->dump_int("incarnation", inc
);
110 f
->dump_stream("state") << ceph_mds_state_name(state
);
111 f
->dump_int("state_seq", state_seq
);
112 f
->dump_stream("addr") << addrs
.get_legacy_str();
113 f
->dump_object("addrs", addrs
);
114 f
->dump_int("join_fscid", join_fscid
);
115 if (laggy_since
!= utime_t())
116 f
->dump_stream("laggy_since") << laggy_since
;
118 f
->open_array_section("export_targets");
119 for (set
<mds_rank_t
>::iterator p
= export_targets
.begin();
120 p
!= export_targets
.end(); ++p
) {
121 f
->dump_int("mds", *p
);
124 f
->dump_unsigned("features", mds_features
);
125 f
->dump_unsigned("flags", flags
);
126 f
->dump_object("compat", compat
);
129 void MDSMap::mds_info_t::dump(std::ostream
& o
) const
131 o
<< "[mds." << name
<< "{" << rank
<< ":" << global_id
<< "}"
132 << " state " << ceph_mds_state_name(state
)
133 << " seq " << state_seq
;
135 o
<< " laggy since " << laggy_since
;
137 if (!export_targets
.empty()) {
138 o
<< " export targets " << export_targets
;
143 if (join_fscid
!= FS_CLUSTER_ID_NONE
) {
144 o
<< " join_fscid=" << join_fscid
;
146 o
<< " addr " << addrs
;
152 void MDSMap::mds_info_t::generate_test_instances(std::list
<mds_info_t
*>& ls
)
154 mds_info_t
*sample
= new mds_info_t();
155 ls
.push_back(sample
);
156 sample
= new mds_info_t();
157 sample
->global_id
= 1;
158 sample
->name
= "test_instance";
160 ls
.push_back(sample
);
163 void MDSMap::dump(Formatter
*f
) const
165 f
->dump_int("epoch", epoch
);
166 f
->dump_unsigned("flags", flags
);
168 f
->dump_unsigned("ever_allowed_features", ever_allowed_features
);
169 f
->dump_unsigned("explicitly_allowed_features", explicitly_allowed_features
);
170 f
->dump_stream("created") << created
;
171 f
->dump_stream("modified") << modified
;
172 f
->dump_int("tableserver", tableserver
);
173 f
->dump_int("root", root
);
174 f
->dump_int("session_timeout", session_timeout
);
175 f
->dump_int("session_autoclose", session_autoclose
);
176 f
->open_object_section("required_client_features");
177 cephfs_dump_features(f
, required_client_features
);
179 f
->dump_int("max_file_size", max_file_size
);
180 f
->dump_int("max_xattr_size", max_xattr_size
);
181 f
->dump_int("last_failure", last_failure
);
182 f
->dump_int("last_failure_osd_epoch", last_failure_osd_epoch
);
183 f
->open_object_section("compat");
186 f
->dump_int("max_mds", max_mds
);
187 f
->open_array_section("in");
188 for (set
<mds_rank_t
>::const_iterator p
= in
.begin(); p
!= in
.end(); ++p
)
189 f
->dump_int("mds", *p
);
191 f
->open_object_section("up");
192 for (map
<mds_rank_t
,mds_gid_t
>::const_iterator p
= up
.begin(); p
!= up
.end(); ++p
) {
194 sprintf(s
, "mds_%d", int(p
->first
));
195 f
->dump_int(s
, p
->second
);
198 f
->open_array_section("failed");
199 for (set
<mds_rank_t
>::const_iterator p
= failed
.begin(); p
!= failed
.end(); ++p
)
200 f
->dump_int("mds", *p
);
202 f
->open_array_section("damaged");
203 for (set
<mds_rank_t
>::const_iterator p
= damaged
.begin(); p
!= damaged
.end(); ++p
)
204 f
->dump_int("mds", *p
);
206 f
->open_array_section("stopped");
207 for (set
<mds_rank_t
>::const_iterator p
= stopped
.begin(); p
!= stopped
.end(); ++p
)
208 f
->dump_int("mds", *p
);
210 f
->open_object_section("info");
211 for (const auto& [gid
, info
] : mds_info
) {
212 char s
[25]; // 'gid_' + len(str(ULLONG_MAX)) + '\0'
213 sprintf(s
, "gid_%llu", (long long unsigned)gid
);
214 f
->open_object_section(s
);
219 f
->open_array_section("data_pools");
220 for (const auto& p
: data_pools
)
221 f
->dump_int("pool", p
);
223 f
->dump_int("metadata_pool", metadata_pool
);
224 f
->dump_bool("enabled", enabled
);
225 f
->dump_string("fs_name", fs_name
);
226 f
->dump_string("balancer", balancer
);
227 f
->dump_string("bal_rank_mask", bal_rank_mask
);
228 f
->dump_int("standby_count_wanted", std::max(0, standby_count_wanted
));
231 void MDSMap::dump_flags_state(Formatter
*f
) const
233 f
->open_object_section("flags_state");
234 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_NOT_JOINABLE
), joinable());
235 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_ALLOW_SNAPS
), allows_snaps());
236 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS
), allows_multimds_snaps());
237 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
), allows_standby_replay());
238 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
), test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
));
239 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS
), test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS
));
240 f
->dump_bool(flag_display
.at(CEPH_MDSMAP_BALANCE_AUTOMATE
), test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE
));
244 void MDSMap::generate_test_instances(std::list
<MDSMap
*>& ls
)
246 MDSMap
*m
= new MDSMap();
248 m
->data_pools
.push_back(0);
249 m
->metadata_pool
= 1;
251 m
->compat
= get_compat_set_all();
253 // these aren't the defaults, just in case anybody gets confused
254 m
->session_timeout
= 61;
255 m
->session_autoclose
= 301;
256 m
->max_file_size
= 1<<24;
260 void MDSMap::print(ostream
& out
) const
262 out
<< "fs_name\t" << fs_name
<< "\n";
263 out
<< "epoch\t" << epoch
<< "\n";
264 out
<< "flags\t" << hex
<< flags
<< dec
;
267 out
<< "created\t" << created
<< "\n";
268 out
<< "modified\t" << modified
<< "\n";
269 out
<< "tableserver\t" << tableserver
<< "\n";
270 out
<< "root\t" << root
<< "\n";
271 out
<< "session_timeout\t" << session_timeout
<< "\n"
272 << "session_autoclose\t" << session_autoclose
<< "\n";
273 out
<< "max_file_size\t" << max_file_size
<< "\n";
274 out
<< "max_xattr_size\t" << max_xattr_size
<< "\n";
275 out
<< "required_client_features\t" << cephfs_stringify_features(required_client_features
) << "\n";
276 out
<< "last_failure\t" << last_failure
<< "\n"
277 << "last_failure_osd_epoch\t" << last_failure_osd_epoch
<< "\n";
278 out
<< "compat\t" << compat
<< "\n";
279 out
<< "max_mds\t" << max_mds
<< "\n";
280 out
<< "in\t" << in
<< "\n"
281 << "up\t" << up
<< "\n"
282 << "failed\t" << failed
<< "\n"
283 << "damaged\t" << damaged
<< "\n"
284 << "stopped\t" << stopped
<< "\n";
285 out
<< "data_pools\t" << data_pools
<< "\n";
286 out
<< "metadata_pool\t" << metadata_pool
<< "\n";
287 out
<< "inline_data\t" << (inline_data_enabled
? "enabled" : "disabled") << "\n";
288 out
<< "balancer\t" << balancer
<< "\n";
289 out
<< "bal_rank_mask\t" << bal_rank_mask
<< "\n";
290 out
<< "standby_count_wanted\t" << std::max(0, standby_count_wanted
) << "\n";
292 multimap
< pair
<mds_rank_t
, unsigned>, mds_gid_t
> foo
;
293 for (const auto &p
: mds_info
) {
294 foo
.insert(std::make_pair(
295 std::make_pair(p
.second
.rank
, p
.second
.inc
-1), p
.first
));
298 for (const auto &p
: foo
) {
299 out
<< mds_info
.at(p
.second
) << "\n";
303 void MDSMap::print_summary(Formatter
*f
, ostream
*out
) const
305 map
<mds_rank_t
,string
> by_rank
;
306 map
<string
,int> by_state
;
309 f
->dump_unsigned("epoch", get_epoch());
310 f
->dump_unsigned("up", up
.size());
311 f
->dump_unsigned("in", in
.size());
312 f
->dump_unsigned("max", max_mds
);
314 *out
<< "e" << get_epoch() << ": " << up
.size() << "/" << in
.size() << "/" << max_mds
<< " up";
318 f
->open_array_section("by_rank");
319 for (const auto &p
: mds_info
) {
320 string s
= ceph_mds_state_name(p
.second
.state
);
321 if (p
.second
.laggy())
322 s
+= "(laggy or crashed)";
324 if (p
.second
.rank
>= 0 && p
.second
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
326 f
->open_object_section("mds");
327 f
->dump_unsigned("rank", p
.second
.rank
);
328 f
->dump_string("name", p
.second
.name
);
329 f
->dump_string("status", s
);
332 by_rank
[p
.second
.rank
] = p
.second
.name
+ "=" + s
;
341 if (!by_rank
.empty())
342 *out
<< " " << by_rank
;
345 for (map
<string
,int>::reverse_iterator p
= by_state
.rbegin(); p
!= by_state
.rend(); ++p
) {
347 f
->dump_unsigned(p
->first
.c_str(), p
->second
);
349 *out
<< ", " << p
->second
<< " " << p
->first
;
353 if (!failed
.empty()) {
355 f
->dump_unsigned("failed", failed
.size());
357 *out
<< ", " << failed
.size() << " failed";
361 if (!damaged
.empty()) {
363 f
->dump_unsigned("damaged", damaged
.size());
365 *out
<< ", " << damaged
.size() << " damaged";
368 //if (stopped.size())
369 //out << ", " << stopped.size() << " stopped";
372 void MDSMap::print_flags(std::ostream
& out
) const {
374 out
<< " " << flag_display
.at(CEPH_MDSMAP_NOT_JOINABLE
);
376 out
<< " " << flag_display
.at(CEPH_MDSMAP_ALLOW_SNAPS
);
377 if (allows_multimds_snaps())
378 out
<< " " << flag_display
.at(CEPH_MDSMAP_ALLOW_MULTIMDS_SNAPS
);
379 if (allows_standby_replay())
380 out
<< " " << flag_display
.at(CEPH_MDSMAP_ALLOW_STANDBY_REPLAY
);
381 if (test_flag(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
))
382 out
<< " " << flag_display
.at(CEPH_MDSMAP_REFUSE_CLIENT_SESSION
);
383 if (test_flag(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS
))
384 out
<< " " << flag_display
.at(CEPH_MDSMAP_REFUSE_STANDBY_FOR_ANOTHER_FS
);
385 if (test_flag(CEPH_MDSMAP_BALANCE_AUTOMATE
))
386 out
<< " " << flag_display
.at(CEPH_MDSMAP_BALANCE_AUTOMATE
);
389 void MDSMap::get_health(list
<pair
<health_status_t
,string
> >& summary
,
390 list
<pair
<health_status_t
,string
> > *detail
) const
392 if (!failed
.empty()) {
393 CachedStackStringStream css
;
395 << ((failed
.size() > 1) ? "s ":" ")
397 << ((failed
.size() > 1) ? " have":" has")
399 summary
.push_back(make_pair(HEALTH_ERR
, css
->str()));
401 for (const auto& r
: failed
) {
402 CachedStackStringStream css
;
403 *css
<< "mds." << r
<< " has failed";
404 detail
->push_back(make_pair(HEALTH_ERR
, css
->str()));
409 if (!damaged
.empty()) {
410 CachedStackStringStream css
;
412 << ((damaged
.size() > 1) ? "s ":" ")
414 << ((damaged
.size() > 1) ? " are":" is")
416 summary
.push_back(make_pair(HEALTH_ERR
, css
->str()));
418 for (const auto& r
: damaged
) {
419 CachedStackStringStream css
;
420 *css
<< "mds." << r
<< " is damaged";
421 detail
->push_back(make_pair(HEALTH_ERR
, css
->str()));
427 summary
.push_back(make_pair(HEALTH_WARN
, "mds cluster is degraded"));
429 detail
->push_back(make_pair(HEALTH_WARN
, "mds cluster is degraded"));
430 for (mds_rank_t i
= mds_rank_t(0); i
< get_max_mds(); i
++) {
433 mds_gid_t gid
= up
.find(i
)->second
;
434 const auto& info
= mds_info
.at(gid
);
435 CachedStackStringStream css
;
437 *css
<< "mds." << info
.name
<< " at " << info
.addrs
438 << " rank " << i
<< " is resolving";
440 *css
<< "mds." << info
.name
<< " at " << info
.addrs
441 << " rank " << i
<< " is replaying journal";
443 *css
<< "mds." << info
.name
<< " at " << info
.addrs
444 << " rank " << i
<< " is rejoining";
446 *css
<< "mds." << info
.name
<< " at " << info
.addrs
447 << " rank " << i
<< " is reconnecting to clients";
448 if (css
->strv().length())
449 detail
->push_back(make_pair(HEALTH_WARN
, css
->str()));
455 CachedStackStringStream css
;
456 *css
<< fs_name
<< " max_mds " << max_mds
;
457 summary
.push_back(make_pair(HEALTH_WARN
, css
->str()));
460 if ((mds_rank_t
)up
.size() < max_mds
) {
461 CachedStackStringStream css
;
462 *css
<< fs_name
<< " has " << up
.size()
463 << " active MDS(s), but has max_mds of " << max_mds
;
464 summary
.push_back(make_pair(HEALTH_WARN
, css
->str()));
468 for (const auto &u
: up
) {
469 const auto& info
= mds_info
.at(u
.second
);
471 laggy
.insert(info
.name
);
473 CachedStackStringStream css
;
474 *css
<< "mds." << info
.name
<< " at " << info
.addrs
475 << " is laggy/unresponsive";
476 detail
->push_back(make_pair(HEALTH_WARN
, css
->str()));
481 if (!laggy
.empty()) {
482 CachedStackStringStream css
;
483 *css
<< "mds " << laggy
484 << ((laggy
.size() > 1) ? " are":" is")
486 summary
.push_back(make_pair(HEALTH_WARN
, css
->str()));
489 if (get_max_mds() > 1 &&
490 was_snaps_ever_allowed() && !allows_multimds_snaps()) {
491 CachedStackStringStream css
;
492 *css
<< "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
493 summary
.push_back(make_pair(HEALTH_WARN
, css
->str()));
497 void MDSMap::get_health_checks(health_check_map_t
*checks
) const
500 if (!damaged
.empty()) {
501 health_check_t
& check
= checks
->get_or_add("MDS_DAMAGE", HEALTH_ERR
,
502 "%num% mds daemon%plurals% damaged",
504 for (const auto& p
: damaged
) {
505 CachedStackStringStream css
;
506 *css
<< "fs " << fs_name
<< " mds." << p
<< " is damaged";
507 check
.detail
.push_back(css
->str());
513 health_check_t
& fscheck
= checks
->get_or_add(
514 "FS_DEGRADED", HEALTH_WARN
,
515 "%num% filesystem%plurals% %isorare% degraded", 1);
516 CachedStackStringStream css
;
517 *css
<< "fs " << fs_name
<< " is degraded";
518 fscheck
.detail
.push_back(css
->str());
521 for (mds_rank_t i
= mds_rank_t(0); i
< get_max_mds(); i
++) {
524 mds_gid_t gid
= up
.find(i
)->second
;
525 const auto& info
= mds_info
.at(gid
);
526 CachedStackStringStream css
;
527 *css
<< "fs " << fs_name
<< " mds." << info
.name
<< " at "
528 << info
.addrs
<< " rank " << i
;
530 *css
<< " is resolving";
532 *css
<< " is replaying journal";
534 *css
<< " is rejoining";
536 *css
<< " is reconnecting to clients";
537 if (css
->strv().length())
538 detail
.push_back(css
->str());
542 // MDS_UP_LESS_THAN_MAX
543 if ((mds_rank_t
)get_num_in_mds() < get_max_mds()) {
544 health_check_t
& check
= checks
->add(
545 "MDS_UP_LESS_THAN_MAX", HEALTH_WARN
,
546 "%num% filesystem%plurals% %isorare% online with fewer MDS than max_mds", 1);
547 CachedStackStringStream css
;
548 *css
<< "fs " << fs_name
<< " has " << get_num_in_mds()
549 << " MDS online, but wants " << get_max_mds();
550 check
.detail
.push_back(css
->str());
554 if ((mds_rank_t
)get_num_up_mds() == 0 && get_max_mds() > 0) {
555 health_check_t
&check
= checks
->add(
556 "MDS_ALL_DOWN", HEALTH_ERR
,
557 "%num% filesystem%plurals% %isorare% offline", 1);
558 CachedStackStringStream css
;
559 *css
<< "fs " << fs_name
<< " is offline because no MDS is active for it.";
560 check
.detail
.push_back(css
->str());
563 if (get_max_mds() > 1 &&
564 was_snaps_ever_allowed() && !allows_multimds_snaps()) {
565 health_check_t
&check
= checks
->add(
566 "MULTIMDS_WITH_OLDSNAPS", HEALTH_ERR
,
567 "%num% filesystem%plurals% %isorare% multi-active mds with old snapshots", 1);
568 CachedStackStringStream css
;
569 *css
<< "multi-active mds while there are snapshots possibly created by pre-mimic MDS";
570 check
.detail
.push_back(css
->str());
573 if (get_inline_data_enabled()) {
574 health_check_t
&check
= checks
->add(
575 "FS_INLINE_DATA_DEPRECATED", HEALTH_WARN
,
576 "%num% filesystem%plurals% with deprecated feature inline_data", 1);
577 CachedStackStringStream css
;
578 *css
<< "fs " << fs_name
<< " has deprecated feature inline_data enabled.";
579 check
.detail
.push_back(css
->str());
583 void MDSMap::mds_info_t::encode_versioned(bufferlist
& bl
, uint64_t features
) const
586 if (!HAVE_FEATURE(features
, SERVER_NAUTILUS
)) {
589 ENCODE_START(v
, 4, bl
);
590 encode(global_id
, bl
);
594 encode((int32_t)state
, bl
);
595 encode(state_seq
, bl
);
597 encode(addrs
.legacy_addr(), bl
, features
);
599 encode(addrs
, bl
, features
);
601 encode(laggy_since
, bl
);
602 encode(MDS_RANK_NONE
, bl
); /* standby_for_rank */
603 encode(std::string(), bl
); /* standby_for_name */
604 encode(export_targets
, bl
);
605 encode(mds_features
, bl
);
606 encode(join_fscid
, bl
); /* formerly: standby_for_fscid */
617 void MDSMap::mds_info_t::encode_unversioned(bufferlist
& bl
) const
621 encode(struct_v
, bl
);
622 encode(global_id
, bl
);
626 encode((int32_t)state
, bl
);
627 encode(state_seq
, bl
);
628 encode(addrs
.legacy_addr(), bl
, 0);
629 encode(laggy_since
, bl
);
630 encode(MDS_RANK_NONE
, bl
);
631 encode(std::string(), bl
);
632 encode(export_targets
, bl
);
635 void MDSMap::mds_info_t::decode(bufferlist::const_iterator
& bl
)
637 DECODE_START_LEGACY_COMPAT_LEN(10, 4, 4, bl
);
638 decode(global_id
, bl
);
643 decode(raw_state
, bl
);
644 state
= (MDSMap::DaemonState
)raw_state
;
645 decode(state_seq
, bl
);
647 decode(laggy_since
, bl
);
649 mds_rank_t standby_for_rank
;
650 decode(standby_for_rank
, bl
);
653 std::string standby_for_name
;
654 decode(standby_for_name
, bl
);
657 decode(export_targets
, bl
);
659 decode(mds_features
, bl
);
661 decode(join_fscid
, bl
);
665 decode(standby_replay
, bl
);
670 if (struct_v
>= 10) {
673 compat
= MDSMap::get_compat_set_v16_2_4();
678 std::string
MDSMap::mds_info_t::human_name() const
680 // Like "daemon mds.myhost restarted", "Activating daemon mds.myhost"
681 CachedStackStringStream css
;
682 *css
<< "daemon mds." << name
;
686 void MDSMap::encode(bufferlist
& bl
, uint64_t features
) const
688 std::map
<mds_rank_t
,int32_t> inc
; // Legacy field, fake it so that
689 // old-mon peers have something sane
691 for (const auto rank
: in
) {
692 inc
.insert(std::make_pair(rank
, epoch
));
696 if ((features
& CEPH_FEATURE_PGID64
) == 0) {
701 encode(last_failure
, bl
);
703 encode(session_timeout
, bl
);
704 encode(session_autoclose
, bl
);
705 encode(max_file_size
, bl
);
707 __u32 n
= mds_info
.size();
709 for (map
<mds_gid_t
, mds_info_t
>::const_iterator i
= mds_info
.begin();
710 i
!= mds_info
.end(); ++i
) {
711 encode(i
->first
, bl
);
712 encode(i
->second
, bl
, features
);
714 n
= data_pools
.size();
716 for (const auto p
: data_pools
) {
721 int32_t m
= cas_pool
;
724 } else if ((features
& CEPH_FEATURE_MDSENC
) == 0) {
729 encode(last_failure
, bl
);
731 encode(session_timeout
, bl
);
732 encode(session_autoclose
, bl
);
733 encode(max_file_size
, bl
);
735 __u32 n
= mds_info
.size();
737 for (map
<mds_gid_t
, mds_info_t
>::const_iterator i
= mds_info
.begin();
738 i
!= mds_info
.end(); ++i
) {
739 encode(i
->first
, bl
);
740 encode(i
->second
, bl
, features
);
742 encode(data_pools
, bl
);
743 encode(cas_pool
, bl
);
748 encode(metadata_pool
, bl
);
750 encode(modified
, bl
);
751 encode(tableserver
, bl
);
757 encode(last_failure_osd_epoch
, bl
);
761 ENCODE_START(5, 4, bl
);
764 encode(last_failure
, bl
);
766 encode(session_timeout
, bl
);
767 encode(session_autoclose
, bl
);
768 encode(max_file_size
, bl
);
770 encode(mds_info
, bl
, features
);
771 encode(data_pools
, bl
);
772 encode(cas_pool
, bl
);
777 encode(metadata_pool
, bl
);
779 encode(modified
, bl
);
780 encode(tableserver
, bl
);
786 encode(last_failure_osd_epoch
, bl
);
787 encode(ever_allowed_features
, bl
);
788 encode(explicitly_allowed_features
, bl
);
789 encode(inline_data_enabled
, bl
);
793 encode(balancer
, bl
);
794 encode(standby_count_wanted
, bl
);
795 encode(old_max_mds
, bl
);
797 ceph_release_t min_compat_client
= ceph_release_t::unknown
;
798 encode(min_compat_client
, bl
);
800 encode(required_client_features
, bl
);
801 encode(bal_rank_mask
, bl
);
802 encode(max_xattr_size
, bl
);
806 void MDSMap::sanitize(const std::function
<bool(int64_t pool
)>& pool_exists
)
808 /* Before we did stricter checking, it was possible to remove a data pool
809 * without also deleting it from the MDSMap. Check for that here after
810 * decoding the data pools.
813 for (auto it
= data_pools
.begin(); it
!= data_pools
.end();) {
814 if (!pool_exists(*it
)) {
815 dout(0) << "removed non-existant data pool " << *it
<< " from MDSMap" << dendl
;
816 it
= data_pools
.erase(it
);
823 void MDSMap::decode(bufferlist::const_iterator
& p
)
825 std::map
<mds_rank_t
,int32_t> inc
; // Legacy field, parse and drop
827 cached_up_features
= 0;
828 DECODE_START_LEGACY_COMPAT_LEN_16(5, 4, 4, p
);
831 decode(last_failure
, p
);
833 decode(session_timeout
, p
);
834 decode(session_autoclose
, p
);
835 decode(max_file_size
, p
);
844 data_pools
.push_back(m
);
850 decode(data_pools
, p
);
854 // kclient skips most of what's below
855 // see fs/ceph/mdsmap.c for current decoding
862 compat
= get_compat_set_base();
868 decode(metadata_pool
, p
);
872 decode(tableserver
, p
);
879 decode(last_failure_osd_epoch
, p
);
882 // previously this was a bool about snaps, not a flag map
885 ever_allowed_features
= flag
? CEPH_MDSMAP_ALLOW_SNAPS
: 0;
887 explicitly_allowed_features
= flag
? CEPH_MDSMAP_ALLOW_SNAPS
: 0;
889 decode(ever_allowed_features
, p
);
890 decode(explicitly_allowed_features
, p
);
893 ever_allowed_features
= 0;
894 explicitly_allowed_features
= 0;
897 decode(inline_data_enabled
, p
);
900 ceph_assert(struct_v
>= 5);
905 // If an MDS has ever been started, epoch will be greater than 1,
906 // assume filesystem is enabled.
909 // Upgrading from a cluster that never used an MDS, switch off
910 // filesystem until it's explicitly enabled.
924 decode(standby_count_wanted
, p
);
928 decode(old_max_mds
, p
);
932 ceph_release_t min_compat_client
;
937 min_compat_client
= ceph_release_t::unknown
;
939 min_compat_client
= ceph_release_t
{static_cast<uint8_t>(r
)};
941 } else if (ev
>= 15) {
942 decode(min_compat_client
, p
);
945 decode(required_client_features
, p
);
947 set_min_compat_client(min_compat_client
);
952 decode(bal_rank_mask
, p
);
956 decode(max_xattr_size
, p
);
959 /* All MDS since at least v14.0.0 understand INLINE */
960 /* TODO: remove after R is released */
961 compat
.incompat
.insert(MDS_FEATURE_INCOMPAT_INLINE
);
963 for (auto& p
: mds_info
) {
964 static const CompatSet empty
;
965 auto& info
= p
.second
;
966 if (empty
.compare(info
.compat
) == 0) {
967 /* bootstrap old compat; mds_info_t::decode does not have access to MDSMap */
968 info
.compat
= compat
;
970 /* All MDS since at least v14.0.0 understand INLINE */
971 /* TODO: remove after R is released */
972 info
.compat
.incompat
.insert(MDS_FEATURE_INCOMPAT_INLINE
);
978 MDSMap::availability_t
MDSMap::is_cluster_available() const
981 // If I'm a client, this means I'm looking at an MDSMap instance
982 // that was never actually initialized from the mons. Client should
984 return TRANSIENT_UNAVAILABLE
;
  // If a rank is marked damaged (unavailable until operator intervenes)
988 if (damaged
.size()) {
989 return STUCK_UNAVAILABLE
;
992 // If no ranks are created (filesystem not initialized)
994 return STUCK_UNAVAILABLE
;
997 for (const auto rank
: in
) {
998 if (up
.count(rank
) && mds_info
.at(up
.at(rank
)).laggy()) {
999 // This might only be transient, but because we can't see
1000 // standbys, we have no way of knowing whether there is a
1001 // standby available to replace the laggy guy.
1002 return STUCK_UNAVAILABLE
;
1006 if (get_num_mds(CEPH_MDS_STATE_ACTIVE
) > 0) {
1007 // Nobody looks stuck, so indicate to client they should go ahead
1008 // and try mounting if anybody is active. This may include e.g.
1009 // one MDS failing over and another active: the client should
1010 // proceed to start talking to the active one and let the
1011 // transiently-unavailable guy catch up later.
1014 // Nothing indicating we were stuck, but nobody active (yet)
1015 //return TRANSIENT_UNAVAILABLE;
1017 // Because we don't have standbys in the MDSMap any more, we can't
1018 // reliably indicate transient vs. stuck, so always say stuck so
1019 // that the client doesn't block.
1020 return STUCK_UNAVAILABLE
;
1024 bool MDSMap::state_transition_valid(DaemonState prev
, DaemonState next
)
1028 if (next
== MDSMap::STATE_DAMAGED
)
1031 if (prev
== MDSMap::STATE_BOOT
) {
1032 return next
== MDSMap::STATE_STANDBY
;
1033 } else if (prev
== MDSMap::STATE_STANDBY
) {
1034 return next
== MDSMap::STATE_STANDBY_REPLAY
||
1035 next
== MDSMap::STATE_REPLAY
||
1036 next
== MDSMap::STATE_CREATING
||
1037 next
== MDSMap::STATE_STARTING
;
1038 } else if (prev
== MDSMap::STATE_CREATING
|| prev
== MDSMap::STATE_STARTING
) {
1039 return next
== MDSMap::STATE_ACTIVE
;
1040 } else if (prev
== MDSMap::STATE_STANDBY_REPLAY
) {
1041 return next
== MDSMap::STATE_REPLAY
;
1042 } else if (prev
== MDSMap::STATE_REPLAY
) {
1043 return next
== MDSMap::STATE_RESOLVE
||
1044 next
== MDSMap::STATE_RECONNECT
;
1045 } else if (prev
>= MDSMap::STATE_RESOLVE
&& prev
< MDSMap::STATE_ACTIVE
) {
1046 // Once I have entered replay, the only allowable transitions are to
    // the next state along in the sequence.
1049 if (prev
== MDSMap::STATE_REJOIN
&&
1050 (next
== MDSMap::STATE_ACTIVE
|| // No need to do client replay
1051 next
== MDSMap::STATE_STOPPED
)) { // no subtrees
1054 return next
== prev
+ 1;
1055 } else if (prev
== MDSMap::STATE_ACTIVE
) {
1056 return next
== MDSMap::STATE_STOPPING
;
1057 } else if (prev
== MDSMap::STATE_STOPPING
) {
1058 return next
== MDSMap::STATE_STOPPED
;
1060 derr
<< __func__
<< ": Unknown prev state "
1061 << ceph_mds_state_name(prev
) << "(" << prev
<< ")" << dendl
;
1066 bool MDSMap::check_health(mds_rank_t standby_daemon_count
)
1068 std::set
<mds_rank_t
> standbys
;
1069 get_standby_replay_mds_set(standbys
);
1070 std::set
<mds_rank_t
> actives
;
1071 get_active_mds_set(actives
);
1072 mds_rank_t standbys_avail
= (mds_rank_t
)standbys
.size()+standby_daemon_count
;
1074 /* If there are standby daemons available/replaying and
1075 * standby_count_wanted is unset (default), then we set it to 1. This will
1076 * happen during health checks by the mons. Also, during initial creation
1077 * of the FS we will have no actives so we don't want to change the default
1080 if (standby_count_wanted
== -1 && actives
.size() > 0 && standbys_avail
> 0) {
1081 set_standby_count_wanted(1);
1087 mds_gid_t
MDSMap::find_mds_gid_by_name(std::string_view s
) const {
1088 for (const auto& [gid
, info
] : mds_info
) {
1089 if (info
.name
== s
) {
1093 return MDS_GID_NONE
;
1096 unsigned MDSMap::get_num_mds(int state
) const {
1098 for (std::map
<mds_gid_t
,mds_info_t
>::const_iterator p
= mds_info
.begin();
1099 p
!= mds_info
.end();
1101 if (p
->second
.state
== state
) ++n
;
1105 void MDSMap::get_up_mds_set(std::set
<mds_rank_t
>& s
) const {
1106 for (std::map
<mds_rank_t
, mds_gid_t
>::const_iterator p
= up
.begin();
1112 uint64_t MDSMap::get_up_features() {
1113 if (!cached_up_features
) {
1115 for (std::map
<mds_rank_t
, mds_gid_t
>::const_iterator p
= up
.begin();
1118 std::map
<mds_gid_t
, mds_info_t
>::const_iterator q
=
1119 mds_info
.find(p
->second
);
1120 ceph_assert(q
!= mds_info
.end());
1122 cached_up_features
= q
->second
.mds_features
;
1125 cached_up_features
&= q
->second
.mds_features
;
1129 return cached_up_features
;
1132 void MDSMap::get_recovery_mds_set(std::set
<mds_rank_t
>& s
) const {
1134 for (const auto& p
: damaged
)
1136 for (const auto& p
: mds_info
)
1137 if (p
.second
.state
>= STATE_REPLAY
&& p
.second
.state
<= STATE_STOPPING
)
1138 s
.insert(p
.second
.rank
);
1141 void MDSMap::get_mds_set_lower_bound(std::set
<mds_rank_t
>& s
, DaemonState first
) const {
1142 for (std::map
<mds_gid_t
, mds_info_t
>::const_iterator p
= mds_info
.begin();
1143 p
!= mds_info
.end();
1145 if (p
->second
.state
>= first
&& p
->second
.state
<= STATE_STOPPING
)
1146 s
.insert(p
->second
.rank
);
1149 void MDSMap::get_mds_set(std::set
<mds_rank_t
>& s
, DaemonState state
) const {
1150 for (std::map
<mds_gid_t
, mds_info_t
>::const_iterator p
= mds_info
.begin();
1151 p
!= mds_info
.end();
1153 if (p
->second
.state
== state
)
1154 s
.insert(p
->second
.rank
);
1157 mds_gid_t
MDSMap::get_standby_replay(mds_rank_t r
) const {
1158 for (auto& [gid
,info
] : mds_info
) {
1159 if (info
.rank
== r
&& info
.state
== STATE_STANDBY_REPLAY
) {
1163 return MDS_GID_NONE
;
1166 bool MDSMap::is_degraded() const {
1167 if (!failed
.empty() || !damaged
.empty())
1169 for (const auto& p
: mds_info
) {
1170 if (p
.second
.is_degraded())
1176 void MDSMap::set_min_compat_client(ceph_release_t version
)
1178 vector
<size_t> bits
;
1180 if (version
>= ceph_release_t::octopus
)
1181 bits
.push_back(CEPHFS_FEATURE_OCTOPUS
);
1182 else if (version
>= ceph_release_t::nautilus
)
1183 bits
.push_back(CEPHFS_FEATURE_NAUTILUS
);
1184 else if (version
>= ceph_release_t::mimic
)
1185 bits
.push_back(CEPHFS_FEATURE_MIMIC
);
1186 else if (version
>= ceph_release_t::luminous
)
1187 bits
.push_back(CEPHFS_FEATURE_LUMINOUS
);
1188 else if (version
>= ceph_release_t::kraken
)
1189 bits
.push_back(CEPHFS_FEATURE_KRAKEN
);
1190 else if (version
>= ceph_release_t::jewel
)
1191 bits
.push_back(CEPHFS_FEATURE_JEWEL
);
1193 std::sort(bits
.begin(), bits
.end());
1194 required_client_features
= feature_bitset_t(bits
);
1197 const std::bitset
<MAX_MDS
>& MDSMap::get_bal_rank_mask_bitset() const {
1198 return bal_rank_mask_bitset
;
1201 void MDSMap::set_bal_rank_mask(std::string val
)
1203 bal_rank_mask
= val
;
1204 dout(10) << "set bal_rank_mask to \"" << bal_rank_mask
<< "\""<< dendl
;
1207 const bool MDSMap::check_special_bal_rank_mask(std::string val
, bal_rank_mask_type_t type
) const
1209 if ((type
== BAL_RANK_MASK_TYPE_ANY
|| type
== BAL_RANK_MASK_TYPE_ALL
) && (val
== "-1" || val
== "all")) {
1212 if ((type
== BAL_RANK_MASK_TYPE_ANY
|| type
== BAL_RANK_MASK_TYPE_NONE
) && (val
== "0x0" || val
== "0")) {
1218 void MDSMap::update_num_mdss_in_rank_mask_bitset()
1222 if (bal_rank_mask
.length() && !check_special_bal_rank_mask(bal_rank_mask
, BAL_RANK_MASK_TYPE_ANY
)) {
1223 std::string bin_string
;
1224 CachedStackStringStream css
;
1226 r
= hex2bin(bal_rank_mask
, bin_string
, MAX_MDS
, *css
);
1228 auto _mds_bal_mask_bitset
= std::bitset
<MAX_MDS
>(bin_string
);
1229 bal_rank_mask_bitset
= _mds_bal_mask_bitset
;
1230 num_mdss_in_rank_mask_bitset
= _mds_bal_mask_bitset
.count();
1232 dout(10) << css
->str() << dendl
;
1237 if (check_special_bal_rank_mask(bal_rank_mask
, BAL_RANK_MASK_TYPE_NONE
)) {
1238 dout(10) << "Balancer is disabled with bal_rank_mask " << bal_rank_mask
<< dendl
;
1239 bal_rank_mask_bitset
.reset();
1240 num_mdss_in_rank_mask_bitset
= 0;
1242 dout(10) << "Balancer distributes mds workloads to all ranks as bal_rank_mask is empty or invalid" << dendl
;
1243 bal_rank_mask_bitset
.set();
1244 num_mdss_in_rank_mask_bitset
= get_max_mds();
1248 dout(10) << "update num_mdss_in_rank_mask_bitset to " << num_mdss_in_rank_mask_bitset
<< dendl
;
1251 int MDSMap::hex2bin(std::string hex_string
, std::string
&bin_string
, unsigned int max_bits
, std::ostream
& ss
) const
1253 static const unsigned int BITS_PER_QUARTET
= CHAR_BIT
/ 2;
1254 static const unsigned int BITS_PER_ULLONG
= sizeof(unsigned long long) * CHAR_BIT
;
1255 static const unsigned int QUARTETS_PER_ULLONG
= BITS_PER_ULLONG
/BITS_PER_QUARTET
;
1256 unsigned int offset
= 0;
1258 std::transform(hex_string
.begin(), hex_string
.end(), hex_string
.begin(), ::tolower
);
1260 if (hex_string
.substr(0, 2) == "0x") {
1264 for (unsigned int i
= offset
; i
< hex_string
.size(); i
+= QUARTETS_PER_ULLONG
) {
1265 unsigned long long value
;
1267 value
= stoull(hex_string
.substr(i
, QUARTETS_PER_ULLONG
), nullptr, 16);
1268 } catch (std::invalid_argument
const& ex
) {
1269 ss
<< "invalid hex value ";
1272 auto bit_str
= std::bitset
<BITS_PER_ULLONG
>(value
);
1273 bin_string
+= bit_str
.to_string();
1276 if (bin_string
.length() > max_bits
) {
1277 ss
<< "a value exceeds max_mds " << max_bits
;
1281 if (bin_string
.find('1') == std::string::npos
) {
1282 ss
<< "at least one rank must be set";
1286 if (bin_string
.length() < max_bits
) {
1287 bin_string
.insert(0, max_bits
- bin_string
.length(), '0');