1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include "common/StackStringStream.h"
22 #include "crimson/common/config_proxy.h"
24 #include "common/config_proxy.h"
26 #include "global/global_context.h"
27 #include "mon/health_check.h"
29 using std::stringstream
;
31 void Filesystem::dump(Formatter
*f
) const
33 f
->open_object_section("mdsmap");
36 f
->dump_int("id", fscid
);
39 void FSMap::dump(Formatter
*f
) const
41 f
->dump_int("epoch", epoch
);
42 // Use 'default' naming to match 'set-default' CLI
43 f
->dump_int("default_fscid", legacy_client_fscid
);
45 f
->open_object_section("compat");
49 f
->open_object_section("feature_flags");
50 f
->dump_bool("enable_multiple", enable_multiple
);
51 f
->dump_bool("ever_enabled_multiple", ever_enabled_multiple
);
54 f
->open_array_section("standbys");
55 for (const auto& [gid
, info
] : standby_daemons
) {
56 f
->open_object_section("info");
58 f
->dump_int("epoch", standby_epochs
.at(gid
));
63 f
->open_array_section("filesystems");
64 for (const auto &fs
: filesystems
) {
65 f
->open_object_section("filesystem");
72 FSMap
&FSMap::operator=(const FSMap
&rhs
)
75 next_filesystem_id
= rhs
.next_filesystem_id
;
76 legacy_client_fscid
= rhs
.legacy_client_fscid
;
78 enable_multiple
= rhs
.enable_multiple
;
79 mds_roles
= rhs
.mds_roles
;
80 standby_daemons
= rhs
.standby_daemons
;
81 standby_epochs
= rhs
.standby_epochs
;
84 for (const auto &i
: rhs
.filesystems
) {
85 const auto &fs
= i
.second
;
86 filesystems
[fs
->fscid
] = std::make_shared
<Filesystem
>(*fs
);
92 void FSMap::generate_test_instances(std::list
<FSMap
*>& ls
)
94 FSMap
*m
= new FSMap();
96 std::list
<MDSMap
*> mds_map_instances
;
97 MDSMap::generate_test_instances(mds_map_instances
);
100 for (auto i
: mds_map_instances
) {
101 auto fs
= Filesystem::create();
105 m
->filesystems
[fs
->fscid
] = fs
;
107 mds_map_instances
.clear();
112 void FSMap::print(ostream
& out
) const
114 out
<< "e" << epoch
<< std::endl
;
115 out
<< "enable_multiple, ever_enabled_multiple: " << enable_multiple
<< ","
116 << ever_enabled_multiple
<< std::endl
;
117 out
<< "compat: " << compat
<< std::endl
;
118 out
<< "legacy client fscid: " << legacy_client_fscid
<< std::endl
;
119 out
<< " " << std::endl
;
121 if (filesystems
.empty()) {
122 out
<< "No filesystems configured" << std::endl
;
125 for (const auto& p
: filesystems
) {
126 p
.second
->print(out
);
127 out
<< " " << std::endl
<< " " << std::endl
; // Space out a bit
130 if (!standby_daemons
.empty()) {
131 out
<< "Standby daemons:" << std::endl
<< " " << std::endl
;
134 for (const auto& p
: standby_daemons
) {
135 out
<< p
.second
<< std::endl
;
139 void FSMap::print_summary(Formatter
*f
, ostream
*out
) const
142 f
->dump_unsigned("epoch", get_epoch());
143 for (const auto &p
: filesystems
) {
145 f
->dump_unsigned("id", fs
->fscid
);
146 f
->dump_unsigned("up", fs
->mds_map
.up
.size());
147 f
->dump_unsigned("in", fs
->mds_map
.in
.size());
148 f
->dump_unsigned("max", fs
->mds_map
.max_mds
);
151 auto count
= filesystems
.size();
154 for (const auto& p
: filesystems
) {
155 const auto& fs
= p
.second
;
159 if (fs
->mds_map
.is_degraded()) {
160 *out
<< fs
->mds_map
.fs_name
<< ":" << fs
->mds_map
.up
.size() << "/" << fs
->mds_map
.in
.size();
162 *out
<< fs
->mds_map
.fs_name
<< ":" << fs
->mds_map
.in
.size();
167 *out
<< count
<< " fs";
168 unsigned degraded
= 0;
169 CachedStackStringStream css
;
170 *css
<< " (degraded: ";
171 for (const auto& p
: filesystems
) {
172 const auto& fs
= p
.second
;
173 if (fs
->mds_map
.is_degraded()) {
176 *css
<< fs
->mds_map
.fs_name
<< ":" << fs
->mds_map
.up
.size() << "/" << fs
->mds_map
.in
.size();
185 *out
<< " (degraded: " << degraded
<< " fs)";
192 f
->open_array_section("by_rank");
195 std::map
<MDSMap::DaemonState
,unsigned> by_state
;
196 std::map
<mds_role_t
, std::pair
<MDSMap::DaemonState
, std::string
>> by_rank
;
197 by_state
[MDSMap::DaemonState::STATE_STANDBY
] = standby_daemons
.size();
198 for (const auto& [gid
, fscid
] : mds_roles
) {
199 if (fscid
== FS_CLUSTER_ID_NONE
)
202 const auto& info
= filesystems
.at(fscid
)->mds_map
.get_info_gid(gid
);
203 auto s
= std::string(ceph_mds_state_name(info
.state
));
205 s
+= "(laggy or crashed)";
209 f
->open_object_section("mds");
210 f
->dump_unsigned("filesystem_id", fscid
);
211 f
->dump_unsigned("rank", info
.rank
);
212 f
->dump_string("name", info
.name
);
213 f
->dump_string("status", s
);
214 f
->dump_unsigned("gid", gid
);
216 } else if (info
.state
!= MDSMap::DaemonState::STATE_STANDBY_REPLAY
) {
217 by_rank
[mds_role_t(fscid
, info
.rank
)] = std::make_pair(info
.state
, info
.name
+ "=" + s
);
219 by_state
[info
.state
]++;
225 if (0 < by_rank
.size() && by_rank
.size() < 5) {
226 if (filesystems
.size() > 1) {
227 // Disambiguate filesystems
228 std::map
<std::string
, std::string
> pretty
;
229 for (const auto& [role
,status
] : by_rank
) {
230 const auto &fs_name
= filesystems
.at(role
.fscid
)->mds_map
.fs_name
;
231 CachedStackStringStream css
;
232 *css
<< fs_name
<< ":" << role
.rank
;
233 pretty
.emplace(std::piecewise_construct
, std::forward_as_tuple(css
->strv()), std::forward_as_tuple(status
.second
));
234 --by_state
[status
.first
]; /* already printed! */
236 *out
<< " " << pretty
;
238 // Omit FSCID in output when only one filesystem exists
239 std::map
<mds_rank_t
, std::string
> shortened
;
240 for (const auto& [role
,status
] : by_rank
) {
241 shortened
[role
.rank
] = status
.second
;
242 --by_state
[status
.first
]; /* already printed! */
244 *out
<< " " << shortened
;
247 for (const auto& [state
, count
] : by_state
) {
249 auto s
= std::string_view(ceph_mds_state_name(state
));
250 *out
<< " " << count
<< " " << s
;
256 const auto state
= MDSMap::DaemonState::STATE_STANDBY
;
257 auto&& name
= ceph_mds_state_name(state
);
258 auto count
= standby_daemons
.size();
259 f
->dump_unsigned(name
, count
);
264 for (const auto& p
: filesystems
) {
266 failed
+= fs
->mds_map
.failed
.size();
267 damaged
+= fs
->mds_map
.damaged
.size();
272 f
->dump_unsigned("failed", failed
);
274 *out
<< ", " << failed
<< " failed";
280 f
->dump_unsigned("damaged", damaged
);
282 *out
<< ", " << damaged
<< " damaged";
285 //if (stopped.size())
286 //out << ", " << stopped.size() << " stopped";
289 mds_gid_t
Filesystem::get_standby_replay(mds_gid_t who
) const
291 for (const auto &i
: mds_map
.mds_info
) {
292 const auto &info
= i
.second
;
293 if (info
.state
== MDSMap::STATE_STANDBY_REPLAY
294 && info
.rank
== mds_map
.mds_info
.at(who
).rank
) {
295 return info
.global_id
;
301 Filesystem::ref
FSMap::create_filesystem(std::string_view name
,
302 int64_t metadata_pool
, int64_t data_pool
, uint64_t features
)
304 auto fs
= Filesystem::create();
305 fs
->mds_map
.epoch
= epoch
;
306 fs
->mds_map
.fs_name
= name
;
307 fs
->mds_map
.data_pools
.push_back(data_pool
);
308 fs
->mds_map
.metadata_pool
= metadata_pool
;
309 fs
->mds_map
.cas_pool
= -1;
310 fs
->mds_map
.compat
= compat
;
311 fs
->mds_map
.created
= ceph_clock_now();
312 fs
->mds_map
.modified
= ceph_clock_now();
313 fs
->mds_map
.enabled
= true;
314 fs
->fscid
= next_filesystem_id
++;
315 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
316 // have initialized next_filesystem_id such that it's never used here.
317 ceph_assert(fs
->fscid
!= FS_CLUSTER_ID_ANONYMOUS
);
318 filesystems
[fs
->fscid
] = fs
;
320 // Created first filesystem? Set it as the one
321 // for legacy clients to use
322 if (filesystems
.size() == 1) {
323 legacy_client_fscid
= fs
->fscid
;
329 Filesystem::const_ref
FSMap::get_filesystem(std::string_view name
) const
331 for (const auto& p
: filesystems
) {
332 if (p
.second
->mds_map
.fs_name
== name
) {
339 std::vector
<Filesystem::const_ref
> FSMap::get_filesystems(void) const
341 std::vector
<Filesystem::const_ref
> ret
;
342 for (const auto& p
: filesystems
) {
343 ret
.push_back(p
.second
);
348 void FSMap::reset_filesystem(fs_cluster_id_t fscid
)
350 auto fs
= get_filesystem(fscid
);
351 auto new_fs
= Filesystem::create();
353 // Populate rank 0 as existing (so don't go into CREATING)
354 // but failed (so that next available MDS is assigned the rank)
355 new_fs
->mds_map
.in
.insert(mds_rank_t(0));
356 new_fs
->mds_map
.failed
.insert(mds_rank_t(0));
358 // Carry forward what makes sense
359 new_fs
->fscid
= fs
->fscid
;
360 new_fs
->mds_map
.inline_data_enabled
= fs
->mds_map
.inline_data_enabled
;
361 new_fs
->mds_map
.data_pools
= fs
->mds_map
.data_pools
;
362 new_fs
->mds_map
.metadata_pool
= fs
->mds_map
.metadata_pool
;
363 new_fs
->mds_map
.cas_pool
= fs
->mds_map
.cas_pool
;
364 new_fs
->mds_map
.fs_name
= fs
->mds_map
.fs_name
;
365 new_fs
->mds_map
.compat
= compat
;
366 new_fs
->mds_map
.created
= ceph_clock_now();
367 new_fs
->mds_map
.modified
= ceph_clock_now();
368 new_fs
->mds_map
.standby_count_wanted
= fs
->mds_map
.standby_count_wanted
;
369 new_fs
->mds_map
.enabled
= true;
371 // Remember mds ranks that have ever started. (They should load old inotable
372 // instead of creating new one if they start again.)
373 new_fs
->mds_map
.stopped
.insert(fs
->mds_map
.in
.begin(), fs
->mds_map
.in
.end());
374 new_fs
->mds_map
.stopped
.insert(fs
->mds_map
.stopped
.begin(), fs
->mds_map
.stopped
.end());
375 new_fs
->mds_map
.stopped
.erase(mds_rank_t(0));
377 // Persist the new FSMap
378 filesystems
[new_fs
->fscid
] = new_fs
;
381 void FSMap::get_health(list
<pair
<health_status_t
,string
> >& summary
,
382 list
<pair
<health_status_t
,string
> > *detail
) const
384 mds_rank_t standby_count_wanted
= 0;
385 for (const auto &i
: filesystems
) {
386 const auto &fs
= i
.second
;
388 // TODO: move get_health up into here so that we can qualify
389 // all the messages with what filesystem they're talking about
390 fs
->mds_map
.get_health(summary
, detail
);
392 standby_count_wanted
= std::max(standby_count_wanted
, fs
->mds_map
.get_standby_count_wanted((mds_rank_t
)standby_daemons
.size()));
395 if (standby_count_wanted
) {
396 std::ostringstream oss
;
397 oss
<< "insufficient standby daemons available: have " << standby_daemons
.size() << "; want " << standby_count_wanted
<< " more";
398 summary
.push_back(make_pair(HEALTH_WARN
, oss
.str()));
402 bool FSMap::check_health(void)
404 bool changed
= false;
405 for (auto &i
: filesystems
) {
406 changed
|= i
.second
->mds_map
.check_health((mds_rank_t
)standby_daemons
.size());
411 void FSMap::get_health_checks(health_check_map_t
*checks
) const
413 mds_rank_t standby_count_wanted
= 0;
414 for (const auto &i
: filesystems
) {
415 const auto &fs
= i
.second
;
416 health_check_map_t fschecks
;
418 fs
->mds_map
.get_health_checks(&fschecks
);
420 // Some of the failed ranks might be transient (i.e. there are standbys
421 // ready to replace them). We will report only on "stuck" failed, i.e.
422 // ranks which are failed and have no standby replacement available.
423 std::set
<mds_rank_t
> stuck_failed
;
425 for (const auto &rank
: fs
->mds_map
.failed
) {
426 auto rep_info
= find_replacement_for({fs
->fscid
, rank
});
428 stuck_failed
.insert(rank
);
432 // FS_WITH_FAILED_MDS
433 if (!stuck_failed
.empty()) {
434 health_check_t
& fscheck
= checks
->get_or_add(
435 "FS_WITH_FAILED_MDS", HEALTH_WARN
,
436 "%num% filesystem%plurals% %hasorhave% a failed mds daemon", 1);
438 ss
<< "fs " << fs
->mds_map
.fs_name
<< " has " << stuck_failed
.size()
439 << " failed mds" << (stuck_failed
.size() > 1 ? "s" : "");
440 fscheck
.detail
.push_back(ss
.str()); }
442 checks
->merge(fschecks
);
443 standby_count_wanted
= std::max(
444 standby_count_wanted
,
445 fs
->mds_map
.get_standby_count_wanted((mds_rank_t
)standby_daemons
.size()));
448 // MDS_INSUFFICIENT_STANDBY
449 if (standby_count_wanted
) {
450 std::ostringstream oss
, dss
;
451 oss
<< "insufficient standby MDS daemons available";
452 auto& d
= checks
->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN
, oss
.str(), 1);
453 dss
<< "have " << standby_daemons
.size() << "; want " << standby_count_wanted
455 d
.detail
.push_back(dss
.str());
459 void FSMap::update_compat(const CompatSet
&c
)
461 // We could do something more complicated here to enable
462 // different filesystems to be served by different MDS versions,
463 // but this is a lot simpler because it doesn't require us to
464 // track the compat versions for standby daemons.
466 for (const auto &i
: filesystems
) {
467 MDSMap
&mds_map
= i
.second
->mds_map
;
469 mds_map
.epoch
= epoch
;
473 void FSMap::encode(bufferlist
& bl
, uint64_t features
) const
475 ENCODE_START(7, 6, bl
);
477 encode(next_filesystem_id
, bl
);
478 encode(legacy_client_fscid
, bl
);
480 encode(enable_multiple
, bl
);
482 std::vector
<Filesystem::ref
> v
;
483 v
.reserve(filesystems
.size());
484 for (auto& p
: filesystems
) v
.emplace_back(p
.second
);
485 encode(v
, bl
, features
);
487 encode(mds_roles
, bl
);
488 encode(standby_daemons
, bl
, features
);
489 encode(standby_epochs
, bl
);
490 encode(ever_enabled_multiple
, bl
);
494 void FSMap::decode(bufferlist::const_iterator
& p
)
496 // The highest MDSMap encoding version before we changed the
497 // MDSMonitor to store an FSMap instead of an MDSMap was
498 // 5, so anything older than 6 is decoded as an MDSMap,
499 // and anything newer is decoded as an FSMap.
500 DECODE_START_LEGACY_COMPAT_LEN_16(8, 4, 4, p
);
502 // Because the mon used to store an MDSMap where we now
503 // store an FSMap, FSMap knows how to decode the legacy
504 // MDSMap format (it never needs to encode it though).
505 MDSMap legacy_mds_map
;
507 // Decoding an MDSMap (upgrade)
509 decode(legacy_mds_map
.flags
, p
);
510 decode(legacy_mds_map
.last_failure
, p
);
511 decode(legacy_mds_map
.root
, p
);
512 decode(legacy_mds_map
.session_timeout
, p
);
513 decode(legacy_mds_map
.session_autoclose
, p
);
514 decode(legacy_mds_map
.max_file_size
, p
);
515 decode(legacy_mds_map
.max_mds
, p
);
516 decode(legacy_mds_map
.mds_info
, p
);
523 legacy_mds_map
.data_pools
.push_back(m
);
527 legacy_mds_map
.cas_pool
= s
;
529 decode(legacy_mds_map
.data_pools
, p
);
530 decode(legacy_mds_map
.cas_pool
, p
);
533 // kclient ignores everything from here
538 decode(legacy_mds_map
.compat
, p
);
540 legacy_mds_map
.compat
= MDSMap::get_compat_set_base();
544 legacy_mds_map
.metadata_pool
= n
;
546 decode(legacy_mds_map
.metadata_pool
, p
);
548 decode(legacy_mds_map
.created
, p
);
549 decode(legacy_mds_map
.modified
, p
);
550 decode(legacy_mds_map
.tableserver
, p
);
551 decode(legacy_mds_map
.in
, p
);
552 std::map
<mds_rank_t
,int32_t> inc
; // Legacy field, parse and drop
554 decode(legacy_mds_map
.up
, p
);
555 decode(legacy_mds_map
.failed
, p
);
556 decode(legacy_mds_map
.stopped
, p
);
558 decode(legacy_mds_map
.last_failure_osd_epoch
, p
);
561 // previously this was a bool about snaps, not a flag map
564 legacy_mds_map
.ever_allowed_features
= flag
?
565 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
567 legacy_mds_map
.explicitly_allowed_features
= flag
?
568 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
570 decode(legacy_mds_map
.ever_allowed_features
, p
);
571 decode(legacy_mds_map
.explicitly_allowed_features
, p
);
574 legacy_mds_map
.ever_allowed_features
= 0;
575 legacy_mds_map
.explicitly_allowed_features
= 0;
578 decode(legacy_mds_map
.inline_data_enabled
, p
);
581 ceph_assert(struct_v
>= 5);
582 decode(legacy_mds_map
.enabled
, p
);
583 decode(legacy_mds_map
.fs_name
, p
);
585 legacy_mds_map
.fs_name
= "default";
587 // If an MDS has ever been started, epoch will be greater than 1,
588 // assume filesystem is enabled.
589 legacy_mds_map
.enabled
= true;
591 // Upgrading from a cluster that never used an MDS, switch off
592 // filesystem until it's explicitly enabled.
593 legacy_mds_map
.enabled
= false;
598 decode(legacy_mds_map
.damaged
, p
);
601 // We're upgrading, populate filesystems from the legacy fields
603 standby_daemons
.clear();
604 standby_epochs
.clear();
606 compat
= legacy_mds_map
.compat
;
607 enable_multiple
= false;
609 // Synthesise a Filesystem from legacy_mds_map, if enabled
610 if (legacy_mds_map
.enabled
) {
611 // Construct a Filesystem from the legacy MDSMap
612 auto migrate_fs
= Filesystem::create();
613 migrate_fs
->fscid
= FS_CLUSTER_ID_ANONYMOUS
;
614 migrate_fs
->mds_map
= legacy_mds_map
;
615 migrate_fs
->mds_map
.epoch
= epoch
;
616 filesystems
[migrate_fs
->fscid
] = migrate_fs
;
618 // List of GIDs that had invalid states
619 std::set
<mds_gid_t
> drop_gids
;
621 // Construct mds_roles, standby_daemons, and remove
622 // standbys from the MDSMap in the Filesystem.
623 for (const auto& [gid
, info
] : migrate_fs
->mds_map
.mds_info
) {
624 if (info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
625 /* drop any legacy standby-replay daemons */
626 drop_gids
.insert(gid
);
627 } else if (info
.rank
== MDS_RANK_NONE
) {
628 if (info
.state
!= MDSMap::STATE_STANDBY
) {
629 // Old MDSMaps can have down:dne here, which
630 // is invalid in an FSMap (#17837)
631 drop_gids
.insert(gid
);
633 insert(info
); // into standby_daemons
636 mds_roles
[gid
] = migrate_fs
->fscid
;
639 for (const auto &p
: standby_daemons
) {
640 // Erase from this Filesystem's MDSMap, because it has
641 // been copied into FSMap::Standby_daemons above
642 migrate_fs
->mds_map
.mds_info
.erase(p
.first
);
644 for (const auto &gid
: drop_gids
) {
645 // Throw away all info for this MDS because it was identified
646 // as having invalid state above.
647 migrate_fs
->mds_map
.mds_info
.erase(gid
);
650 legacy_client_fscid
= migrate_fs
->fscid
;
652 legacy_client_fscid
= FS_CLUSTER_ID_NONE
;
656 decode(next_filesystem_id
, p
);
657 decode(legacy_client_fscid
, p
);
659 decode(enable_multiple
, p
);
661 std::vector
<Filesystem::ref
> v
;
664 for (auto& ref
: v
) {
665 auto em
= filesystems
.emplace(std::piecewise_construct
, std::forward_as_tuple(ref
->fscid
), std::forward_as_tuple(std::move(ref
)));
666 ceph_assert(em
.second
);
669 decode(mds_roles
, p
);
670 decode(standby_daemons
, p
);
671 decode(standby_epochs
, p
);
673 decode(ever_enabled_multiple
, p
);
680 void FSMap::sanitize(const std::function
<bool(int64_t pool
)>& pool_exists
)
682 for (auto &fs
: filesystems
) {
683 fs
.second
->mds_map
.sanitize(pool_exists
);
687 void Filesystem::encode(bufferlist
& bl
, uint64_t features
) const
689 ENCODE_START(1, 1, bl
);
691 bufferlist mdsmap_bl
;
692 mds_map
.encode(mdsmap_bl
, features
);
693 encode(mdsmap_bl
, bl
);
697 void Filesystem::decode(bufferlist::const_iterator
& p
)
701 bufferlist mdsmap_bl
;
702 decode(mdsmap_bl
, p
);
703 auto mdsmap_bl_iter
= mdsmap_bl
.cbegin();
704 mds_map
.decode(mdsmap_bl_iter
);
708 int FSMap::parse_filesystem(
709 std::string_view ns_str
,
710 Filesystem::const_ref
* result
714 std::string
s(ns_str
);
715 fs_cluster_id_t fscid
= strict_strtol(s
.c_str(), 10, &ns_err
);
716 if (!ns_err
.empty() || filesystems
.count(fscid
) == 0) {
717 for (auto &fs
: filesystems
) {
718 if (fs
.second
->mds_map
.fs_name
== s
) {
719 *result
= std::const_pointer_cast
<const Filesystem
>(fs
.second
);
725 *result
= get_filesystem(fscid
);
730 void Filesystem::print(std::ostream
&out
) const
732 out
<< "Filesystem '" << mds_map
.fs_name
733 << "' (" << fscid
<< ")" << std::endl
;
737 bool FSMap::is_any_degraded() const
739 for (auto& i
: filesystems
) {
740 if (i
.second
->mds_map
.is_degraded()) {
747 std::map
<mds_gid_t
, MDSMap::mds_info_t
> FSMap::get_mds_info() const
749 std::map
<mds_gid_t
, mds_info_t
> result
;
750 for (const auto &i
: standby_daemons
) {
751 result
[i
.first
] = i
.second
;
754 for (const auto &i
: filesystems
) {
755 const auto &fs_info
= i
.second
->mds_map
.get_mds_info();
756 for (const auto &j
: fs_info
) {
757 result
[j
.first
] = j
.second
;
764 const MDSMap::mds_info_t
* FSMap::get_available_standby(fs_cluster_id_t fscid
) const
766 const mds_info_t
* who
= nullptr;
767 for (const auto& [gid
, info
] : standby_daemons
) {
768 ceph_assert(info
.rank
== MDS_RANK_NONE
);
769 ceph_assert(info
.state
== MDSMap::STATE_STANDBY
);
771 if (info
.laggy() || info
.is_frozen()) {
775 if (info
.join_fscid
== fscid
) {
778 } else if (info
.join_fscid
== FS_CLUSTER_ID_NONE
) {
779 who
= &info
; /* vanilla standby */
780 } else if (who
== nullptr) {
781 who
= &info
; /* standby for another fs, last resort */
787 mds_gid_t
FSMap::find_mds_gid_by_name(std::string_view s
) const
789 const auto info
= get_mds_info();
790 for (const auto &p
: info
) {
791 if (p
.second
.name
== s
) {
798 const MDSMap::mds_info_t
* FSMap::find_by_name(std::string_view name
) const
800 std::map
<mds_gid_t
, mds_info_t
> result
;
801 for (const auto &i
: standby_daemons
) {
802 if (i
.second
.name
== name
) {
807 for (const auto &i
: filesystems
) {
808 const auto &fs_info
= i
.second
->mds_map
.get_mds_info();
809 for (const auto &j
: fs_info
) {
810 if (j
.second
.name
== name
) {
819 const MDSMap::mds_info_t
* FSMap::find_replacement_for(mds_role_t role
) const
821 auto&& fs
= get_filesystem(role
.fscid
);
823 // First see if we have a STANDBY_REPLAY
824 for (const auto& [gid
, info
] : fs
->mds_map
.mds_info
) {
825 if (info
.rank
== role
.rank
&& info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
826 if (info
.is_frozen()) {
827 /* the standby-replay is frozen, do nothing! */
835 return get_available_standby(role
.fscid
);
838 void FSMap::sanity() const
840 if (legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
841 ceph_assert(filesystems
.count(legacy_client_fscid
) == 1);
844 for (const auto &i
: filesystems
) {
846 ceph_assert(fs
->mds_map
.compat
.compare(compat
) == 0);
847 ceph_assert(fs
->fscid
== i
.first
);
848 for (const auto &j
: fs
->mds_map
.mds_info
) {
849 ceph_assert(j
.second
.rank
!= MDS_RANK_NONE
);
850 ceph_assert(mds_roles
.count(j
.first
) == 1);
851 ceph_assert(standby_daemons
.count(j
.first
) == 0);
852 ceph_assert(standby_epochs
.count(j
.first
) == 0);
853 ceph_assert(mds_roles
.at(j
.first
) == i
.first
);
854 if (j
.second
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
855 ceph_assert(fs
->mds_map
.up
.at(j
.second
.rank
) == j
.first
);
856 ceph_assert(fs
->mds_map
.failed
.count(j
.second
.rank
) == 0);
857 ceph_assert(fs
->mds_map
.damaged
.count(j
.second
.rank
) == 0);
861 for (const auto &j
: fs
->mds_map
.up
) {
862 mds_rank_t rank
= j
.first
;
863 ceph_assert(fs
->mds_map
.in
.count(rank
) == 1);
864 mds_gid_t gid
= j
.second
;
865 ceph_assert(fs
->mds_map
.mds_info
.count(gid
) == 1);
869 for (const auto &i
: standby_daemons
) {
870 ceph_assert(i
.second
.state
== MDSMap::STATE_STANDBY
);
871 ceph_assert(i
.second
.rank
== MDS_RANK_NONE
);
872 ceph_assert(i
.second
.global_id
== i
.first
);
873 ceph_assert(standby_epochs
.count(i
.first
) == 1);
874 ceph_assert(mds_roles
.count(i
.first
) == 1);
875 ceph_assert(mds_roles
.at(i
.first
) == FS_CLUSTER_ID_NONE
);
878 for (const auto &i
: standby_epochs
) {
879 ceph_assert(standby_daemons
.count(i
.first
) == 1);
882 for (const auto &i
: mds_roles
) {
883 if (i
.second
== FS_CLUSTER_ID_NONE
) {
884 ceph_assert(standby_daemons
.count(i
.first
) == 1);
886 ceph_assert(filesystems
.count(i
.second
) == 1);
887 ceph_assert(filesystems
.at(i
.second
)->mds_map
.mds_info
.count(i
.first
) == 1);
893 mds_gid_t standby_gid
,
894 Filesystem
& filesystem
,
895 mds_rank_t assigned_rank
)
897 ceph_assert(gid_exists(standby_gid
));
898 bool is_standby_replay
= mds_roles
.at(standby_gid
) != FS_CLUSTER_ID_NONE
;
899 if (!is_standby_replay
) {
900 ceph_assert(standby_daemons
.count(standby_gid
));
901 ceph_assert(standby_daemons
.at(standby_gid
).state
== MDSMap::STATE_STANDBY
);
904 MDSMap
&mds_map
= filesystem
.mds_map
;
906 // Insert daemon state to Filesystem
907 if (!is_standby_replay
) {
908 mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
910 ceph_assert(mds_map
.mds_info
.count(standby_gid
));
911 ceph_assert(mds_map
.mds_info
.at(standby_gid
).state
== MDSMap::STATE_STANDBY_REPLAY
);
912 ceph_assert(mds_map
.mds_info
.at(standby_gid
).rank
== assigned_rank
);
914 auto& info
= mds_map
.mds_info
[standby_gid
];
916 if (mds_map
.stopped
.erase(assigned_rank
)) {
917 // The cluster is being expanded with a stopped rank
918 info
.state
= MDSMap::STATE_STARTING
;
919 } else if (!mds_map
.is_in(assigned_rank
)) {
920 // The cluster is being expanded with a new rank
921 info
.state
= MDSMap::STATE_CREATING
;
923 // An existing rank is being assigned to a replacement
924 info
.state
= MDSMap::STATE_REPLAY
;
925 mds_map
.failed
.erase(assigned_rank
);
927 info
.rank
= assigned_rank
;
929 mds_roles
[standby_gid
] = filesystem
.fscid
;
931 // Update the rank state in Filesystem
932 mds_map
.in
.insert(assigned_rank
);
933 mds_map
.up
[assigned_rank
] = standby_gid
;
935 // Remove from the list of standbys
936 if (!is_standby_replay
) {
937 standby_daemons
.erase(standby_gid
);
938 standby_epochs
.erase(standby_gid
);
941 // Indicate that Filesystem has been modified
942 mds_map
.epoch
= epoch
;
945 void FSMap::assign_standby_replay(
946 const mds_gid_t standby_gid
,
947 const fs_cluster_id_t leader_ns
,
948 const mds_rank_t leader_rank
)
950 ceph_assert(mds_roles
.at(standby_gid
) == FS_CLUSTER_ID_NONE
);
951 ceph_assert(gid_exists(standby_gid
));
952 ceph_assert(!gid_has_rank(standby_gid
));
953 ceph_assert(standby_daemons
.count(standby_gid
));
955 // Insert to the filesystem
956 auto fs
= filesystems
.at(leader_ns
);
957 fs
->mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
958 fs
->mds_map
.mds_info
[standby_gid
].rank
= leader_rank
;
959 fs
->mds_map
.mds_info
[standby_gid
].state
= MDSMap::STATE_STANDBY_REPLAY
;
960 mds_roles
[standby_gid
] = leader_ns
;
962 // Remove from the list of standbys
963 standby_daemons
.erase(standby_gid
);
964 standby_epochs
.erase(standby_gid
);
966 // Indicate that Filesystem has been modified
967 fs
->mds_map
.epoch
= epoch
;
970 void FSMap::erase(mds_gid_t who
, epoch_t blacklist_epoch
)
972 if (mds_roles
.at(who
) == FS_CLUSTER_ID_NONE
) {
973 standby_daemons
.erase(who
);
974 standby_epochs
.erase(who
);
976 auto &fs
= filesystems
.at(mds_roles
.at(who
));
977 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
978 if (info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
979 if (info
.state
== MDSMap::STATE_CREATING
) {
980 // If this gid didn't make it past CREATING, then forget
981 // the rank ever existed so that next time it's handed out
982 // to a gid it'll go back into CREATING.
983 fs
->mds_map
.in
.erase(info
.rank
);
985 // Put this rank into the failed list so that the next available
986 // STANDBY will pick it up.
987 fs
->mds_map
.failed
.insert(info
.rank
);
989 ceph_assert(fs
->mds_map
.up
.at(info
.rank
) == info
.global_id
);
990 fs
->mds_map
.up
.erase(info
.rank
);
992 fs
->mds_map
.mds_info
.erase(who
);
993 fs
->mds_map
.last_failure_osd_epoch
= blacklist_epoch
;
994 fs
->mds_map
.epoch
= epoch
;
997 mds_roles
.erase(who
);
1000 void FSMap::damaged(mds_gid_t who
, epoch_t blacklist_epoch
)
1002 ceph_assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
1003 auto fs
= filesystems
.at(mds_roles
.at(who
));
1004 mds_rank_t rank
= fs
->mds_map
.mds_info
[who
].rank
;
1006 erase(who
, blacklist_epoch
);
1007 fs
->mds_map
.failed
.erase(rank
);
1008 fs
->mds_map
.damaged
.insert(rank
);
1010 ceph_assert(fs
->mds_map
.epoch
== epoch
);
1014 * Update to indicate that the rank `rank` is to be removed
1015 * from the damaged list of the filesystem `fscid`
1017 bool FSMap::undamaged(const fs_cluster_id_t fscid
, const mds_rank_t rank
)
1019 auto fs
= filesystems
.at(fscid
);
1021 if (fs
->mds_map
.damaged
.erase(rank
)) {
1022 fs
->mds_map
.failed
.insert(rank
);
1023 fs
->mds_map
.epoch
= epoch
;
1030 void FSMap::insert(const MDSMap::mds_info_t
&new_info
)
1032 ceph_assert(new_info
.state
== MDSMap::STATE_STANDBY
);
1033 ceph_assert(new_info
.rank
== MDS_RANK_NONE
);
1034 mds_roles
[new_info
.global_id
] = FS_CLUSTER_ID_NONE
;
1035 standby_daemons
[new_info
.global_id
] = new_info
;
1036 standby_epochs
[new_info
.global_id
] = epoch
;
1039 std::vector
<mds_gid_t
> FSMap::stop(mds_gid_t who
)
1041 ceph_assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
1042 auto fs
= filesystems
.at(mds_roles
.at(who
));
1043 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
1044 fs
->mds_map
.up
.erase(info
.rank
);
1045 fs
->mds_map
.in
.erase(info
.rank
);
1046 fs
->mds_map
.stopped
.insert(info
.rank
);
1048 // Also drop any standby replays that were following this rank
1049 std::vector
<mds_gid_t
> standbys
;
1050 for (const auto &i
: fs
->mds_map
.mds_info
) {
1051 const auto &other_gid
= i
.first
;
1052 const auto &other_info
= i
.second
;
1053 if (other_info
.rank
== info
.rank
1054 && other_info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
1055 standbys
.push_back(other_gid
);
1056 erase(other_gid
, 0);
1060 fs
->mds_map
.mds_info
.erase(who
);
1061 mds_roles
.erase(who
);
1063 fs
->mds_map
.epoch
= epoch
;
1070 * Given one of the following forms:
1075 * Parse into a mds_role_t. The rank-only form is only valid
1076 * if legacy_client_fscid is set.
1078 int FSMap::parse_role(
1079 std::string_view role_str
,
1081 std::ostream
&ss
) const
1083 size_t colon_pos
= role_str
.find(":");
1085 Filesystem::const_ref fs
;
1086 if (colon_pos
== std::string::npos
) {
1087 if (legacy_client_fscid
== FS_CLUSTER_ID_NONE
) {
1088 ss
<< "No filesystem selected";
1091 fs
= get_filesystem(legacy_client_fscid
);
1094 if (parse_filesystem(role_str
.substr(0, colon_pos
), &fs
) < 0) {
1095 ss
<< "Invalid filesystem";
1098 rank_pos
= colon_pos
+1;
1103 std::string
rank_str(role_str
.substr(rank_pos
));
1104 long rank_i
= strict_strtol(rank_str
.c_str(), 10, &err
);
1105 if (rank_i
< 0 || !err
.empty()) {
1106 ss
<< "Invalid rank '" << rank_str
<< "'";
1112 if (fs
->mds_map
.in
.count(rank
) == 0) {
1113 ss
<< "Rank '" << rank
<< "' not found";
1117 *role
= {fs
->fscid
, rank
};
1122 bool FSMap::pool_in_use(int64_t poolid
) const
1124 for (auto const &i
: filesystems
) {
1125 if (i
.second
->mds_map
.is_data_pool(poolid
)
1126 || i
.second
->mds_map
.metadata_pool
== poolid
) {
1133 void FSMap::erase_filesystem(fs_cluster_id_t fscid
)
1135 filesystems
.erase(fscid
);
1136 for (auto& [gid
, info
] : standby_daemons
) {
1137 if (info
.join_fscid
== fscid
) {
1138 modify_daemon(gid
, [](auto& info
) {
1139 info
.join_fscid
= FS_CLUSTER_ID_NONE
;
1143 for (auto& p
: filesystems
) {
1144 for (auto& [gid
, info
] : p
.second
->mds_map
.get_mds_info()) {
1145 if (info
.join_fscid
== fscid
) {
1146 modify_daemon(gid
, [](auto& info
) {
1147 info
.join_fscid
= FS_CLUSTER_ID_NONE
;