1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
19 using std::stringstream
;
22 void Filesystem::dump(Formatter
*f
) const
24 f
->open_object_section("mdsmap");
27 f
->dump_int("id", fscid
);
30 void FSMap::dump(Formatter
*f
) const
32 f
->dump_int("epoch", epoch
);
34 f
->open_object_section("compat");
38 f
->open_object_section("feature_flags");
39 f
->dump_bool("enable_multiple", enable_multiple
);
40 f
->dump_bool("ever_enabled_multiple", ever_enabled_multiple
);
43 f
->open_array_section("standbys");
44 for (const auto &i
: standby_daemons
) {
45 f
->open_object_section("info");
47 f
->dump_int("epoch", standby_epochs
.at(i
.first
));
52 f
->open_array_section("filesystems");
53 for (const auto &fs
: filesystems
) {
54 f
->open_object_section("filesystem");
61 void FSMap::generate_test_instances(list
<FSMap
*>& ls
)
63 FSMap
*m
= new FSMap();
65 std::list
<MDSMap
*> mds_map_instances
;
66 MDSMap::generate_test_instances(mds_map_instances
);
69 for (auto i
: mds_map_instances
) {
70 auto fs
= std::make_shared
<Filesystem
>();
74 m
->filesystems
[fs
->fscid
] = fs
;
76 mds_map_instances
.clear();
81 void FSMap::print(ostream
& out
) const
83 out
<< "e" << epoch
<< std::endl
;
84 out
<< "enable_multiple, ever_enabled_multiple: " << enable_multiple
<< ","
85 << ever_enabled_multiple
<< std::endl
;
86 out
<< "compat: " << compat
<< std::endl
;
87 out
<< "legacy client fscid: " << legacy_client_fscid
<< std::endl
;
88 out
<< " " << std::endl
;
90 if (filesystems
.empty()) {
91 out
<< "No filesystems configured" << std::endl
;
95 for (const auto &fs
: filesystems
) {
96 fs
.second
->print(out
);
97 out
<< " " << std::endl
<< " " << std::endl
; // Space out a bit
100 if (!standby_daemons
.empty()) {
101 out
<< "Standby daemons:" << std::endl
<< " " << std::endl
;
104 for (const auto &p
: standby_daemons
) {
105 p
.second
.print_summary(out
);
112 void FSMap::print_summary(Formatter
*f
, ostream
*out
) const
114 map
<mds_role_t
,string
> by_rank
;
115 map
<string
,int> by_state
;
118 f
->dump_unsigned("epoch", get_epoch());
119 for (auto i
: filesystems
) {
121 f
->dump_unsigned("id", fs
->fscid
);
122 f
->dump_unsigned("up", fs
->mds_map
.up
.size());
123 f
->dump_unsigned("in", fs
->mds_map
.in
.size());
124 f
->dump_unsigned("max", fs
->mds_map
.max_mds
);
127 if (filesystems
.size() == 1) {
128 auto fs
= filesystems
.begin()->second
;
129 *out
<< fs
->mds_map
.up
.size() << "/" << fs
->mds_map
.in
.size() << "/"
130 << fs
->mds_map
.max_mds
<< " up";
132 for (auto i
: filesystems
) {
134 *out
<< fs
->mds_map
.fs_name
<< "-" << fs
->mds_map
.up
.size() << "/"
135 << fs
->mds_map
.in
.size() << "/" << fs
->mds_map
.max_mds
<< " up ";
141 f
->open_array_section("by_rank");
144 const auto all_info
= get_mds_info();
145 for (const auto &p
: all_info
) {
146 const auto &info
= p
.second
;
147 string s
= ceph_mds_state_name(info
.state
);
149 s
+= "(laggy or crashed)";
152 const fs_cluster_id_t fscid
= mds_roles
.at(info
.global_id
);
154 if (info
.rank
!= MDS_RANK_NONE
&&
155 info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
157 f
->open_object_section("mds");
158 f
->dump_unsigned("filesystem_id", fscid
);
159 f
->dump_unsigned("rank", info
.rank
);
160 f
->dump_string("name", info
.name
);
161 f
->dump_string("status", s
);
164 by_rank
[mds_role_t(fscid
, info
.rank
)] = info
.name
+ "=" + s
;
174 if (!by_rank
.empty()) {
175 if (filesystems
.size() > 1) {
176 // Disambiguate filesystems
177 std::map
<std::string
, std::string
> pretty
;
178 for (auto i
: by_rank
) {
179 const auto &fs_name
= filesystems
.at(i
.first
.fscid
)->mds_map
.fs_name
;
180 std::ostringstream o
;
181 o
<< "[" << fs_name
<< ":" << i
.first
.rank
<< "]";
182 pretty
[o
.str()] = i
.second
;
184 *out
<< " " << pretty
;
186 // Omit FSCID in output when only one filesystem exists
187 std::map
<mds_rank_t
, std::string
> shortened
;
188 for (auto i
: by_rank
) {
189 shortened
[i
.first
.rank
] = i
.second
;
191 *out
<< " " << shortened
;
196 for (map
<string
,int>::reverse_iterator p
= by_state
.rbegin(); p
!= by_state
.rend(); ++p
) {
198 f
->dump_unsigned(p
->first
.c_str(), p
->second
);
200 *out
<< ", " << p
->second
<< " " << p
->first
;
206 for (auto i
: filesystems
) {
208 failed
+= fs
->mds_map
.failed
.size();
209 damaged
+= fs
->mds_map
.damaged
.size();
214 f
->dump_unsigned("failed", failed
);
216 *out
<< ", " << failed
<< " failed";
222 f
->dump_unsigned("damaged", damaged
);
224 *out
<< ", " << damaged
<< " damaged";
227 //if (stopped.size())
228 //out << ", " << stopped.size() << " stopped";
232 void FSMap::create_filesystem(const std::string
&name
,
233 int64_t metadata_pool
, int64_t data_pool
,
236 auto fs
= std::make_shared
<Filesystem
>();
237 fs
->mds_map
.fs_name
= name
;
238 fs
->mds_map
.max_mds
= 1;
239 fs
->mds_map
.data_pools
.push_back(data_pool
);
240 fs
->mds_map
.metadata_pool
= metadata_pool
;
241 fs
->mds_map
.cas_pool
= -1;
242 fs
->mds_map
.max_file_size
= g_conf
->mds_max_file_size
;
243 fs
->mds_map
.compat
= compat
;
244 fs
->mds_map
.created
= ceph_clock_now();
245 fs
->mds_map
.modified
= ceph_clock_now();
246 fs
->mds_map
.session_timeout
= g_conf
->mds_session_timeout
;
247 fs
->mds_map
.session_autoclose
= g_conf
->mds_session_autoclose
;
248 fs
->mds_map
.enabled
= true;
249 if (features
& CEPH_FEATURE_SERVER_JEWEL
) {
250 fs
->fscid
= next_filesystem_id
++;
251 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
252 // have initialized next_filesystem_id such that it's never used here.
253 assert(fs
->fscid
!= FS_CLUSTER_ID_ANONYMOUS
);
255 // Use anon fscid because this will get thrown away when encoding
256 // as legacy MDSMap for legacy mons.
257 assert(filesystems
.empty());
258 fs
->fscid
= FS_CLUSTER_ID_ANONYMOUS
;
260 filesystems
[fs
->fscid
] = fs
;
262 // Created first filesystem? Set it as the one
263 // for legacy clients to use
264 if (filesystems
.size() == 1) {
265 legacy_client_fscid
= fs
->fscid
;
269 void FSMap::reset_filesystem(fs_cluster_id_t fscid
)
271 auto fs
= get_filesystem(fscid
);
272 auto new_fs
= std::make_shared
<Filesystem
>();
274 // Populate rank 0 as existing (so don't go into CREATING)
275 // but failed (so that next available MDS is assigned the rank)
276 new_fs
->mds_map
.in
.insert(mds_rank_t(0));
277 new_fs
->mds_map
.failed
.insert(mds_rank_t(0));
279 // Carry forward what makes sense
280 new_fs
->fscid
= fs
->fscid
;
281 new_fs
->mds_map
.inline_data_enabled
= fs
->mds_map
.inline_data_enabled
;
282 new_fs
->mds_map
.max_mds
= 1;
283 new_fs
->mds_map
.data_pools
= fs
->mds_map
.data_pools
;
284 new_fs
->mds_map
.metadata_pool
= fs
->mds_map
.metadata_pool
;
285 new_fs
->mds_map
.cas_pool
= fs
->mds_map
.cas_pool
;
286 new_fs
->mds_map
.fs_name
= fs
->mds_map
.fs_name
;
287 new_fs
->mds_map
.max_file_size
= g_conf
->mds_max_file_size
;
288 new_fs
->mds_map
.compat
= compat
;
289 new_fs
->mds_map
.created
= ceph_clock_now();
290 new_fs
->mds_map
.modified
= ceph_clock_now();
291 new_fs
->mds_map
.session_timeout
= g_conf
->mds_session_timeout
;
292 new_fs
->mds_map
.session_autoclose
= g_conf
->mds_session_autoclose
;
293 new_fs
->mds_map
.standby_count_wanted
= fs
->mds_map
.standby_count_wanted
;
294 new_fs
->mds_map
.enabled
= true;
296 // Persist the new FSMap
297 filesystems
[new_fs
->fscid
] = new_fs
;
300 void FSMap::get_health(list
<pair
<health_status_t
,string
> >& summary
,
301 list
<pair
<health_status_t
,string
> > *detail
) const
303 mds_rank_t standby_count_wanted
= 0;
304 for (const auto &i
: filesystems
) {
305 const auto &fs
= i
.second
;
307 // TODO: move get_health up into here so that we can qualify
308 // all the messages with what filesystem they're talking about
309 fs
->mds_map
.get_health(summary
, detail
);
311 standby_count_wanted
= std::max(standby_count_wanted
, fs
->mds_map
.get_standby_count_wanted((mds_rank_t
)standby_daemons
.size()));
314 if (standby_count_wanted
) {
315 std::ostringstream oss
;
316 oss
<< "insufficient standby daemons available: have " << standby_daemons
.size() << "; want " << standby_count_wanted
<< " more";
317 summary
.push_back(make_pair(HEALTH_WARN
, oss
.str()));
321 bool FSMap::check_health(void)
323 bool changed
= false;
324 for (auto &i
: filesystems
) {
325 changed
|= i
.second
->mds_map
.check_health((mds_rank_t
)standby_daemons
.size());
330 void FSMap::encode(bufferlist
& bl
, uint64_t features
) const
332 if (features
& CEPH_FEATURE_SERVER_JEWEL
) {
333 ENCODE_START(7, 6, bl
);
335 ::encode(next_filesystem_id
, bl
);
336 ::encode(legacy_client_fscid
, bl
);
337 ::encode(compat
, bl
);
338 ::encode(enable_multiple
, bl
);
339 std::vector
<Filesystem
> fs_list
;
340 for (auto i
: filesystems
) {
341 fs_list
.push_back(*(i
.second
));
343 ::encode(fs_list
, bl
, features
);
344 ::encode(mds_roles
, bl
);
345 ::encode(standby_daemons
, bl
, features
);
346 ::encode(standby_epochs
, bl
);
347 ::encode(ever_enabled_multiple
, bl
);
350 if (filesystems
.empty()) {
352 disabled_map
.epoch
= epoch
;
353 disabled_map
.encode(bl
, features
);
355 // MDSMonitor should never have created multiple filesystems
356 // until the quorum features indicated Jewel
357 assert(filesystems
.size() == 1);
358 auto fs
= filesystems
.begin()->second
;
360 // Take the MDSMap for the enabled filesystem, and populated its
361 // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
362 MDSMap full_mdsmap
= fs
->mds_map
;
363 full_mdsmap
.epoch
= epoch
;
364 for (const auto &p
: standby_daemons
) {
365 full_mdsmap
.mds_info
[p
.first
] = p
.second
;
368 // Old MDSMaps don't set rank on standby replay daemons
369 for (auto &i
: full_mdsmap
.mds_info
) {
370 auto &info
= i
.second
;
371 if (info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
372 info
.rank
= MDS_RANK_NONE
;
376 full_mdsmap
.encode(bl
, features
);
381 void FSMap::decode(bufferlist::iterator
& p
)
383 // Because the mon used to store an MDSMap where we now
384 // store an FSMap, FSMap knows how to decode the legacy
385 // MDSMap format (it never needs to encode it though).
386 MDSMap legacy_mds_map
;
388 // The highest MDSMap encoding version before we changed the
389 // MDSMonitor to store an FSMap instead of an MDSMap was
390 // 5, so anything older than 6 is decoded as an MDSMap,
391 // and anything newer is decoded as an FSMap.
392 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p
);
394 // Decoding an MDSMap (upgrade)
396 ::decode(legacy_mds_map
.flags
, p
);
397 ::decode(legacy_mds_map
.last_failure
, p
);
398 ::decode(legacy_mds_map
.root
, p
);
399 ::decode(legacy_mds_map
.session_timeout
, p
);
400 ::decode(legacy_mds_map
.session_autoclose
, p
);
401 ::decode(legacy_mds_map
.max_file_size
, p
);
402 ::decode(legacy_mds_map
.max_mds
, p
);
403 ::decode(legacy_mds_map
.mds_info
, p
);
410 legacy_mds_map
.data_pools
.push_back(m
);
414 legacy_mds_map
.cas_pool
= s
;
416 ::decode(legacy_mds_map
.data_pools
, p
);
417 ::decode(legacy_mds_map
.cas_pool
, p
);
420 // kclient ignores everything from here
425 ::decode(legacy_mds_map
.compat
, p
);
427 legacy_mds_map
.compat
= get_mdsmap_compat_set_base();
431 legacy_mds_map
.metadata_pool
= n
;
433 ::decode(legacy_mds_map
.metadata_pool
, p
);
435 ::decode(legacy_mds_map
.created
, p
);
436 ::decode(legacy_mds_map
.modified
, p
);
437 ::decode(legacy_mds_map
.tableserver
, p
);
438 ::decode(legacy_mds_map
.in
, p
);
439 std::map
<mds_rank_t
,int32_t> inc
; // Legacy field, parse and drop
441 ::decode(legacy_mds_map
.up
, p
);
442 ::decode(legacy_mds_map
.failed
, p
);
443 ::decode(legacy_mds_map
.stopped
, p
);
445 ::decode(legacy_mds_map
.last_failure_osd_epoch
, p
);
448 // previously this was a bool about snaps, not a flag map
451 legacy_mds_map
.ever_allowed_features
= flag
?
452 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
454 legacy_mds_map
.explicitly_allowed_features
= flag
?
455 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
456 if (legacy_mds_map
.max_mds
> 1) {
457 legacy_mds_map
.set_multimds_allowed();
460 ::decode(legacy_mds_map
.ever_allowed_features
, p
);
461 ::decode(legacy_mds_map
.explicitly_allowed_features
, p
);
464 legacy_mds_map
.ever_allowed_features
= CEPH_MDSMAP_ALLOW_CLASSICS
;
465 legacy_mds_map
.explicitly_allowed_features
= 0;
466 if (legacy_mds_map
.max_mds
> 1) {
467 legacy_mds_map
.set_multimds_allowed();
471 ::decode(legacy_mds_map
.inline_data_enabled
, p
);
474 assert(struct_v
>= 5);
475 ::decode(legacy_mds_map
.enabled
, p
);
476 ::decode(legacy_mds_map
.fs_name
, p
);
478 legacy_mds_map
.fs_name
= "default";
480 // If an MDS has ever been started, epoch will be greater than 1,
481 // assume filesystem is enabled.
482 legacy_mds_map
.enabled
= true;
484 // Upgrading from a cluster that never used an MDS, switch off
485 // filesystem until it's explicitly enabled.
486 legacy_mds_map
.enabled
= false;
491 ::decode(legacy_mds_map
.damaged
, p
);
494 // We're upgrading, populate filesystems from the legacy fields
496 standby_daemons
.clear();
497 standby_epochs
.clear();
499 compat
= legacy_mds_map
.compat
;
500 enable_multiple
= false;
502 // Synthesise a Filesystem from legacy_mds_map, if enabled
503 if (legacy_mds_map
.enabled
) {
504 // Construct a Filesystem from the legacy MDSMap
505 auto migrate_fs
= std::make_shared
<Filesystem
>();
506 migrate_fs
->fscid
= FS_CLUSTER_ID_ANONYMOUS
;
507 migrate_fs
->mds_map
= legacy_mds_map
;
508 migrate_fs
->mds_map
.epoch
= epoch
;
509 filesystems
[migrate_fs
->fscid
] = migrate_fs
;
511 // List of GIDs that had invalid states
512 std::set
<mds_gid_t
> drop_gids
;
514 // Construct mds_roles, standby_daemons, and remove
515 // standbys from the MDSMap in the Filesystem.
516 for (auto &p
: migrate_fs
->mds_map
.mds_info
) {
517 if (p
.second
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
518 // In legacy MDSMap, standby replay daemons don't have
519 // rank set, but since FSMap they do.
520 p
.second
.rank
= p
.second
.standby_for_rank
;
522 if (p
.second
.rank
== MDS_RANK_NONE
) {
523 if (p
.second
.state
!= MDSMap::STATE_STANDBY
) {
524 // Old MDSMaps can have down:dne here, which
525 // is invalid in an FSMap (#17837)
526 drop_gids
.insert(p
.first
);
528 insert(p
.second
); // into standby_daemons
531 mds_roles
[p
.first
] = migrate_fs
->fscid
;
534 for (const auto &p
: standby_daemons
) {
535 // Erase from this Filesystem's MDSMap, because it has
536 // been copied into FSMap::Standby_daemons above
537 migrate_fs
->mds_map
.mds_info
.erase(p
.first
);
539 for (const auto &gid
: drop_gids
) {
540 // Throw away all info for this MDS because it was identified
541 // as having invalid state above.
542 migrate_fs
->mds_map
.mds_info
.erase(gid
);
545 legacy_client_fscid
= migrate_fs
->fscid
;
547 legacy_client_fscid
= FS_CLUSTER_ID_NONE
;
551 ::decode(next_filesystem_id
, p
);
552 ::decode(legacy_client_fscid
, p
);
554 ::decode(enable_multiple
, p
);
555 std::vector
<Filesystem
> fs_list
;
556 ::decode(fs_list
, p
);
558 for (std::vector
<Filesystem
>::const_iterator fs
= fs_list
.begin(); fs
!= fs_list
.end(); ++fs
) {
559 filesystems
[fs
->fscid
] = std::make_shared
<Filesystem
>(*fs
);
562 ::decode(mds_roles
, p
);
563 ::decode(standby_daemons
, p
);
564 ::decode(standby_epochs
, p
);
566 ::decode(ever_enabled_multiple
, p
);
574 void Filesystem::encode(bufferlist
& bl
, uint64_t features
) const
576 ENCODE_START(1, 1, bl
);
578 bufferlist mdsmap_bl
;
579 mds_map
.encode(mdsmap_bl
, features
);
580 ::encode(mdsmap_bl
, bl
);
584 void Filesystem::decode(bufferlist::iterator
& p
)
588 bufferlist mdsmap_bl
;
589 ::decode(mdsmap_bl
, p
);
590 bufferlist::iterator mdsmap_bl_iter
= mdsmap_bl
.begin();
591 mds_map
.decode(mdsmap_bl_iter
);
595 int FSMap::parse_filesystem(
596 std::string
const &ns_str
,
597 std::shared_ptr
<const Filesystem
> *result
601 fs_cluster_id_t fscid
= strict_strtol(ns_str
.c_str(), 10, &ns_err
);
602 if (!ns_err
.empty() || filesystems
.count(fscid
) == 0) {
603 for (auto &fs
: filesystems
) {
604 if (fs
.second
->mds_map
.fs_name
== ns_str
) {
605 *result
= std::const_pointer_cast
<const Filesystem
>(fs
.second
);
611 *result
= get_filesystem(fscid
);
616 void Filesystem::print(std::ostream
&out
) const
618 out
<< "Filesystem '" << mds_map
.fs_name
619 << "' (" << fscid
<< ")" << std::endl
;
623 mds_gid_t
FSMap::find_standby_for(mds_role_t role
, const std::string
& name
) const
625 mds_gid_t result
= MDS_GID_NONE
;
627 // First see if we have a STANDBY_REPLAY
628 auto fs
= get_filesystem(role
.fscid
);
629 for (const auto &i
: fs
->mds_map
.mds_info
) {
630 const auto &info
= i
.second
;
631 if (info
.rank
== role
.rank
&& info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
632 return info
.global_id
;
636 // See if there are any STANDBY daemons available
637 for (const auto &i
: standby_daemons
) {
638 const auto &gid
= i
.first
;
639 const auto &info
= i
.second
;
640 assert(info
.state
== MDSMap::STATE_STANDBY
);
641 assert(info
.rank
== MDS_RANK_NONE
);
647 // The mds_info_t may or may not tell us exactly which filesystem
648 // the standby_for_rank refers to: lookup via legacy_client_fscid
649 mds_role_t target_role
= {
650 info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
?
651 legacy_client_fscid
: info
.standby_for_fscid
,
652 info
.standby_for_rank
};
654 if ((target_role
.rank
== role
.rank
&& target_role
.fscid
== role
.fscid
)
655 || (name
.length() && info
.standby_for_name
== name
)) {
656 // It's a named standby for *me*, use it.
659 info
.standby_for_rank
< 0 && info
.standby_for_name
.length() == 0 &&
660 (info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
||
661 info
.standby_for_fscid
== role
.fscid
)) {
662 // It's not a named standby for anyone, use it if we don't find
663 // a named standby for me later, unless it targets another FSCID.
671 mds_gid_t
FSMap::find_unused_for(mds_role_t role
,
672 bool force_standby_active
) const {
673 for (const auto &i
: standby_daemons
) {
674 const auto &gid
= i
.first
;
675 const auto &info
= i
.second
;
676 assert(info
.state
== MDSMap::STATE_STANDBY
);
678 if (info
.laggy() || info
.rank
>= 0)
681 if (info
.standby_for_fscid
!= FS_CLUSTER_ID_NONE
&&
682 info
.standby_for_fscid
!= role
.fscid
)
684 if (info
.standby_for_rank
!= MDS_RANK_NONE
&&
685 info
.standby_for_rank
!= role
.rank
)
688 // To be considered 'unused' a daemon must either not
689 // be selected for standby-replay or the force_standby_active
690 // setting must be enabled to use replay daemons anyway.
691 if (!info
.standby_replay
|| force_standby_active
) {
698 mds_gid_t
FSMap::find_replacement_for(mds_role_t role
, const std::string
& name
,
699 bool force_standby_active
) const {
700 const mds_gid_t standby
= find_standby_for(role
, name
);
704 return find_unused_for(role
, force_standby_active
);
707 void FSMap::sanity() const
709 if (legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
710 assert(filesystems
.count(legacy_client_fscid
) == 1);
713 for (const auto &i
: filesystems
) {
715 assert(fs
->mds_map
.compat
.compare(compat
) == 0);
716 assert(fs
->fscid
== i
.first
);
717 for (const auto &j
: fs
->mds_map
.mds_info
) {
718 assert(j
.second
.rank
!= MDS_RANK_NONE
);
719 assert(mds_roles
.count(j
.first
) == 1);
720 assert(standby_daemons
.count(j
.first
) == 0);
721 assert(standby_epochs
.count(j
.first
) == 0);
722 assert(mds_roles
.at(j
.first
) == i
.first
);
723 if (j
.second
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
724 assert(fs
->mds_map
.up
.at(j
.second
.rank
) == j
.first
);
725 assert(fs
->mds_map
.failed
.count(j
.second
.rank
) == 0);
726 assert(fs
->mds_map
.damaged
.count(j
.second
.rank
) == 0);
730 for (const auto &j
: fs
->mds_map
.up
) {
731 mds_rank_t rank
= j
.first
;
732 assert(fs
->mds_map
.in
.count(rank
) == 1);
733 mds_gid_t gid
= j
.second
;
734 assert(fs
->mds_map
.mds_info
.count(gid
) == 1);
738 for (const auto &i
: standby_daemons
) {
739 assert(i
.second
.state
== MDSMap::STATE_STANDBY
);
740 assert(i
.second
.rank
== MDS_RANK_NONE
);
741 assert(i
.second
.global_id
== i
.first
);
742 assert(standby_epochs
.count(i
.first
) == 1);
743 assert(mds_roles
.count(i
.first
) == 1);
744 assert(mds_roles
.at(i
.first
) == FS_CLUSTER_ID_NONE
);
747 for (const auto &i
: standby_epochs
) {
748 assert(standby_daemons
.count(i
.first
) == 1);
751 for (const auto &i
: mds_roles
) {
752 if (i
.second
== FS_CLUSTER_ID_NONE
) {
753 assert(standby_daemons
.count(i
.first
) == 1);
755 assert(filesystems
.count(i
.second
) == 1);
756 assert(filesystems
.at(i
.second
)->mds_map
.mds_info
.count(i
.first
) == 1);
762 mds_gid_t standby_gid
,
763 const std::shared_ptr
<Filesystem
> &filesystem
,
764 mds_rank_t assigned_rank
)
766 assert(gid_exists(standby_gid
));
767 bool is_standby_replay
= mds_roles
.at(standby_gid
) != FS_CLUSTER_ID_NONE
;
768 if (!is_standby_replay
) {
769 assert(standby_daemons
.count(standby_gid
));
770 assert(standby_daemons
.at(standby_gid
).state
== MDSMap::STATE_STANDBY
);
773 MDSMap
&mds_map
= filesystem
->mds_map
;
775 // Insert daemon state to Filesystem
776 if (!is_standby_replay
) {
777 mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
779 assert(mds_map
.mds_info
.count(standby_gid
));
780 assert(mds_map
.mds_info
.at(standby_gid
).state
== MDSMap::STATE_STANDBY_REPLAY
);
781 assert(mds_map
.mds_info
.at(standby_gid
).rank
== assigned_rank
);
783 MDSMap::mds_info_t
&info
= mds_map
.mds_info
[standby_gid
];
785 if (mds_map
.stopped
.erase(assigned_rank
)) {
786 // The cluster is being expanded with a stopped rank
787 info
.state
= MDSMap::STATE_STARTING
;
788 } else if (!mds_map
.is_in(assigned_rank
)) {
789 // The cluster is being expanded with a new rank
790 info
.state
= MDSMap::STATE_CREATING
;
792 // An existing rank is being assigned to a replacement
793 info
.state
= MDSMap::STATE_REPLAY
;
794 mds_map
.failed
.erase(assigned_rank
);
796 info
.rank
= assigned_rank
;
798 mds_roles
[standby_gid
] = filesystem
->fscid
;
800 // Update the rank state in Filesystem
801 mds_map
.in
.insert(assigned_rank
);
802 mds_map
.up
[assigned_rank
] = standby_gid
;
804 // Remove from the list of standbys
805 if (!is_standby_replay
) {
806 standby_daemons
.erase(standby_gid
);
807 standby_epochs
.erase(standby_gid
);
810 // Indicate that Filesystem has been modified
811 mds_map
.epoch
= epoch
;
814 void FSMap::assign_standby_replay(
815 const mds_gid_t standby_gid
,
816 const fs_cluster_id_t leader_ns
,
817 const mds_rank_t leader_rank
)
819 assert(mds_roles
.at(standby_gid
) == FS_CLUSTER_ID_NONE
);
820 assert(gid_exists(standby_gid
));
821 assert(!gid_has_rank(standby_gid
));
822 assert(standby_daemons
.count(standby_gid
));
824 // Insert to the filesystem
825 auto fs
= filesystems
.at(leader_ns
);
826 fs
->mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
827 fs
->mds_map
.mds_info
[standby_gid
].rank
= leader_rank
;
828 fs
->mds_map
.mds_info
[standby_gid
].state
= MDSMap::STATE_STANDBY_REPLAY
;
829 mds_roles
[standby_gid
] = leader_ns
;
831 // Remove from the list of standbys
832 standby_daemons
.erase(standby_gid
);
833 standby_epochs
.erase(standby_gid
);
835 // Indicate that Filesystem has been modified
836 fs
->mds_map
.epoch
= epoch
;
839 void FSMap::erase(mds_gid_t who
, epoch_t blacklist_epoch
)
841 if (mds_roles
.at(who
) == FS_CLUSTER_ID_NONE
) {
842 standby_daemons
.erase(who
);
843 standby_epochs
.erase(who
);
845 auto &fs
= filesystems
.at(mds_roles
.at(who
));
846 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
847 if (info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
848 if (info
.state
== MDSMap::STATE_CREATING
) {
849 // If this gid didn't make it past CREATING, then forget
850 // the rank ever existed so that next time it's handed out
851 // to a gid it'll go back into CREATING.
852 fs
->mds_map
.in
.erase(info
.rank
);
854 // Put this rank into the failed list so that the next available
855 // STANDBY will pick it up.
856 fs
->mds_map
.failed
.insert(info
.rank
);
858 assert(fs
->mds_map
.up
.at(info
.rank
) == info
.global_id
);
859 fs
->mds_map
.up
.erase(info
.rank
);
861 fs
->mds_map
.mds_info
.erase(who
);
862 fs
->mds_map
.last_failure_osd_epoch
= blacklist_epoch
;
863 fs
->mds_map
.epoch
= epoch
;
866 mds_roles
.erase(who
);
869 void FSMap::damaged(mds_gid_t who
, epoch_t blacklist_epoch
)
871 assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
872 auto fs
= filesystems
.at(mds_roles
.at(who
));
873 mds_rank_t rank
= fs
->mds_map
.mds_info
[who
].rank
;
875 erase(who
, blacklist_epoch
);
876 fs
->mds_map
.failed
.erase(rank
);
877 fs
->mds_map
.damaged
.insert(rank
);
879 assert(fs
->mds_map
.epoch
== epoch
);
883 * Update to indicate that the rank `rank` is to be removed
884 * from the damaged list of the filesystem `fscid`
886 bool FSMap::undamaged(const fs_cluster_id_t fscid
, const mds_rank_t rank
)
888 auto fs
= filesystems
.at(fscid
);
890 if (fs
->mds_map
.damaged
.erase(rank
)) {
891 fs
->mds_map
.failed
.insert(rank
);
892 fs
->mds_map
.epoch
= epoch
;
899 void FSMap::insert(const MDSMap::mds_info_t
&new_info
)
901 assert(new_info
.state
== MDSMap::STATE_STANDBY
);
902 assert(new_info
.rank
== MDS_RANK_NONE
);
903 mds_roles
[new_info
.global_id
] = FS_CLUSTER_ID_NONE
;
904 standby_daemons
[new_info
.global_id
] = new_info
;
905 standby_epochs
[new_info
.global_id
] = epoch
;
908 std::list
<mds_gid_t
> FSMap::stop(mds_gid_t who
)
910 assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
911 auto fs
= filesystems
.at(mds_roles
.at(who
));
912 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
913 fs
->mds_map
.up
.erase(info
.rank
);
914 fs
->mds_map
.in
.erase(info
.rank
);
915 fs
->mds_map
.stopped
.insert(info
.rank
);
917 // Also drop any standby replays that were following this rank
918 std::list
<mds_gid_t
> standbys
;
919 for (const auto &i
: fs
->mds_map
.mds_info
) {
920 const auto &other_gid
= i
.first
;
921 const auto &other_info
= i
.second
;
922 if (other_info
.rank
== info
.rank
923 && other_info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
924 standbys
.push_back(other_gid
);
929 fs
->mds_map
.mds_info
.erase(who
);
930 mds_roles
.erase(who
);
932 fs
->mds_map
.epoch
= epoch
;
939 * Given one of the following forms:
944 * Parse into a mds_role_t. The rank-only form is only valid
945 * if legacy_client_ns is set.
947 int FSMap::parse_role(
948 const std::string
&role_str
,
950 std::ostream
&ss
) const
952 size_t colon_pos
= role_str
.find(":");
954 std::shared_ptr
<const Filesystem
> fs
;
955 if (colon_pos
== std::string::npos
) {
956 if (legacy_client_fscid
== FS_CLUSTER_ID_NONE
) {
957 ss
<< "No filesystem selected";
960 fs
= get_filesystem(legacy_client_fscid
);
963 if (parse_filesystem(role_str
.substr(0, colon_pos
), &fs
) < 0) {
964 ss
<< "Invalid filesystem";
967 rank_pos
= colon_pos
+1;
972 std::string rank_str
= role_str
.substr(rank_pos
);
973 long rank_i
= strict_strtol(rank_str
.c_str(), 10, &err
);
974 if (rank_i
< 0 || !err
.empty()) {
975 ss
<< "Invalid rank '" << rank_str
<< "'";
981 if (fs
->mds_map
.in
.count(rank
) == 0) {
982 ss
<< "Rank '" << rank
<< "' not found";
986 *role
= {fs
->fscid
, rank
};