1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
19 using std::stringstream
;
21 #include "mon/health_check.h"
24 void Filesystem::dump(Formatter
*f
) const
26 f
->open_object_section("mdsmap");
29 f
->dump_int("id", fscid
);
32 void FSMap::dump(Formatter
*f
) const
34 f
->dump_int("epoch", epoch
);
36 f
->open_object_section("compat");
40 f
->open_object_section("feature_flags");
41 f
->dump_bool("enable_multiple", enable_multiple
);
42 f
->dump_bool("ever_enabled_multiple", ever_enabled_multiple
);
45 f
->open_array_section("standbys");
46 for (const auto &i
: standby_daemons
) {
47 f
->open_object_section("info");
49 f
->dump_int("epoch", standby_epochs
.at(i
.first
));
54 f
->open_array_section("filesystems");
55 for (const auto &fs
: filesystems
) {
56 f
->open_object_section("filesystem");
63 void FSMap::generate_test_instances(list
<FSMap
*>& ls
)
65 FSMap
*m
= new FSMap();
67 std::list
<MDSMap
*> mds_map_instances
;
68 MDSMap::generate_test_instances(mds_map_instances
);
71 for (auto i
: mds_map_instances
) {
72 auto fs
= std::make_shared
<Filesystem
>();
76 m
->filesystems
[fs
->fscid
] = fs
;
78 mds_map_instances
.clear();
83 void FSMap::print(ostream
& out
) const
85 out
<< "e" << epoch
<< std::endl
;
86 out
<< "enable_multiple, ever_enabled_multiple: " << enable_multiple
<< ","
87 << ever_enabled_multiple
<< std::endl
;
88 out
<< "compat: " << compat
<< std::endl
;
89 out
<< "legacy client fscid: " << legacy_client_fscid
<< std::endl
;
90 out
<< " " << std::endl
;
92 if (filesystems
.empty()) {
93 out
<< "No filesystems configured" << std::endl
;
97 for (const auto &fs
: filesystems
) {
98 fs
.second
->print(out
);
99 out
<< " " << std::endl
<< " " << std::endl
; // Space out a bit
102 if (!standby_daemons
.empty()) {
103 out
<< "Standby daemons:" << std::endl
<< " " << std::endl
;
106 for (const auto &p
: standby_daemons
) {
107 p
.second
.print_summary(out
);
114 void FSMap::print_summary(Formatter
*f
, ostream
*out
) const
116 map
<mds_role_t
,string
> by_rank
;
117 map
<string
,int> by_state
;
120 f
->dump_unsigned("epoch", get_epoch());
121 for (auto i
: filesystems
) {
123 f
->dump_unsigned("id", fs
->fscid
);
124 f
->dump_unsigned("up", fs
->mds_map
.up
.size());
125 f
->dump_unsigned("in", fs
->mds_map
.in
.size());
126 f
->dump_unsigned("max", fs
->mds_map
.max_mds
);
129 if (filesystems
.size() == 1) {
130 auto fs
= filesystems
.begin()->second
;
131 *out
<< fs
->mds_map
.up
.size() << "/" << fs
->mds_map
.in
.size() << "/"
132 << fs
->mds_map
.max_mds
<< " up";
134 for (auto i
: filesystems
) {
136 *out
<< fs
->mds_map
.fs_name
<< "-" << fs
->mds_map
.up
.size() << "/"
137 << fs
->mds_map
.in
.size() << "/" << fs
->mds_map
.max_mds
<< " up ";
143 f
->open_array_section("by_rank");
146 const auto all_info
= get_mds_info();
147 for (const auto &p
: all_info
) {
148 const auto &info
= p
.second
;
149 string s
= ceph_mds_state_name(info
.state
);
151 s
+= "(laggy or crashed)";
154 const fs_cluster_id_t fscid
= mds_roles
.at(info
.global_id
);
156 if (info
.rank
!= MDS_RANK_NONE
&&
157 info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
159 f
->open_object_section("mds");
160 f
->dump_unsigned("filesystem_id", fscid
);
161 f
->dump_unsigned("rank", info
.rank
);
162 f
->dump_string("name", info
.name
);
163 f
->dump_string("status", s
);
166 by_rank
[mds_role_t(fscid
, info
.rank
)] = info
.name
+ "=" + s
;
176 if (!by_rank
.empty()) {
177 if (filesystems
.size() > 1) {
178 // Disambiguate filesystems
179 std::map
<std::string
, std::string
> pretty
;
180 for (auto i
: by_rank
) {
181 const auto &fs_name
= filesystems
.at(i
.first
.fscid
)->mds_map
.fs_name
;
182 std::ostringstream o
;
183 o
<< "[" << fs_name
<< ":" << i
.first
.rank
<< "]";
184 pretty
[o
.str()] = i
.second
;
186 *out
<< " " << pretty
;
188 // Omit FSCID in output when only one filesystem exists
189 std::map
<mds_rank_t
, std::string
> shortened
;
190 for (auto i
: by_rank
) {
191 shortened
[i
.first
.rank
] = i
.second
;
193 *out
<< " " << shortened
;
198 for (map
<string
,int>::reverse_iterator p
= by_state
.rbegin(); p
!= by_state
.rend(); ++p
) {
200 f
->dump_unsigned(p
->first
.c_str(), p
->second
);
202 *out
<< ", " << p
->second
<< " " << p
->first
;
208 for (auto i
: filesystems
) {
210 failed
+= fs
->mds_map
.failed
.size();
211 damaged
+= fs
->mds_map
.damaged
.size();
216 f
->dump_unsigned("failed", failed
);
218 *out
<< ", " << failed
<< " failed";
224 f
->dump_unsigned("damaged", damaged
);
226 *out
<< ", " << damaged
<< " damaged";
229 //if (stopped.size())
230 //out << ", " << stopped.size() << " stopped";
234 void FSMap::create_filesystem(const std::string
&name
,
235 int64_t metadata_pool
, int64_t data_pool
,
238 auto fs
= std::make_shared
<Filesystem
>();
239 fs
->mds_map
.fs_name
= name
;
240 fs
->mds_map
.max_mds
= 1;
241 fs
->mds_map
.data_pools
.push_back(data_pool
);
242 fs
->mds_map
.metadata_pool
= metadata_pool
;
243 fs
->mds_map
.cas_pool
= -1;
244 fs
->mds_map
.max_file_size
= g_conf
->mds_max_file_size
;
245 fs
->mds_map
.compat
= compat
;
246 fs
->mds_map
.created
= ceph_clock_now();
247 fs
->mds_map
.modified
= ceph_clock_now();
248 fs
->mds_map
.session_timeout
= g_conf
->mds_session_timeout
;
249 fs
->mds_map
.session_autoclose
= g_conf
->mds_session_autoclose
;
250 fs
->mds_map
.enabled
= true;
251 if (features
& CEPH_FEATURE_SERVER_JEWEL
) {
252 fs
->fscid
= next_filesystem_id
++;
253 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
254 // have initialized next_filesystem_id such that it's never used here.
255 assert(fs
->fscid
!= FS_CLUSTER_ID_ANONYMOUS
);
257 // Use anon fscid because this will get thrown away when encoding
258 // as legacy MDSMap for legacy mons.
259 assert(filesystems
.empty());
260 fs
->fscid
= FS_CLUSTER_ID_ANONYMOUS
;
262 filesystems
[fs
->fscid
] = fs
;
264 // Created first filesystem? Set it as the one
265 // for legacy clients to use
266 if (filesystems
.size() == 1) {
267 legacy_client_fscid
= fs
->fscid
;
271 void FSMap::reset_filesystem(fs_cluster_id_t fscid
)
273 auto fs
= get_filesystem(fscid
);
274 auto new_fs
= std::make_shared
<Filesystem
>();
276 // Populate rank 0 as existing (so don't go into CREATING)
277 // but failed (so that next available MDS is assigned the rank)
278 new_fs
->mds_map
.in
.insert(mds_rank_t(0));
279 new_fs
->mds_map
.failed
.insert(mds_rank_t(0));
281 // Carry forward what makes sense
282 new_fs
->fscid
= fs
->fscid
;
283 new_fs
->mds_map
.inline_data_enabled
= fs
->mds_map
.inline_data_enabled
;
284 new_fs
->mds_map
.max_mds
= 1;
285 new_fs
->mds_map
.data_pools
= fs
->mds_map
.data_pools
;
286 new_fs
->mds_map
.metadata_pool
= fs
->mds_map
.metadata_pool
;
287 new_fs
->mds_map
.cas_pool
= fs
->mds_map
.cas_pool
;
288 new_fs
->mds_map
.fs_name
= fs
->mds_map
.fs_name
;
289 new_fs
->mds_map
.max_file_size
= g_conf
->mds_max_file_size
;
290 new_fs
->mds_map
.compat
= compat
;
291 new_fs
->mds_map
.created
= ceph_clock_now();
292 new_fs
->mds_map
.modified
= ceph_clock_now();
293 new_fs
->mds_map
.session_timeout
= g_conf
->mds_session_timeout
;
294 new_fs
->mds_map
.session_autoclose
= g_conf
->mds_session_autoclose
;
295 new_fs
->mds_map
.standby_count_wanted
= fs
->mds_map
.standby_count_wanted
;
296 new_fs
->mds_map
.enabled
= true;
298 // Persist the new FSMap
299 filesystems
[new_fs
->fscid
] = new_fs
;
302 void FSMap::get_health(list
<pair
<health_status_t
,string
> >& summary
,
303 list
<pair
<health_status_t
,string
> > *detail
) const
305 mds_rank_t standby_count_wanted
= 0;
306 for (const auto &i
: filesystems
) {
307 const auto &fs
= i
.second
;
309 // TODO: move get_health up into here so that we can qualify
310 // all the messages with what filesystem they're talking about
311 fs
->mds_map
.get_health(summary
, detail
);
313 standby_count_wanted
= std::max(standby_count_wanted
, fs
->mds_map
.get_standby_count_wanted((mds_rank_t
)standby_daemons
.size()));
316 if (standby_count_wanted
) {
317 std::ostringstream oss
;
318 oss
<< "insufficient standby daemons available: have " << standby_daemons
.size() << "; want " << standby_count_wanted
<< " more";
319 summary
.push_back(make_pair(HEALTH_WARN
, oss
.str()));
323 bool FSMap::check_health(void)
325 bool changed
= false;
326 for (auto &i
: filesystems
) {
327 changed
|= i
.second
->mds_map
.check_health((mds_rank_t
)standby_daemons
.size());
332 void FSMap::get_health_checks(health_check_map_t
*checks
) const
334 mds_rank_t standby_count_wanted
= 0;
335 for (const auto &i
: filesystems
) {
336 const auto &fs
= i
.second
;
337 health_check_map_t fschecks
;
338 fs
->mds_map
.get_health_checks(&fschecks
);
339 checks
->merge(fschecks
);
340 standby_count_wanted
= std::max(
341 standby_count_wanted
,
342 fs
->mds_map
.get_standby_count_wanted((mds_rank_t
)standby_daemons
.size()));
345 // MDS_INSUFFICIENT_STANDBY
346 if (standby_count_wanted
) {
347 std::ostringstream oss
, dss
;
348 oss
<< "insufficient standby daemons available";
349 auto& d
= checks
->add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN
, oss
.str());
350 dss
<< "have " << standby_daemons
.size() << "; want " << standby_count_wanted
352 d
.detail
.push_back(dss
.str());
356 void FSMap::encode(bufferlist
& bl
, uint64_t features
) const
358 if (features
& CEPH_FEATURE_SERVER_JEWEL
) {
359 ENCODE_START(7, 6, bl
);
361 ::encode(next_filesystem_id
, bl
);
362 ::encode(legacy_client_fscid
, bl
);
363 ::encode(compat
, bl
);
364 ::encode(enable_multiple
, bl
);
365 std::vector
<Filesystem
> fs_list
;
366 for (auto i
: filesystems
) {
367 fs_list
.push_back(*(i
.second
));
369 ::encode(fs_list
, bl
, features
);
370 ::encode(mds_roles
, bl
);
371 ::encode(standby_daemons
, bl
, features
);
372 ::encode(standby_epochs
, bl
);
373 ::encode(ever_enabled_multiple
, bl
);
376 if (filesystems
.empty()) {
378 disabled_map
.epoch
= epoch
;
379 disabled_map
.encode(bl
, features
);
381 // MDSMonitor should never have created multiple filesystems
382 // until the quorum features indicated Jewel
383 assert(filesystems
.size() == 1);
384 auto fs
= filesystems
.begin()->second
;
386 // Take the MDSMap for the enabled filesystem, and populated its
387 // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
388 MDSMap full_mdsmap
= fs
->mds_map
;
389 full_mdsmap
.epoch
= epoch
;
390 for (const auto &p
: standby_daemons
) {
391 full_mdsmap
.mds_info
[p
.first
] = p
.second
;
394 // Old MDSMaps don't set rank on standby replay daemons
395 for (auto &i
: full_mdsmap
.mds_info
) {
396 auto &info
= i
.second
;
397 if (info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
398 info
.rank
= MDS_RANK_NONE
;
402 full_mdsmap
.encode(bl
, features
);
407 void FSMap::decode(bufferlist::iterator
& p
)
409 // Because the mon used to store an MDSMap where we now
410 // store an FSMap, FSMap knows how to decode the legacy
411 // MDSMap format (it never needs to encode it though).
412 MDSMap legacy_mds_map
;
414 // The highest MDSMap encoding version before we changed the
415 // MDSMonitor to store an FSMap instead of an MDSMap was
416 // 5, so anything older than 6 is decoded as an MDSMap,
417 // and anything newer is decoded as an FSMap.
418 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p
);
420 // Decoding an MDSMap (upgrade)
422 ::decode(legacy_mds_map
.flags
, p
);
423 ::decode(legacy_mds_map
.last_failure
, p
);
424 ::decode(legacy_mds_map
.root
, p
);
425 ::decode(legacy_mds_map
.session_timeout
, p
);
426 ::decode(legacy_mds_map
.session_autoclose
, p
);
427 ::decode(legacy_mds_map
.max_file_size
, p
);
428 ::decode(legacy_mds_map
.max_mds
, p
);
429 ::decode(legacy_mds_map
.mds_info
, p
);
436 legacy_mds_map
.data_pools
.push_back(m
);
440 legacy_mds_map
.cas_pool
= s
;
442 ::decode(legacy_mds_map
.data_pools
, p
);
443 ::decode(legacy_mds_map
.cas_pool
, p
);
446 // kclient ignores everything from here
451 ::decode(legacy_mds_map
.compat
, p
);
453 legacy_mds_map
.compat
= get_mdsmap_compat_set_base();
457 legacy_mds_map
.metadata_pool
= n
;
459 ::decode(legacy_mds_map
.metadata_pool
, p
);
461 ::decode(legacy_mds_map
.created
, p
);
462 ::decode(legacy_mds_map
.modified
, p
);
463 ::decode(legacy_mds_map
.tableserver
, p
);
464 ::decode(legacy_mds_map
.in
, p
);
465 std::map
<mds_rank_t
,int32_t> inc
; // Legacy field, parse and drop
467 ::decode(legacy_mds_map
.up
, p
);
468 ::decode(legacy_mds_map
.failed
, p
);
469 ::decode(legacy_mds_map
.stopped
, p
);
471 ::decode(legacy_mds_map
.last_failure_osd_epoch
, p
);
474 // previously this was a bool about snaps, not a flag map
477 legacy_mds_map
.ever_allowed_features
= flag
?
478 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
480 legacy_mds_map
.explicitly_allowed_features
= flag
?
481 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
482 if (legacy_mds_map
.max_mds
> 1) {
483 legacy_mds_map
.set_multimds_allowed();
486 ::decode(legacy_mds_map
.ever_allowed_features
, p
);
487 ::decode(legacy_mds_map
.explicitly_allowed_features
, p
);
490 legacy_mds_map
.ever_allowed_features
= CEPH_MDSMAP_ALLOW_CLASSICS
;
491 legacy_mds_map
.explicitly_allowed_features
= 0;
492 if (legacy_mds_map
.max_mds
> 1) {
493 legacy_mds_map
.set_multimds_allowed();
497 ::decode(legacy_mds_map
.inline_data_enabled
, p
);
500 assert(struct_v
>= 5);
501 ::decode(legacy_mds_map
.enabled
, p
);
502 ::decode(legacy_mds_map
.fs_name
, p
);
504 legacy_mds_map
.fs_name
= "default";
506 // If an MDS has ever been started, epoch will be greater than 1,
507 // assume filesystem is enabled.
508 legacy_mds_map
.enabled
= true;
510 // Upgrading from a cluster that never used an MDS, switch off
511 // filesystem until it's explicitly enabled.
512 legacy_mds_map
.enabled
= false;
517 ::decode(legacy_mds_map
.damaged
, p
);
520 // We're upgrading, populate filesystems from the legacy fields
522 standby_daemons
.clear();
523 standby_epochs
.clear();
525 compat
= legacy_mds_map
.compat
;
526 enable_multiple
= false;
528 // Synthesise a Filesystem from legacy_mds_map, if enabled
529 if (legacy_mds_map
.enabled
) {
530 // Construct a Filesystem from the legacy MDSMap
531 auto migrate_fs
= std::make_shared
<Filesystem
>();
532 migrate_fs
->fscid
= FS_CLUSTER_ID_ANONYMOUS
;
533 migrate_fs
->mds_map
= legacy_mds_map
;
534 migrate_fs
->mds_map
.epoch
= epoch
;
535 filesystems
[migrate_fs
->fscid
] = migrate_fs
;
537 // List of GIDs that had invalid states
538 std::set
<mds_gid_t
> drop_gids
;
540 // Construct mds_roles, standby_daemons, and remove
541 // standbys from the MDSMap in the Filesystem.
542 for (auto &p
: migrate_fs
->mds_map
.mds_info
) {
543 if (p
.second
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
544 // In legacy MDSMap, standby replay daemons don't have
545 // rank set, but since FSMap they do.
546 p
.second
.rank
= p
.second
.standby_for_rank
;
548 if (p
.second
.rank
== MDS_RANK_NONE
) {
549 if (p
.second
.state
!= MDSMap::STATE_STANDBY
) {
550 // Old MDSMaps can have down:dne here, which
551 // is invalid in an FSMap (#17837)
552 drop_gids
.insert(p
.first
);
554 insert(p
.second
); // into standby_daemons
557 mds_roles
[p
.first
] = migrate_fs
->fscid
;
560 for (const auto &p
: standby_daemons
) {
561 // Erase from this Filesystem's MDSMap, because it has
562 // been copied into FSMap::Standby_daemons above
563 migrate_fs
->mds_map
.mds_info
.erase(p
.first
);
565 for (const auto &gid
: drop_gids
) {
566 // Throw away all info for this MDS because it was identified
567 // as having invalid state above.
568 migrate_fs
->mds_map
.mds_info
.erase(gid
);
571 legacy_client_fscid
= migrate_fs
->fscid
;
573 legacy_client_fscid
= FS_CLUSTER_ID_NONE
;
577 ::decode(next_filesystem_id
, p
);
578 ::decode(legacy_client_fscid
, p
);
580 ::decode(enable_multiple
, p
);
581 std::vector
<Filesystem
> fs_list
;
582 ::decode(fs_list
, p
);
584 for (std::vector
<Filesystem
>::const_iterator fs
= fs_list
.begin(); fs
!= fs_list
.end(); ++fs
) {
585 filesystems
[fs
->fscid
] = std::make_shared
<Filesystem
>(*fs
);
588 ::decode(mds_roles
, p
);
589 ::decode(standby_daemons
, p
);
590 ::decode(standby_epochs
, p
);
592 ::decode(ever_enabled_multiple
, p
);
600 void Filesystem::encode(bufferlist
& bl
, uint64_t features
) const
602 ENCODE_START(1, 1, bl
);
604 bufferlist mdsmap_bl
;
605 mds_map
.encode(mdsmap_bl
, features
);
606 ::encode(mdsmap_bl
, bl
);
610 void Filesystem::decode(bufferlist::iterator
& p
)
614 bufferlist mdsmap_bl
;
615 ::decode(mdsmap_bl
, p
);
616 bufferlist::iterator mdsmap_bl_iter
= mdsmap_bl
.begin();
617 mds_map
.decode(mdsmap_bl_iter
);
621 int FSMap::parse_filesystem(
622 std::string
const &ns_str
,
623 std::shared_ptr
<const Filesystem
> *result
627 fs_cluster_id_t fscid
= strict_strtol(ns_str
.c_str(), 10, &ns_err
);
628 if (!ns_err
.empty() || filesystems
.count(fscid
) == 0) {
629 for (auto &fs
: filesystems
) {
630 if (fs
.second
->mds_map
.fs_name
== ns_str
) {
631 *result
= std::const_pointer_cast
<const Filesystem
>(fs
.second
);
637 *result
= get_filesystem(fscid
);
642 void Filesystem::print(std::ostream
&out
) const
644 out
<< "Filesystem '" << mds_map
.fs_name
645 << "' (" << fscid
<< ")" << std::endl
;
649 mds_gid_t
FSMap::find_standby_for(mds_role_t role
, const std::string
& name
) const
651 mds_gid_t result
= MDS_GID_NONE
;
653 // First see if we have a STANDBY_REPLAY
654 auto fs
= get_filesystem(role
.fscid
);
655 for (const auto &i
: fs
->mds_map
.mds_info
) {
656 const auto &info
= i
.second
;
657 if (info
.rank
== role
.rank
&& info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
658 return info
.global_id
;
662 // See if there are any STANDBY daemons available
663 for (const auto &i
: standby_daemons
) {
664 const auto &gid
= i
.first
;
665 const auto &info
= i
.second
;
666 assert(info
.state
== MDSMap::STATE_STANDBY
);
667 assert(info
.rank
== MDS_RANK_NONE
);
673 // The mds_info_t may or may not tell us exactly which filesystem
674 // the standby_for_rank refers to: lookup via legacy_client_fscid
675 mds_role_t target_role
= {
676 info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
?
677 legacy_client_fscid
: info
.standby_for_fscid
,
678 info
.standby_for_rank
};
680 if ((target_role
.rank
== role
.rank
&& target_role
.fscid
== role
.fscid
)
681 || (name
.length() && info
.standby_for_name
== name
)) {
682 // It's a named standby for *me*, use it.
685 info
.standby_for_rank
< 0 && info
.standby_for_name
.length() == 0 &&
686 (info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
||
687 info
.standby_for_fscid
== role
.fscid
)) {
688 // It's not a named standby for anyone, use it if we don't find
689 // a named standby for me later, unless it targets another FSCID.
697 mds_gid_t
FSMap::find_unused_for(mds_role_t role
,
698 bool force_standby_active
) const {
699 for (const auto &i
: standby_daemons
) {
700 const auto &gid
= i
.first
;
701 const auto &info
= i
.second
;
702 assert(info
.state
== MDSMap::STATE_STANDBY
);
704 if (info
.laggy() || info
.rank
>= 0)
707 if (info
.standby_for_fscid
!= FS_CLUSTER_ID_NONE
&&
708 info
.standby_for_fscid
!= role
.fscid
)
710 if (info
.standby_for_rank
!= MDS_RANK_NONE
&&
711 info
.standby_for_rank
!= role
.rank
)
714 // To be considered 'unused' a daemon must either not
715 // be selected for standby-replay or the force_standby_active
716 // setting must be enabled to use replay daemons anyway.
717 if (!info
.standby_replay
|| force_standby_active
) {
724 mds_gid_t
FSMap::find_replacement_for(mds_role_t role
, const std::string
& name
,
725 bool force_standby_active
) const {
726 const mds_gid_t standby
= find_standby_for(role
, name
);
730 return find_unused_for(role
, force_standby_active
);
733 void FSMap::sanity() const
735 if (legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
736 assert(filesystems
.count(legacy_client_fscid
) == 1);
739 for (const auto &i
: filesystems
) {
741 assert(fs
->mds_map
.compat
.compare(compat
) == 0);
742 assert(fs
->fscid
== i
.first
);
743 for (const auto &j
: fs
->mds_map
.mds_info
) {
744 assert(j
.second
.rank
!= MDS_RANK_NONE
);
745 assert(mds_roles
.count(j
.first
) == 1);
746 assert(standby_daemons
.count(j
.first
) == 0);
747 assert(standby_epochs
.count(j
.first
) == 0);
748 assert(mds_roles
.at(j
.first
) == i
.first
);
749 if (j
.second
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
750 assert(fs
->mds_map
.up
.at(j
.second
.rank
) == j
.first
);
751 assert(fs
->mds_map
.failed
.count(j
.second
.rank
) == 0);
752 assert(fs
->mds_map
.damaged
.count(j
.second
.rank
) == 0);
756 for (const auto &j
: fs
->mds_map
.up
) {
757 mds_rank_t rank
= j
.first
;
758 assert(fs
->mds_map
.in
.count(rank
) == 1);
759 mds_gid_t gid
= j
.second
;
760 assert(fs
->mds_map
.mds_info
.count(gid
) == 1);
764 for (const auto &i
: standby_daemons
) {
765 assert(i
.second
.state
== MDSMap::STATE_STANDBY
);
766 assert(i
.second
.rank
== MDS_RANK_NONE
);
767 assert(i
.second
.global_id
== i
.first
);
768 assert(standby_epochs
.count(i
.first
) == 1);
769 assert(mds_roles
.count(i
.first
) == 1);
770 assert(mds_roles
.at(i
.first
) == FS_CLUSTER_ID_NONE
);
773 for (const auto &i
: standby_epochs
) {
774 assert(standby_daemons
.count(i
.first
) == 1);
777 for (const auto &i
: mds_roles
) {
778 if (i
.second
== FS_CLUSTER_ID_NONE
) {
779 assert(standby_daemons
.count(i
.first
) == 1);
781 assert(filesystems
.count(i
.second
) == 1);
782 assert(filesystems
.at(i
.second
)->mds_map
.mds_info
.count(i
.first
) == 1);
788 mds_gid_t standby_gid
,
789 const std::shared_ptr
<Filesystem
> &filesystem
,
790 mds_rank_t assigned_rank
)
792 assert(gid_exists(standby_gid
));
793 bool is_standby_replay
= mds_roles
.at(standby_gid
) != FS_CLUSTER_ID_NONE
;
794 if (!is_standby_replay
) {
795 assert(standby_daemons
.count(standby_gid
));
796 assert(standby_daemons
.at(standby_gid
).state
== MDSMap::STATE_STANDBY
);
799 MDSMap
&mds_map
= filesystem
->mds_map
;
801 // Insert daemon state to Filesystem
802 if (!is_standby_replay
) {
803 mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
805 assert(mds_map
.mds_info
.count(standby_gid
));
806 assert(mds_map
.mds_info
.at(standby_gid
).state
== MDSMap::STATE_STANDBY_REPLAY
);
807 assert(mds_map
.mds_info
.at(standby_gid
).rank
== assigned_rank
);
809 MDSMap::mds_info_t
&info
= mds_map
.mds_info
[standby_gid
];
811 if (mds_map
.stopped
.erase(assigned_rank
)) {
812 // The cluster is being expanded with a stopped rank
813 info
.state
= MDSMap::STATE_STARTING
;
814 } else if (!mds_map
.is_in(assigned_rank
)) {
815 // The cluster is being expanded with a new rank
816 info
.state
= MDSMap::STATE_CREATING
;
818 // An existing rank is being assigned to a replacement
819 info
.state
= MDSMap::STATE_REPLAY
;
820 mds_map
.failed
.erase(assigned_rank
);
822 info
.rank
= assigned_rank
;
824 mds_roles
[standby_gid
] = filesystem
->fscid
;
826 // Update the rank state in Filesystem
827 mds_map
.in
.insert(assigned_rank
);
828 mds_map
.up
[assigned_rank
] = standby_gid
;
830 // Remove from the list of standbys
831 if (!is_standby_replay
) {
832 standby_daemons
.erase(standby_gid
);
833 standby_epochs
.erase(standby_gid
);
836 // Indicate that Filesystem has been modified
837 mds_map
.epoch
= epoch
;
840 void FSMap::assign_standby_replay(
841 const mds_gid_t standby_gid
,
842 const fs_cluster_id_t leader_ns
,
843 const mds_rank_t leader_rank
)
845 assert(mds_roles
.at(standby_gid
) == FS_CLUSTER_ID_NONE
);
846 assert(gid_exists(standby_gid
));
847 assert(!gid_has_rank(standby_gid
));
848 assert(standby_daemons
.count(standby_gid
));
850 // Insert to the filesystem
851 auto fs
= filesystems
.at(leader_ns
);
852 fs
->mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
853 fs
->mds_map
.mds_info
[standby_gid
].rank
= leader_rank
;
854 fs
->mds_map
.mds_info
[standby_gid
].state
= MDSMap::STATE_STANDBY_REPLAY
;
855 mds_roles
[standby_gid
] = leader_ns
;
857 // Remove from the list of standbys
858 standby_daemons
.erase(standby_gid
);
859 standby_epochs
.erase(standby_gid
);
861 // Indicate that Filesystem has been modified
862 fs
->mds_map
.epoch
= epoch
;
865 void FSMap::erase(mds_gid_t who
, epoch_t blacklist_epoch
)
867 if (mds_roles
.at(who
) == FS_CLUSTER_ID_NONE
) {
868 standby_daemons
.erase(who
);
869 standby_epochs
.erase(who
);
871 auto &fs
= filesystems
.at(mds_roles
.at(who
));
872 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
873 if (info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
874 if (info
.state
== MDSMap::STATE_CREATING
) {
875 // If this gid didn't make it past CREATING, then forget
876 // the rank ever existed so that next time it's handed out
877 // to a gid it'll go back into CREATING.
878 fs
->mds_map
.in
.erase(info
.rank
);
880 // Put this rank into the failed list so that the next available
881 // STANDBY will pick it up.
882 fs
->mds_map
.failed
.insert(info
.rank
);
884 assert(fs
->mds_map
.up
.at(info
.rank
) == info
.global_id
);
885 fs
->mds_map
.up
.erase(info
.rank
);
887 fs
->mds_map
.mds_info
.erase(who
);
888 fs
->mds_map
.last_failure_osd_epoch
= blacklist_epoch
;
889 fs
->mds_map
.epoch
= epoch
;
892 mds_roles
.erase(who
);
895 void FSMap::damaged(mds_gid_t who
, epoch_t blacklist_epoch
)
897 assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
898 auto fs
= filesystems
.at(mds_roles
.at(who
));
899 mds_rank_t rank
= fs
->mds_map
.mds_info
[who
].rank
;
901 erase(who
, blacklist_epoch
);
902 fs
->mds_map
.failed
.erase(rank
);
903 fs
->mds_map
.damaged
.insert(rank
);
905 assert(fs
->mds_map
.epoch
== epoch
);
909 * Update to indicate that the rank `rank` is to be removed
910 * from the damaged list of the filesystem `fscid`
912 bool FSMap::undamaged(const fs_cluster_id_t fscid
, const mds_rank_t rank
)
914 auto fs
= filesystems
.at(fscid
);
916 if (fs
->mds_map
.damaged
.erase(rank
)) {
917 fs
->mds_map
.failed
.insert(rank
);
918 fs
->mds_map
.epoch
= epoch
;
925 void FSMap::insert(const MDSMap::mds_info_t
&new_info
)
927 assert(new_info
.state
== MDSMap::STATE_STANDBY
);
928 assert(new_info
.rank
== MDS_RANK_NONE
);
929 mds_roles
[new_info
.global_id
] = FS_CLUSTER_ID_NONE
;
930 standby_daemons
[new_info
.global_id
] = new_info
;
931 standby_epochs
[new_info
.global_id
] = epoch
;
934 std::list
<mds_gid_t
> FSMap::stop(mds_gid_t who
)
936 assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
937 auto fs
= filesystems
.at(mds_roles
.at(who
));
938 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
939 fs
->mds_map
.up
.erase(info
.rank
);
940 fs
->mds_map
.in
.erase(info
.rank
);
941 fs
->mds_map
.stopped
.insert(info
.rank
);
943 // Also drop any standby replays that were following this rank
944 std::list
<mds_gid_t
> standbys
;
945 for (const auto &i
: fs
->mds_map
.mds_info
) {
946 const auto &other_gid
= i
.first
;
947 const auto &other_info
= i
.second
;
948 if (other_info
.rank
== info
.rank
949 && other_info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
950 standbys
.push_back(other_gid
);
955 fs
->mds_map
.mds_info
.erase(who
);
956 mds_roles
.erase(who
);
958 fs
->mds_map
.epoch
= epoch
;
965 * Given one of the following forms:
970 * Parse into a mds_role_t. The rank-only form is only valid
971 * if legacy_client_ns is set.
973 int FSMap::parse_role(
974 const std::string
&role_str
,
976 std::ostream
&ss
) const
978 size_t colon_pos
= role_str
.find(":");
980 std::shared_ptr
<const Filesystem
> fs
;
981 if (colon_pos
== std::string::npos
) {
982 if (legacy_client_fscid
== FS_CLUSTER_ID_NONE
) {
983 ss
<< "No filesystem selected";
986 fs
= get_filesystem(legacy_client_fscid
);
989 if (parse_filesystem(role_str
.substr(0, colon_pos
), &fs
) < 0) {
990 ss
<< "Invalid filesystem";
993 rank_pos
= colon_pos
+1;
998 std::string rank_str
= role_str
.substr(rank_pos
);
999 long rank_i
= strict_strtol(rank_str
.c_str(), 10, &err
);
1000 if (rank_i
< 0 || !err
.empty()) {
1001 ss
<< "Invalid rank '" << rank_str
<< "'";
1007 if (fs
->mds_map
.in
.count(rank
) == 0) {
1008 ss
<< "Rank '" << rank
<< "' not found";
1012 *role
= {fs
->fscid
, rank
};