1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
19 using std::stringstream
;
21 #include "mon/health_check.h"
// Filesystem::dump: writes this filesystem to the Formatter `f`.
// The visible statements open an object section named "mdsmap" and emit
// `fscid` as the integer field "id".
// NOTE(review): this listing is a lossy extract -- the embedded source line
// numbers skip (24, 26, 29), so braces and any statements in between are not
// visible here.
24 void Filesystem::dump(Formatter
*f
) const
26 f
->open_object_section("mdsmap");
29 f
->dump_int("id", fscid
);
// FSMap::dump: writes the whole map to the Formatter `f`.
// Visible output, in order: "epoch"; an object section "compat"; an object
// section "feature_flags" holding enable_multiple and ever_enabled_multiple;
// an array "standbys" with one "info" object per entry of standby_daemons;
// an array "filesystems" with one "filesystem" object per entry of
// filesystems.
// NOTE(review): lossy extract -- the statements that fill and close each
// section are not visible here (embedded line numbers skip).
32 void FSMap::dump(Formatter
*f
) const
34 f
->dump_int("epoch", epoch
);
36 f
->open_object_section("compat");
40 f
->open_object_section("feature_flags");
41 f
->dump_bool("enable_multiple", enable_multiple
);
42 f
->dump_bool("ever_enabled_multiple", ever_enabled_multiple
);
45 f
->open_array_section("standbys");
46 for (const auto &i
: standby_daemons
) {
47 f
->open_object_section("info");
// The epoch reported for a standby is looked up in standby_epochs by the
// same key that indexes standby_daemons.
49 f
->dump_int("epoch", standby_epochs
.at(i
.first
));
54 f
->open_array_section("filesystems");
55 for (const auto &fs
: filesystems
) {
56 f
->open_object_section("filesystem");
// FSMap::generate_test_instances: builds test FSMap objects for `ls`.
// Visible steps: allocate an FSMap with `new`, ask MDSMap for its own test
// instances, create one Filesystem (make_shared) per MDSMap instance and
// store it in m->filesystems keyed by its fscid, then clear the MDSMap list.
// NOTE(review): lossy extract -- how each Filesystem is populated, what
// happens to the raw MDSMap pointers, and where `m` is appended to `ls` are
// not visible here.
63 void FSMap::generate_test_instances(list
<FSMap
*>& ls
)
65 FSMap
*m
= new FSMap();
67 std::list
<MDSMap
*> mds_map_instances
;
68 MDSMap::generate_test_instances(mds_map_instances
);
71 for (auto i
: mds_map_instances
) {
72 auto fs
= std::make_shared
<Filesystem
>();
76 m
->filesystems
[fs
->fscid
] = fs
;
78 mds_map_instances
.clear();
// FSMap::print: human-readable dump of the map to `out`.
// Visible output: the epoch prefixed with "e", the two multi-filesystem
// flags, compat, the legacy client fscid, then either the text
// "No filesystems configured" or each filesystem's own print(), and finally
// (when standby_daemons is non-empty) a "Standby daemons:" heading followed
// by print_summary() of each standby.
// NOTE(review): lossy extract -- closing braces and branch keywords between
// the visible statements are missing, so exact nesting cannot be confirmed.
// NOTE(review): every line ends with std::endl, which also flushes `out`.
83 void FSMap::print(ostream
& out
) const
85 out
<< "e" << epoch
<< std::endl
;
86 out
<< "enable_multiple, ever_enabled_multiple: " << enable_multiple
<< ","
87 << ever_enabled_multiple
<< std::endl
;
88 out
<< "compat: " << compat
<< std::endl
;
89 out
<< "legacy client fscid: " << legacy_client_fscid
<< std::endl
;
90 out
<< " " << std::endl
;
92 if (filesystems
.empty()) {
93 out
<< "No filesystems configured" << std::endl
;
97 for (const auto &fs
: filesystems
) {
98 fs
.second
->print(out
);
99 out
<< " " << std::endl
<< " " << std::endl
; // Space out a bit
102 if (!standby_daemons
.empty()) {
103 out
<< "Standby daemons:" << std::endl
<< " " << std::endl
;
106 for (const auto &p
: standby_daemons
) {
107 p
.second
.print_summary(out
);
// FSMap::print_summary: one-line / structured summary of MDS cluster state.
// Output goes to the Formatter `f` and to the stream `*out`; the conditions
// choosing between them are not visible in this extract.
// Visible content:
//  - per filesystem: id, number of "up" ranks, number of "in" ranks, max_mds
//    (structured), or "<fs_name>-<up>/<in>/<max_mds> up " (text);
//  - per daemon holding a rank and not in STANDBY_REPLAY: filesystem_id,
//    rank, name, status, and an entry in by_rank of the form "name=state";
//  - counts per state taken from by_state, iterated in reverse key order;
//  - totals of failed and damaged ranks summed over all filesystems.
// NOTE(review): lossy extract -- declarations of `fs`, `failed`, `damaged`,
// the code that fills by_state, and all closing braces / else keywords are
// missing (embedded line numbers skip).
114 void FSMap::print_summary(Formatter
*f
, ostream
*out
) const
// Keyed by (fscid, rank); value is "name=state".
116 map
<mds_role_t
,string
> by_rank
;
117 map
<string
,int> by_state
;
120 f
->dump_unsigned("epoch", get_epoch());
// NOTE(review): `auto i` copies each map element per iteration.
121 for (auto i
: filesystems
) {
123 f
->dump_unsigned("id", fs
->fscid
);
124 f
->dump_unsigned("up", fs
->mds_map
.up
.size());
125 f
->dump_unsigned("in", fs
->mds_map
.in
.size());
126 f
->dump_unsigned("max", fs
->mds_map
.max_mds
);
129 for (auto i
: filesystems
) {
131 *out
<< fs
->mds_map
.fs_name
<< "-" << fs
->mds_map
.up
.size() << "/"
132 << fs
->mds_map
.in
.size() << "/" << fs
->mds_map
.max_mds
<< " up ";
137 f
->open_array_section("by_rank");
140 const auto all_info
= get_mds_info();
141 for (const auto &p
: all_info
) {
142 const auto &info
= p
.second
;
143 string s
= ceph_mds_state_name(info
.state
);
// The condition guarding this suffix is not visible in the extract.
145 s
+= "(laggy or crashed)";
148 const fs_cluster_id_t fscid
= mds_roles
.at(info
.global_id
);
// Only daemons that hold a rank and are not standby-replay followers are
// reported per rank.
150 if (info
.rank
!= MDS_RANK_NONE
&&
151 info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
153 f
->open_object_section("mds");
154 f
->dump_unsigned("filesystem_id", fscid
);
155 f
->dump_unsigned("rank", info
.rank
);
156 f
->dump_string("name", info
.name
);
157 f
->dump_string("status", s
);
160 by_rank
[mds_role_t(fscid
, info
.rank
)] = info
.name
+ "=" + s
;
170 if (!by_rank
.empty()) {
171 if (filesystems
.size() > 1) {
172 // Disambiguate filesystems
173 std::map
<std::string
, std::string
> pretty
;
174 for (auto i
: by_rank
) {
175 const auto &fs_name
= filesystems
.at(i
.first
.fscid
)->mds_map
.fs_name
;
// Key has the form "[<fs_name>:<rank>]".
176 std::ostringstream o
;
177 o
<< "[" << fs_name
<< ":" << i
.first
.rank
<< "]";
178 pretty
[o
.str()] = i
.second
;
180 *out
<< " " << pretty
;
182 // Omit FSCID in output when only one filesystem exists
183 std::map
<mds_rank_t
, std::string
> shortened
;
184 for (auto i
: by_rank
) {
185 shortened
[i
.first
.rank
] = i
.second
;
187 *out
<< " " << shortened
;
192 for (map
<string
,int>::reverse_iterator p
= by_state
.rbegin(); p
!= by_state
.rend(); ++p
) {
194 f
->dump_unsigned(p
->first
.c_str(), p
->second
);
196 *out
<< ", " << p
->second
<< " " << p
->first
;
202 for (auto i
: filesystems
) {
204 failed
+= fs
->mds_map
.failed
.size();
205 damaged
+= fs
->mds_map
.damaged
.size();
210 f
->dump_unsigned("failed", failed
);
212 *out
<< ", " << failed
<< " failed";
218 f
->dump_unsigned("damaged", damaged
);
220 *out
<< ", " << damaged
<< " damaged";
223 //if (stopped.size())
224 //out << ", " << stopped.size() << " stopped";
// FSMap::create_filesystem: creates a new Filesystem named `name` that uses
// `metadata_pool` and `data_pool`, and registers it in `filesystems`.
// The new MDSMap starts with max_mds = 1, cas_pool = -1, enabled = true,
// size/session limits read from g_conf, the FSMap-wide `compat`, and
// created/modified set from ceph_clock_now().
// The fscid is taken from next_filesystem_id (post-incremented) when
// `features` contains CEPH_FEATURE_SERVER_JEWEL; the other visible path
// requires `filesystems` to be empty and uses FS_CLUSTER_ID_ANONYMOUS.
// If the map holds exactly one filesystem afterwards, that filesystem becomes
// legacy_client_fscid.
// NOTE(review): lossy extract -- the declaration of `features` (a parameter
// line), the else keyword between the two fscid paths, and all braces are
// missing (embedded line numbers skip).
228 void FSMap::create_filesystem(boost::string_view name
,
229 int64_t metadata_pool
, int64_t data_pool
,
232 auto fs
= std::make_shared
<Filesystem
>();
233 fs
->mds_map
.fs_name
= std::string(name
);
234 fs
->mds_map
.max_mds
= 1;
235 fs
->mds_map
.data_pools
.push_back(data_pool
);
236 fs
->mds_map
.metadata_pool
= metadata_pool
;
237 fs
->mds_map
.cas_pool
= -1;
238 fs
->mds_map
.max_file_size
= g_conf
->mds_max_file_size
;
239 fs
->mds_map
.compat
= compat
;
// NOTE(review): ceph_clock_now() is called twice, so `created` and `modified`
// are not guaranteed to be the same instant.
240 fs
->mds_map
.created
= ceph_clock_now();
241 fs
->mds_map
.modified
= ceph_clock_now();
242 fs
->mds_map
.session_timeout
= g_conf
->mds_session_timeout
;
243 fs
->mds_map
.session_autoclose
= g_conf
->mds_session_autoclose
;
244 fs
->mds_map
.enabled
= true;
245 if (features
& CEPH_FEATURE_SERVER_JEWEL
) {
246 fs
->fscid
= next_filesystem_id
++;
247 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
248 // have initialized next_filesystem_id such that it's never used here.
249 assert(fs
->fscid
!= FS_CLUSTER_ID_ANONYMOUS
);
251 // Use anon fscid because this will get thrown away when encoding
252 // as legacy MDSMap for legacy mons.
253 assert(filesystems
.empty());
254 fs
->fscid
= FS_CLUSTER_ID_ANONYMOUS
;
256 filesystems
[fs
->fscid
] = fs
;
258 // Created first filesystem? Set it as the one
259 // for legacy clients to use
260 if (filesystems
.size() == 1) {
261 legacy_client_fscid
= fs
->fscid
;
265 void FSMap::reset_filesystem(fs_cluster_id_t fscid
)
267 auto fs
= get_filesystem(fscid
);
268 auto new_fs
= std::make_shared
<Filesystem
>();
270 // Populate rank 0 as existing (so don't go into CREATING)
271 // but failed (so that next available MDS is assigned the rank)
272 new_fs
->mds_map
.in
.insert(mds_rank_t(0));
273 new_fs
->mds_map
.failed
.insert(mds_rank_t(0));
275 // Carry forward what makes sense
276 new_fs
->fscid
= fs
->fscid
;
277 new_fs
->mds_map
.inline_data_enabled
= fs
->mds_map
.inline_data_enabled
;
278 new_fs
->mds_map
.max_mds
= 1;
279 new_fs
->mds_map
.data_pools
= fs
->mds_map
.data_pools
;
280 new_fs
->mds_map
.metadata_pool
= fs
->mds_map
.metadata_pool
;
281 new_fs
->mds_map
.cas_pool
= fs
->mds_map
.cas_pool
;
282 new_fs
->mds_map
.fs_name
= fs
->mds_map
.fs_name
;
283 new_fs
->mds_map
.max_file_size
= g_conf
->mds_max_file_size
;
284 new_fs
->mds_map
.compat
= compat
;
285 new_fs
->mds_map
.created
= ceph_clock_now();
286 new_fs
->mds_map
.modified
= ceph_clock_now();
287 new_fs
->mds_map
.session_timeout
= g_conf
->mds_session_timeout
;
288 new_fs
->mds_map
.session_autoclose
= g_conf
->mds_session_autoclose
;
289 new_fs
->mds_map
.standby_count_wanted
= fs
->mds_map
.standby_count_wanted
;
290 new_fs
->mds_map
.enabled
= true;
292 // Remember mds ranks that have ever started. (They should load old inotable
293 // instead of creating new one if they start again.)
294 new_fs
->mds_map
.stopped
.insert(fs
->mds_map
.in
.begin(), fs
->mds_map
.in
.end());
295 new_fs
->mds_map
.stopped
.insert(fs
->mds_map
.stopped
.begin(), fs
->mds_map
.stopped
.end());
296 new_fs
->mds_map
.stopped
.erase(mds_rank_t(0));
298 // Persist the new FSMap
299 filesystems
[new_fs
->fscid
] = new_fs
;
// FSMap::get_health: collects health messages into `summary` / `detail`.
// For every filesystem it forwards to that filesystem's
// mds_map.get_health(summary, detail) and tracks the largest value returned
// by get_standby_count_wanted(<number of standby daemons>). If that maximum
// is non-zero, one HEALTH_WARN entry about insufficient standbys is appended
// to `summary`.
// NOTE(review): lossy extract -- closing braces are missing (embedded line
// numbers skip).
302 void FSMap::get_health(list
<pair
<health_status_t
,string
> >& summary
,
303 list
<pair
<health_status_t
,string
> > *detail
) const
305 mds_rank_t standby_count_wanted
= 0;
306 for (const auto &i
: filesystems
) {
307 const auto &fs
= i
.second
;
309 // TODO: move get_health up into here so that we can qualify
310 // all the messages with what filesystem they're talking about
311 fs
->mds_map
.get_health(summary
, detail
);
313 standby_count_wanted
= std::max(standby_count_wanted
, fs
->mds_map
.get_standby_count_wanted((mds_rank_t
)standby_daemons
.size()));
316 if (standby_count_wanted
) {
317 std::ostringstream oss
;
318 oss
<< "insufficient standby daemons available: have " << standby_daemons
.size() << "; want " << standby_count_wanted
<< " more";
319 summary
.push_back(make_pair(HEALTH_WARN
, oss
.str()));
// FSMap::check_health: runs mds_map.check_health(<number of standby daemons>)
// on every filesystem and ORs the results into `changed`.
// NOTE(review): lossy extract -- the return statement is not visible;
// presumably `changed` is returned, confirm against the full file.
323 bool FSMap::check_health(void)
325 bool changed
= false;
326 for (auto &i
: filesystems
) {
327 changed
|= i
.second
->mds_map
.check_health((mds_rank_t
)standby_daemons
.size());
// FSMap::get_health_checks: fills `checks` with structured health checks.
// Per filesystem:
//  - gathers the MDSMap's own checks into a local map and merges them into
//    `checks`;
//  - builds the set of "stuck" failed ranks, i.e. failed ranks for which
//    find_replacement_for() returns MDS_GID_NONE, and if that set is
//    non-empty adds a detail line to the FS_WITH_FAILED_MDS check;
//  - tracks the largest get_standby_count_wanted() value.
// After the loop, a non-zero standby_count_wanted yields an
// MDS_INSUFFICIENT_STANDBY check with one detail line.
// NOTE(review): lossy extract -- the declaration of `ss`, the tail of the
// `dss << ...` statement and most closing braces are missing.
332 void FSMap::get_health_checks(health_check_map_t
*checks
) const
334 mds_rank_t standby_count_wanted
= 0;
335 for (const auto &i
: filesystems
) {
336 const auto &fs
= i
.second
;
337 health_check_map_t fschecks
;
339 fs
->mds_map
.get_health_checks(&fschecks
);
341 // Some of the failed ranks might be transient (i.e. there are standbys
342 // ready to replace them). We will report only on "stuck" failed, i.e.
343 // ranks which are failed and have no standby replacement available.
344 std::set
<mds_rank_t
> stuck_failed
;
346 for (const auto &rank
: fs
->mds_map
.failed
) {
347 const mds_gid_t replacement
= find_replacement_for(
348 {fs
->fscid
, rank
}, {}, g_conf
->mon_force_standby_active
);
349 if (replacement
== MDS_GID_NONE
) {
350 stuck_failed
.insert(rank
);
354 // FS_WITH_FAILED_MDS
355 if (!stuck_failed
.empty()) {
356 health_check_t
& fscheck
= checks
->get_or_add(
357 "FS_WITH_FAILED_MDS", HEALTH_WARN
,
358 "%num% filesystem%plurals% %hasorhave% a failed mds daemon");
360 ss
<< "fs " << fs
->mds_map
.fs_name
<< " has " << stuck_failed
.size()
361 << " failed mds" << (stuck_failed
.size() > 1 ? "s" : "");
362 fscheck
.detail
.push_back(ss
.str()); }
364 checks
->merge(fschecks
);
365 standby_count_wanted
= std::max(
366 standby_count_wanted
,
367 fs
->mds_map
.get_standby_count_wanted((mds_rank_t
)standby_daemons
.size()));
370 // MDS_INSUFFICIENT_STANDBY
371 if (standby_count_wanted
) {
372 std::ostringstream oss
, dss
;
373 oss
<< "insufficient standby MDS daemons available";
374 auto& d
= checks
->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN
, oss
.str());
375 dss
<< "have " << standby_daemons
.size() << "; want " << standby_count_wanted
377 d
.detail
.push_back(dss
.str());
// FSMap::encode: serialises the map into `bl`, in one of two wire formats
// selected by `features`.
//  - With CEPH_FEATURE_SERVER_JEWEL: the FSMap format (ENCODE_START version
//    7, compat 6). Filesystems are copied by value into a vector before being
//    encoded, followed by mds_roles, standby_daemons, standby_epochs and
//    ever_enabled_multiple.
//  - Otherwise: a single legacy MDSMap. With no filesystems, `disabled_map`
//    is encoded with the current epoch; with exactly one (asserted), a copy
//    of its MDSMap is given the FSMap epoch, has all standbys merged into
//    mds_info, has rank cleared on standby-replay daemons, and is encoded.
// NOTE(review): lossy extract -- the declaration of `disabled_map`, the
// else keywords, ENCODE_FINISH and some encode calls are not visible.
381 void FSMap::encode(bufferlist
& bl
, uint64_t features
) const
383 if (features
& CEPH_FEATURE_SERVER_JEWEL
) {
384 ENCODE_START(7, 6, bl
);
386 ::encode(next_filesystem_id
, bl
);
387 ::encode(legacy_client_fscid
, bl
);
388 ::encode(compat
, bl
);
389 ::encode(enable_multiple
, bl
);
390 std::vector
<Filesystem
> fs_list
;
// NOTE(review): `auto i` copies each map element per iteration, in addition
// to the Filesystem copy pushed into fs_list.
391 for (auto i
: filesystems
) {
392 fs_list
.push_back(*(i
.second
));
394 ::encode(fs_list
, bl
, features
);
395 ::encode(mds_roles
, bl
);
396 ::encode(standby_daemons
, bl
, features
);
397 ::encode(standby_epochs
, bl
);
398 ::encode(ever_enabled_multiple
, bl
);
401 if (filesystems
.empty()) {
403 disabled_map
.epoch
= epoch
;
404 disabled_map
.encode(bl
, features
);
406 // MDSMonitor should never have created multiple filesystems
407 // until the quorum features indicated Jewel
408 assert(filesystems
.size() == 1);
409 auto fs
= filesystems
.begin()->second
;
411 // Take the MDSMap for the enabled filesystem, and populate its
412 // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
413 MDSMap full_mdsmap
= fs
->mds_map
;
414 full_mdsmap
.epoch
= epoch
;
415 for (const auto &p
: standby_daemons
) {
416 full_mdsmap
.mds_info
[p
.first
] = p
.second
;
419 // Old MDSMaps don't set rank on standby replay daemons
420 for (auto &i
: full_mdsmap
.mds_info
) {
421 auto &info
= i
.second
;
422 if (info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
423 info
.rank
= MDS_RANK_NONE
;
427 full_mdsmap
.encode(bl
, features
);
// FSMap::decode: reads the map from the iterator `p`. Two on-disk layouts
// are handled (see the comments at the top of the body):
//  - a legacy MDSMap, decoded field by field into `legacy_mds_map` and then
//    converted: the FSMap-level state is reset, and if the legacy map is
//    enabled a single Filesystem with FS_CLUSTER_ID_ANONYMOUS is synthesised
//    from it, standbys are moved into standby_daemons via insert(), and
//    entries with an invalid state are dropped;
//  - the FSMap layout, decoded directly into the members, with filesystems
//    rebuilt as shared_ptr copies of the decoded vector elements.
// NOTE(review): lossy extract -- the struct_v / epoch conditions that select
// between the many alternative assignments below, the declarations of the
// temporaries `m`, `s`, `n`, `flag`, and all braces / else keywords are
// missing (embedded line numbers skip). Consecutive lines here are therefore
// often alternatives, not a sequence.
432 void FSMap::decode(bufferlist::iterator
& p
)
434 // The highest MDSMap encoding version before we changed the
435 // MDSMonitor to store an FSMap instead of an MDSMap was
436 // 5, so anything older than 6 is decoded as an MDSMap,
437 // and anything newer is decoded as an FSMap.
438 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p
);
440 // Because the mon used to store an MDSMap where we now
441 // store an FSMap, FSMap knows how to decode the legacy
442 // MDSMap format (it never needs to encode it though).
443 MDSMap legacy_mds_map
;
445 // Decoding an MDSMap (upgrade)
447 ::decode(legacy_mds_map
.flags
, p
);
448 ::decode(legacy_mds_map
.last_failure
, p
);
449 ::decode(legacy_mds_map
.root
, p
);
450 ::decode(legacy_mds_map
.session_timeout
, p
);
451 ::decode(legacy_mds_map
.session_autoclose
, p
);
452 ::decode(legacy_mds_map
.max_file_size
, p
);
453 ::decode(legacy_mds_map
.max_mds
, p
);
454 ::decode(legacy_mds_map
.mds_info
, p
);
461 legacy_mds_map
.data_pools
.push_back(m
);
465 legacy_mds_map
.cas_pool
= s
;
467 ::decode(legacy_mds_map
.data_pools
, p
);
468 ::decode(legacy_mds_map
.cas_pool
, p
);
471 // kclient ignores everything from here
476 ::decode(legacy_mds_map
.compat
, p
);
478 legacy_mds_map
.compat
= get_mdsmap_compat_set_base();
482 legacy_mds_map
.metadata_pool
= n
;
484 ::decode(legacy_mds_map
.metadata_pool
, p
);
486 ::decode(legacy_mds_map
.created
, p
);
487 ::decode(legacy_mds_map
.modified
, p
);
488 ::decode(legacy_mds_map
.tableserver
, p
);
489 ::decode(legacy_mds_map
.in
, p
);
490 std::map
<mds_rank_t
,int32_t> inc
; // Legacy field, parse and drop
492 ::decode(legacy_mds_map
.up
, p
);
493 ::decode(legacy_mds_map
.failed
, p
);
494 ::decode(legacy_mds_map
.stopped
, p
);
496 ::decode(legacy_mds_map
.last_failure_osd_epoch
, p
);
499 // previously this was a bool about snaps, not a flag map
502 legacy_mds_map
.ever_allowed_features
= flag
?
503 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
505 legacy_mds_map
.explicitly_allowed_features
= flag
?
506 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
507 if (legacy_mds_map
.max_mds
> 1) {
508 legacy_mds_map
.set_multimds_allowed();
511 ::decode(legacy_mds_map
.ever_allowed_features
, p
);
512 ::decode(legacy_mds_map
.explicitly_allowed_features
, p
);
515 legacy_mds_map
.ever_allowed_features
= CEPH_MDSMAP_ALLOW_CLASSICS
;
516 legacy_mds_map
.explicitly_allowed_features
= 0;
517 if (legacy_mds_map
.max_mds
> 1) {
518 legacy_mds_map
.set_multimds_allowed();
522 ::decode(legacy_mds_map
.inline_data_enabled
, p
);
525 assert(struct_v
>= 5);
526 ::decode(legacy_mds_map
.enabled
, p
);
527 ::decode(legacy_mds_map
.fs_name
, p
);
529 legacy_mds_map
.fs_name
= "default";
531 // If an MDS has ever been started, epoch will be greater than 1,
532 // assume filesystem is enabled.
533 legacy_mds_map
.enabled
= true;
535 // Upgrading from a cluster that never used an MDS, switch off
536 // filesystem until it's explicitly enabled.
537 legacy_mds_map
.enabled
= false;
542 ::decode(legacy_mds_map
.damaged
, p
);
545 // We're upgrading, populate filesystems from the legacy fields
547 standby_daemons
.clear();
548 standby_epochs
.clear();
550 compat
= legacy_mds_map
.compat
;
551 enable_multiple
= false;
553 // Synthesise a Filesystem from legacy_mds_map, if enabled
554 if (legacy_mds_map
.enabled
) {
555 // Construct a Filesystem from the legacy MDSMap
556 auto migrate_fs
= std::make_shared
<Filesystem
>();
557 migrate_fs
->fscid
= FS_CLUSTER_ID_ANONYMOUS
;
558 migrate_fs
->mds_map
= legacy_mds_map
;
559 migrate_fs
->mds_map
.epoch
= epoch
;
560 filesystems
[migrate_fs
->fscid
] = migrate_fs
;
562 // List of GIDs that had invalid states
563 std::set
<mds_gid_t
> drop_gids
;
565 // Construct mds_roles, standby_daemons, and remove
566 // standbys from the MDSMap in the Filesystem.
567 for (auto &p
: migrate_fs
->mds_map
.mds_info
) {
568 if (p
.second
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
569 // In legacy MDSMap, standby replay daemons don't have
570 // rank set, but since FSMap they do.
571 p
.second
.rank
= p
.second
.standby_for_rank
;
573 if (p
.second
.rank
== MDS_RANK_NONE
) {
574 if (p
.second
.state
!= MDSMap::STATE_STANDBY
) {
575 // Old MDSMaps can have down:dne here, which
576 // is invalid in an FSMap (#17837)
577 drop_gids
.insert(p
.first
);
579 insert(p
.second
); // into standby_daemons
582 mds_roles
[p
.first
] = migrate_fs
->fscid
;
// Erasing is done in separate passes after the loop above, so mds_info is
// not modified while it is being iterated.
585 for (const auto &p
: standby_daemons
) {
586 // Erase from this Filesystem's MDSMap, because it has
587 // been copied into FSMap::standby_daemons above
588 migrate_fs
->mds_map
.mds_info
.erase(p
.first
);
590 for (const auto &gid
: drop_gids
) {
591 // Throw away all info for this MDS because it was identified
592 // as having invalid state above.
593 migrate_fs
->mds_map
.mds_info
.erase(gid
);
596 legacy_client_fscid
= migrate_fs
->fscid
;
598 legacy_client_fscid
= FS_CLUSTER_ID_NONE
;
602 ::decode(next_filesystem_id
, p
);
603 ::decode(legacy_client_fscid
, p
);
605 ::decode(enable_multiple
, p
);
606 std::vector
<Filesystem
> fs_list
;
607 ::decode(fs_list
, p
);
609 for (std::vector
<Filesystem
>::const_iterator fs
= fs_list
.begin(); fs
!= fs_list
.end(); ++fs
) {
610 filesystems
[fs
->fscid
] = std::make_shared
<Filesystem
>(*fs
);
613 ::decode(mds_roles
, p
);
614 ::decode(standby_daemons
, p
);
615 ::decode(standby_epochs
, p
);
617 ::decode(ever_enabled_multiple
, p
);
624 void FSMap::sanitize(std::function
<bool(int64_t pool
)> pool_exists
)
626 for (auto &fs
: filesystems
) {
627 fs
.second
->mds_map
.sanitize(pool_exists
);
// Filesystem::encode: serialises this filesystem into `bl` (ENCODE_START
// version 1, compat 1). The MDSMap is first encoded into its own bufferlist
// using `features`, and that bufferlist is then encoded into `bl` as a unit.
// NOTE(review): lossy extract -- embedded line 634 and the ENCODE_FINISH
// line are missing, so any field encoded ahead of the MDSMap is not visible.
631 void Filesystem::encode(bufferlist
& bl
, uint64_t features
) const
633 ENCODE_START(1, 1, bl
);
635 bufferlist mdsmap_bl
;
636 mds_map
.encode(mdsmap_bl
, features
);
637 ::encode(mdsmap_bl
, bl
);
// Filesystem::decode: reads this filesystem from the iterator `p`.
// The visible part reads a nested bufferlist from `p` and decodes `mds_map`
// from an iterator positioned at its beginning.
// NOTE(review): lossy extract -- the DECODE_START / DECODE_FINISH lines and
// embedded line 644 are missing, so other decoded fields are not visible.
641 void Filesystem::decode(bufferlist::iterator
& p
)
645 bufferlist mdsmap_bl
;
646 ::decode(mdsmap_bl
, p
);
647 bufferlist::iterator mdsmap_bl_iter
= mdsmap_bl
.begin();
648 mds_map
.decode(mdsmap_bl_iter
);
// FSMap::parse_filesystem: resolves `ns_str` to a filesystem and stores it in
// `*result`.
// The string is first parsed as a base-10 fscid with strict_strtol(). If that
// parse reports an error, or no filesystem has that id, the filesystems are
// searched for one whose fs_name equals the string. Otherwise the filesystem
// with that id is looked up through get_filesystem().
// NOTE(review): lossy extract -- the declaration of `ns_err`, the return
// statements (the function returns int) and the else keyword before the
// final assignment are not visible.
652 int FSMap::parse_filesystem(
653 boost::string_view ns_str
,
654 std::shared_ptr
<const Filesystem
> *result
658 std::string
s(ns_str
);
659 fs_cluster_id_t fscid
= strict_strtol(s
.c_str(), 10, &ns_err
);
660 if (!ns_err
.empty() || filesystems
.count(fscid
) == 0) {
661 for (auto &fs
: filesystems
) {
662 if (fs
.second
->mds_map
.fs_name
== s
) {
663 *result
= std::const_pointer_cast
<const Filesystem
>(fs
.second
);
669 *result
= get_filesystem(fscid
);
// Filesystem::print: writes a heading of the form
// "Filesystem '<fs_name>' (<fscid>)" to `out`.
// NOTE(review): lossy extract -- whatever follows the heading (embedded
// lines 678 onward) is not visible here.
674 void Filesystem::print(std::ostream
&out
) const
676 out
<< "Filesystem '" << mds_map
.fs_name
677 << "' (" << fscid
<< ")" << std::endl
;
// FSMap::find_standby_for: picks a daemon that can take over `role`.
// Returns mds_gid_t; `result` starts as MDS_GID_NONE.
// Search order visible here:
//  1. a daemon in the role's filesystem that is in STANDBY_REPLAY for the
//     same rank is returned immediately;
//  2. otherwise the standby_daemons are scanned. A standby whose target
//     (fscid, rank) equals `role`, or whose standby_for_name equals a
//     non-empty `name`, is a named standby for this role. A standby with no
//     rank or name preference and no conflicting fscid is a generic
//     candidate, used only if no named standby is found later.
// NOTE(review): lossy extract -- the statements executed in the two matching
// branches, any filtering between the asserts and target_role, the final
// return and all closing braces are missing.
681 mds_gid_t
FSMap::find_standby_for(mds_role_t role
, boost::string_view name
) const
683 mds_gid_t result
= MDS_GID_NONE
;
685 // First see if we have a STANDBY_REPLAY
686 auto fs
= get_filesystem(role
.fscid
);
687 for (const auto &i
: fs
->mds_map
.mds_info
) {
688 const auto &info
= i
.second
;
689 if (info
.rank
== role
.rank
&& info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
690 return info
.global_id
;
694 // See if there are any STANDBY daemons available
695 for (const auto &i
: standby_daemons
) {
696 const auto &gid
= i
.first
;
697 const auto &info
= i
.second
;
698 assert(info
.state
== MDSMap::STATE_STANDBY
);
699 assert(info
.rank
== MDS_RANK_NONE
);
705 // The mds_info_t may or may not tell us exactly which filesystem
706 // the standby_for_rank refers to: lookup via legacy_client_fscid
707 mds_role_t target_role
= {
708 info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
?
709 legacy_client_fscid
: info
.standby_for_fscid
,
710 info
.standby_for_rank
};
712 if ((target_role
.rank
== role
.rank
&& target_role
.fscid
== role
.fscid
)
713 || (name
.length() && info
.standby_for_name
== name
)) {
714 // It's a named standby for *me*, use it.
717 info
.standby_for_rank
< 0 && info
.standby_for_name
.length() == 0 &&
718 (info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
||
719 info
.standby_for_fscid
== role
.fscid
)) {
720 // It's not a named standby for anyone, use it if we don't find
721 // a named standby for me later, unless it targets another FSCID.
// FSMap::find_unused_for: scans standby_daemons for one usable for `role`.
// Three conditions are tested on each standby: it is laggy or already has a
// rank (rank >= 0); it is pinned to a different filesystem; it is pinned to
// a different rank. The comment before the last test states that a daemon
// counts as unused only if it is not selected for standby-replay, or if
// `force_standby_active` is set.
// NOTE(review): lossy extract -- the statements after each `if` (presumably
// `continue`), the return statements and closing braces are missing.
729 mds_gid_t
FSMap::find_unused_for(mds_role_t role
,
730 bool force_standby_active
) const {
731 for (const auto &i
: standby_daemons
) {
732 const auto &gid
= i
.first
;
733 const auto &info
= i
.second
;
734 assert(info
.state
== MDSMap::STATE_STANDBY
);
736 if (info
.laggy() || info
.rank
>= 0)
739 if (info
.standby_for_fscid
!= FS_CLUSTER_ID_NONE
&&
740 info
.standby_for_fscid
!= role
.fscid
)
742 if (info
.standby_for_rank
!= MDS_RANK_NONE
&&
743 info
.standby_for_rank
!= role
.rank
)
746 // To be considered 'unused' a daemon must either not
747 // be selected for standby-replay or the force_standby_active
748 // setting must be enabled to use replay daemons anyway.
749 if (!info
.standby_replay
|| force_standby_active
) {
// FSMap::find_replacement_for: finds a daemon to take over `role`.
// It first asks find_standby_for(role, name); the visible fallback is
// find_unused_for(role, force_standby_active).
// NOTE(review): lossy extract -- the test on `standby` that decides between
// the two results (embedded lines 759-761) is not visible.
756 mds_gid_t
FSMap::find_replacement_for(mds_role_t role
, boost::string_view name
,
757 bool force_standby_active
) const {
758 const mds_gid_t standby
= find_standby_for(role
, name
);
762 return find_unused_for(role
, force_standby_active
);
// FSMap::sanity: asserts the internal consistency of the map. It only
// asserts; no member is modified in the visible code.
// Invariants checked:
//  - legacy_client_fscid, when set, names an existing filesystem;
//  - each filesystem's compat equals the FSMap compat, and its fscid equals
//    its key in `filesystems`;
//  - every daemon in a filesystem's mds_info has a rank, is in mds_roles
//    under that filesystem, and is absent from standby_daemons and
//    standby_epochs; unless it is a standby-replay follower, it is the
//    `up` entry of its rank and that rank is neither failed nor damaged;
//  - every `up` rank is `in` and its gid is present in mds_info;
//  - every standby daemon has STATE_STANDBY, no rank, a global_id equal to
//    its key, an entry in standby_epochs and the role FS_CLUSTER_ID_NONE;
//  - standby_epochs has no entry without a matching standby daemon;
//  - every mds_roles entry points at a standby daemon or at an existing
//    filesystem that contains the gid.
// NOTE(review): lossy extract -- the declaration of `fs` inside the first
// loop, the else keyword in the last loop and all closing braces are missing.
765 void FSMap::sanity() const
767 if (legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
768 assert(filesystems
.count(legacy_client_fscid
) == 1);
771 for (const auto &i
: filesystems
) {
773 assert(fs
->mds_map
.compat
.compare(compat
) == 0);
774 assert(fs
->fscid
== i
.first
);
775 for (const auto &j
: fs
->mds_map
.mds_info
) {
776 assert(j
.second
.rank
!= MDS_RANK_NONE
);
777 assert(mds_roles
.count(j
.first
) == 1);
778 assert(standby_daemons
.count(j
.first
) == 0);
779 assert(standby_epochs
.count(j
.first
) == 0);
780 assert(mds_roles
.at(j
.first
) == i
.first
);
781 if (j
.second
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
782 assert(fs
->mds_map
.up
.at(j
.second
.rank
) == j
.first
);
783 assert(fs
->mds_map
.failed
.count(j
.second
.rank
) == 0);
784 assert(fs
->mds_map
.damaged
.count(j
.second
.rank
) == 0);
788 for (const auto &j
: fs
->mds_map
.up
) {
789 mds_rank_t rank
= j
.first
;
790 assert(fs
->mds_map
.in
.count(rank
) == 1);
791 mds_gid_t gid
= j
.second
;
792 assert(fs
->mds_map
.mds_info
.count(gid
) == 1);
796 for (const auto &i
: standby_daemons
) {
797 assert(i
.second
.state
== MDSMap::STATE_STANDBY
);
798 assert(i
.second
.rank
== MDS_RANK_NONE
);
799 assert(i
.second
.global_id
== i
.first
);
800 assert(standby_epochs
.count(i
.first
) == 1);
801 assert(mds_roles
.count(i
.first
) == 1);
802 assert(mds_roles
.at(i
.first
) == FS_CLUSTER_ID_NONE
);
805 for (const auto &i
: standby_epochs
) {
806 assert(standby_daemons
.count(i
.first
) == 1);
809 for (const auto &i
: mds_roles
) {
810 if (i
.second
== FS_CLUSTER_ID_NONE
) {
811 assert(standby_daemons
.count(i
.first
) == 1);
813 assert(filesystems
.count(i
.second
) == 1);
814 assert(filesystems
.at(i
.second
)->mds_map
.mds_info
.count(i
.first
) == 1);
// Assigns the daemon `standby_gid` to rank `assigned_rank` of `filesystem`.
// NOTE(review): lossy extract -- the first line of the signature (return
// type and function name, embedded line 819) is missing; presumably this is
// FSMap::promote, confirm against the full file. Else keywords and closing
// braces are missing as well.
//
// The daemon is either a plain standby (role FS_CLUSTER_ID_NONE, held in
// standby_daemons) or a standby-replay follower already stored in the
// filesystem's mds_info for this rank. Visible effects:
//  - a plain standby's info is copied into the filesystem's mds_info and it
//    is removed from standby_daemons / standby_epochs;
//  - the daemon's state becomes STARTING if the rank was in `stopped`,
//    CREATING if the rank is not yet `in`, otherwise REPLAY (and the rank is
//    removed from `failed`);
//  - the rank is added to `in`, `up[rank]` is set to the gid, mds_roles is
//    pointed at the filesystem, and the MDSMap epoch is set to the FSMap
//    epoch.
820 mds_gid_t standby_gid
,
821 const std::shared_ptr
<Filesystem
> &filesystem
,
822 mds_rank_t assigned_rank
)
824 assert(gid_exists(standby_gid
));
// A daemon that already has a filesystem role is treated as a standby-replay
// follower being promoted.
825 bool is_standby_replay
= mds_roles
.at(standby_gid
) != FS_CLUSTER_ID_NONE
;
826 if (!is_standby_replay
) {
827 assert(standby_daemons
.count(standby_gid
));
828 assert(standby_daemons
.at(standby_gid
).state
== MDSMap::STATE_STANDBY
);
831 MDSMap
&mds_map
= filesystem
->mds_map
;
833 // Insert daemon state to Filesystem
834 if (!is_standby_replay
) {
835 mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
837 assert(mds_map
.mds_info
.count(standby_gid
));
838 assert(mds_map
.mds_info
.at(standby_gid
).state
== MDSMap::STATE_STANDBY_REPLAY
);
839 assert(mds_map
.mds_info
.at(standby_gid
).rank
== assigned_rank
);
841 MDSMap::mds_info_t
&info
= mds_map
.mds_info
[standby_gid
];
// Note: erase() both tests for and removes the rank from `stopped`.
843 if (mds_map
.stopped
.erase(assigned_rank
)) {
844 // The cluster is being expanded with a stopped rank
845 info
.state
= MDSMap::STATE_STARTING
;
846 } else if (!mds_map
.is_in(assigned_rank
)) {
847 // The cluster is being expanded with a new rank
848 info
.state
= MDSMap::STATE_CREATING
;
850 // An existing rank is being assigned to a replacement
851 info
.state
= MDSMap::STATE_REPLAY
;
852 mds_map
.failed
.erase(assigned_rank
);
854 info
.rank
= assigned_rank
;
856 mds_roles
[standby_gid
] = filesystem
->fscid
;
858 // Update the rank state in Filesystem
859 mds_map
.in
.insert(assigned_rank
);
860 mds_map
.up
[assigned_rank
] = standby_gid
;
862 // Remove from the list of standbys
863 if (!is_standby_replay
) {
864 standby_daemons
.erase(standby_gid
);
865 standby_epochs
.erase(standby_gid
);
868 // Indicate that Filesystem has been modified
869 mds_map
.epoch
= epoch
;
872 void FSMap::assign_standby_replay(
873 const mds_gid_t standby_gid
,
874 const fs_cluster_id_t leader_ns
,
875 const mds_rank_t leader_rank
)
877 assert(mds_roles
.at(standby_gid
) == FS_CLUSTER_ID_NONE
);
878 assert(gid_exists(standby_gid
));
879 assert(!gid_has_rank(standby_gid
));
880 assert(standby_daemons
.count(standby_gid
));
882 // Insert to the filesystem
883 auto fs
= filesystems
.at(leader_ns
);
884 fs
->mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
885 fs
->mds_map
.mds_info
[standby_gid
].rank
= leader_rank
;
886 fs
->mds_map
.mds_info
[standby_gid
].state
= MDSMap::STATE_STANDBY_REPLAY
;
887 mds_roles
[standby_gid
] = leader_ns
;
889 // Remove from the list of standbys
890 standby_daemons
.erase(standby_gid
);
891 standby_epochs
.erase(standby_gid
);
893 // Indicate that Filesystem has been modified
894 fs
->mds_map
.epoch
= epoch
;
// FSMap::erase: removes the daemon `who` from the map.
// A daemon whose role is FS_CLUSTER_ID_NONE is dropped from standby_daemons
// and standby_epochs. For a daemon inside a filesystem, unless it is a
// standby-replay follower, its rank is either forgotten (state CREATING:
// removed from `in`) or added to `failed`, and the rank's `up` entry is
// removed. The daemon's mds_info entry is then erased, last_failure_osd_epoch
// is set to `blacklist_epoch`, and the MDSMap epoch is set to the FSMap
// epoch. In every case the gid is finally removed from mds_roles.
// NOTE(review): lossy extract -- the else keywords and closing braces
// (embedded lines 902, 911, 915, 918, 922) are missing; the branch structure
// described above is inferred from the in-code comments, confirm against the
// full file.
897 void FSMap::erase(mds_gid_t who
, epoch_t blacklist_epoch
)
899 if (mds_roles
.at(who
) == FS_CLUSTER_ID_NONE
) {
900 standby_daemons
.erase(who
);
901 standby_epochs
.erase(who
);
903 auto &fs
= filesystems
.at(mds_roles
.at(who
));
// `info` refers into mds_info; it is only used before the entry is erased
// below.
904 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
905 if (info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
906 if (info
.state
== MDSMap::STATE_CREATING
) {
907 // If this gid didn't make it past CREATING, then forget
908 // the rank ever existed so that next time it's handed out
909 // to a gid it'll go back into CREATING.
910 fs
->mds_map
.in
.erase(info
.rank
);
912 // Put this rank into the failed list so that the next available
913 // STANDBY will pick it up.
914 fs
->mds_map
.failed
.insert(info
.rank
);
916 assert(fs
->mds_map
.up
.at(info
.rank
) == info
.global_id
);
917 fs
->mds_map
.up
.erase(info
.rank
);
919 fs
->mds_map
.mds_info
.erase(who
);
920 fs
->mds_map
.last_failure_osd_epoch
= blacklist_epoch
;
921 fs
->mds_map
.epoch
= epoch
;
924 mds_roles
.erase(who
);
927 void FSMap::damaged(mds_gid_t who
, epoch_t blacklist_epoch
)
929 assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
930 auto fs
= filesystems
.at(mds_roles
.at(who
));
931 mds_rank_t rank
= fs
->mds_map
.mds_info
[who
].rank
;
933 erase(who
, blacklist_epoch
);
934 fs
->mds_map
.failed
.erase(rank
);
935 fs
->mds_map
.damaged
.insert(rank
);
937 assert(fs
->mds_map
.epoch
== epoch
);
941 * Update to indicate that the rank `rank` is to be removed
942 * from the damaged list of the filesystem `fscid`
// FSMap::undamaged: if `rank` is in the `damaged` set of filesystem `fscid`,
// removes it from there, adds it to `failed`, and sets the MDSMap epoch to
// the FSMap epoch.
// NOTE(review): lossy extract -- the return statements are not visible;
// presumably the bool result reports whether the rank was damaged, confirm
// against the full file.
944 bool FSMap::undamaged(const fs_cluster_id_t fscid
, const mds_rank_t rank
)
946 auto fs
= filesystems
.at(fscid
);
// erase() both tests for and removes the rank.
948 if (fs
->mds_map
.damaged
.erase(rank
)) {
949 fs
->mds_map
.failed
.insert(rank
);
950 fs
->mds_map
.epoch
= epoch
;
957 void FSMap::insert(const MDSMap::mds_info_t
&new_info
)
959 assert(new_info
.state
== MDSMap::STATE_STANDBY
);
960 assert(new_info
.rank
== MDS_RANK_NONE
);
961 mds_roles
[new_info
.global_id
] = FS_CLUSTER_ID_NONE
;
962 standby_daemons
[new_info
.global_id
] = new_info
;
963 standby_epochs
[new_info
.global_id
] = epoch
;
// FSMap::stop: takes the rank held by daemon `who` out of service.
// `who` must belong to a filesystem (asserted). Its rank is removed from
// `up` and `in` and added to `stopped`; the gids of all standby-replay
// followers of the same rank are collected into `standbys`; the daemon is
// then removed from mds_info and mds_roles and the MDSMap epoch is set to
// the FSMap epoch.
// NOTE(review): lossy extract -- what is done with the collected standbys
// before the function ends, and the return statement, are not visible
// (embedded lines 983-986, 991 onward are missing).
966 std::list
<mds_gid_t
> FSMap::stop(mds_gid_t who
)
968 assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
969 auto fs
= filesystems
.at(mds_roles
.at(who
));
// `info` refers into mds_info; all visible uses come before the entry is
// erased below.
970 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
971 fs
->mds_map
.up
.erase(info
.rank
);
972 fs
->mds_map
.in
.erase(info
.rank
);
973 fs
->mds_map
.stopped
.insert(info
.rank
);
975 // Also drop any standby replays that were following this rank
976 std::list
<mds_gid_t
> standbys
;
977 for (const auto &i
: fs
->mds_map
.mds_info
) {
978 const auto &other_gid
= i
.first
;
979 const auto &other_info
= i
.second
;
980 if (other_info
.rank
== info
.rank
981 && other_info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
982 standbys
.push_back(other_gid
);
987 fs
->mds_map
.mds_info
.erase(who
);
988 mds_roles
.erase(who
);
990 fs
->mds_map
.epoch
= epoch
;
997 * Given one of the following forms:
1002 * Parse into a mds_role_t. The rank-only form is only valid
1003 * if legacy_client_ns is set.
1005 int FSMap::parse_role(
1006 boost::string_view role_str
,
1008 std::ostream
&ss
) const
1010 size_t colon_pos
= role_str
.find(":");
1012 std::shared_ptr
<const Filesystem
> fs
;
1013 if (colon_pos
== std::string::npos
) {
1014 if (legacy_client_fscid
== FS_CLUSTER_ID_NONE
) {
1015 ss
<< "No filesystem selected";
1018 fs
= get_filesystem(legacy_client_fscid
);
1021 if (parse_filesystem(role_str
.substr(0, colon_pos
), &fs
) < 0) {
1022 ss
<< "Invalid filesystem";
1025 rank_pos
= colon_pos
+1;
1030 std::string
rank_str(role_str
.substr(rank_pos
));
1031 long rank_i
= strict_strtol(rank_str
.c_str(), 10, &err
);
1032 if (rank_i
< 0 || !err
.empty()) {
1033 ss
<< "Invalid rank '" << rank_str
<< "'";
1039 if (fs
->mds_map
.in
.count(rank
) == 0) {
1040 ss
<< "Rank '" << rank
<< "' not found";
1044 *role
= {fs
->fscid
, rank
};