1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
19 using std::stringstream
;
21 #include "mon/health_check.h"
24 void Filesystem::dump(Formatter
*f
) const
26 f
->open_object_section("mdsmap");
29 f
->dump_int("id", fscid
);
32 void FSMap::dump(Formatter
*f
) const
34 f
->dump_int("epoch", epoch
);
36 f
->open_object_section("compat");
40 f
->open_object_section("feature_flags");
41 f
->dump_bool("enable_multiple", enable_multiple
);
42 f
->dump_bool("ever_enabled_multiple", ever_enabled_multiple
);
45 f
->open_array_section("standbys");
46 for (const auto &i
: standby_daemons
) {
47 f
->open_object_section("info");
49 f
->dump_int("epoch", standby_epochs
.at(i
.first
));
54 f
->open_array_section("filesystems");
55 for (const auto &fs
: filesystems
) {
56 f
->open_object_section("filesystem");
63 void FSMap::generate_test_instances(list
<FSMap
*>& ls
)
65 FSMap
*m
= new FSMap();
67 std::list
<MDSMap
*> mds_map_instances
;
68 MDSMap::generate_test_instances(mds_map_instances
);
71 for (auto i
: mds_map_instances
) {
72 auto fs
= std::make_shared
<Filesystem
>();
76 m
->filesystems
[fs
->fscid
] = fs
;
78 mds_map_instances
.clear();
83 void FSMap::print(ostream
& out
) const
85 out
<< "e" << epoch
<< std::endl
;
86 out
<< "enable_multiple, ever_enabled_multiple: " << enable_multiple
<< ","
87 << ever_enabled_multiple
<< std::endl
;
88 out
<< "compat: " << compat
<< std::endl
;
89 out
<< "legacy client fscid: " << legacy_client_fscid
<< std::endl
;
90 out
<< " " << std::endl
;
92 if (filesystems
.empty()) {
93 out
<< "No filesystems configured" << std::endl
;
97 for (const auto &fs
: filesystems
) {
98 fs
.second
->print(out
);
99 out
<< " " << std::endl
<< " " << std::endl
; // Space out a bit
102 if (!standby_daemons
.empty()) {
103 out
<< "Standby daemons:" << std::endl
<< " " << std::endl
;
106 for (const auto &p
: standby_daemons
) {
107 p
.second
.print_summary(out
);
114 void FSMap::print_summary(Formatter
*f
, ostream
*out
) const
116 map
<mds_role_t
,string
> by_rank
;
117 map
<string
,int> by_state
;
120 f
->dump_unsigned("epoch", get_epoch());
121 for (auto i
: filesystems
) {
123 f
->dump_unsigned("id", fs
->fscid
);
124 f
->dump_unsigned("up", fs
->mds_map
.up
.size());
125 f
->dump_unsigned("in", fs
->mds_map
.in
.size());
126 f
->dump_unsigned("max", fs
->mds_map
.max_mds
);
129 for (auto i
: filesystems
) {
131 *out
<< fs
->mds_map
.fs_name
<< "-" << fs
->mds_map
.up
.size() << "/"
132 << fs
->mds_map
.in
.size() << "/" << fs
->mds_map
.max_mds
<< " up ";
137 f
->open_array_section("by_rank");
140 const auto all_info
= get_mds_info();
141 for (const auto &p
: all_info
) {
142 const auto &info
= p
.second
;
143 string s
= ceph_mds_state_name(info
.state
);
145 s
+= "(laggy or crashed)";
148 const fs_cluster_id_t fscid
= mds_roles
.at(info
.global_id
);
150 if (info
.rank
!= MDS_RANK_NONE
&&
151 info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
153 f
->open_object_section("mds");
154 f
->dump_unsigned("filesystem_id", fscid
);
155 f
->dump_unsigned("rank", info
.rank
);
156 f
->dump_string("name", info
.name
);
157 f
->dump_string("status", s
);
160 by_rank
[mds_role_t(fscid
, info
.rank
)] = info
.name
+ "=" + s
;
170 if (!by_rank
.empty()) {
171 if (filesystems
.size() > 1) {
172 // Disambiguate filesystems
173 std::map
<std::string
, std::string
> pretty
;
174 for (auto i
: by_rank
) {
175 const auto &fs_name
= filesystems
.at(i
.first
.fscid
)->mds_map
.fs_name
;
176 std::ostringstream o
;
177 o
<< "[" << fs_name
<< ":" << i
.first
.rank
<< "]";
178 pretty
[o
.str()] = i
.second
;
180 *out
<< " " << pretty
;
182 // Omit FSCID in output when only one filesystem exists
183 std::map
<mds_rank_t
, std::string
> shortened
;
184 for (auto i
: by_rank
) {
185 shortened
[i
.first
.rank
] = i
.second
;
187 *out
<< " " << shortened
;
192 for (map
<string
,int>::reverse_iterator p
= by_state
.rbegin(); p
!= by_state
.rend(); ++p
) {
194 f
->dump_unsigned(p
->first
.c_str(), p
->second
);
196 *out
<< ", " << p
->second
<< " " << p
->first
;
202 for (auto i
: filesystems
) {
204 failed
+= fs
->mds_map
.failed
.size();
205 damaged
+= fs
->mds_map
.damaged
.size();
210 f
->dump_unsigned("failed", failed
);
212 *out
<< ", " << failed
<< " failed";
218 f
->dump_unsigned("damaged", damaged
);
220 *out
<< ", " << damaged
<< " damaged";
223 //if (stopped.size())
224 //out << ", " << stopped.size() << " stopped";
228 void FSMap::create_filesystem(boost::string_view name
,
229 int64_t metadata_pool
, int64_t data_pool
,
232 auto fs
= std::make_shared
<Filesystem
>();
233 fs
->mds_map
.epoch
= epoch
;
234 fs
->mds_map
.fs_name
= std::string(name
);
235 fs
->mds_map
.max_mds
= 1;
236 fs
->mds_map
.data_pools
.push_back(data_pool
);
237 fs
->mds_map
.metadata_pool
= metadata_pool
;
238 fs
->mds_map
.cas_pool
= -1;
239 fs
->mds_map
.max_file_size
= g_conf
->mds_max_file_size
;
240 fs
->mds_map
.compat
= compat
;
241 fs
->mds_map
.created
= ceph_clock_now();
242 fs
->mds_map
.modified
= ceph_clock_now();
243 fs
->mds_map
.session_timeout
= g_conf
->mds_session_timeout
;
244 fs
->mds_map
.session_autoclose
= g_conf
->mds_session_autoclose
;
245 fs
->mds_map
.enabled
= true;
246 if (features
& CEPH_FEATURE_SERVER_JEWEL
) {
247 fs
->fscid
= next_filesystem_id
++;
248 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
249 // have initialized next_filesystem_id such that it's never used here.
250 assert(fs
->fscid
!= FS_CLUSTER_ID_ANONYMOUS
);
252 // Use anon fscid because this will get thrown away when encoding
253 // as legacy MDSMap for legacy mons.
254 assert(filesystems
.empty());
255 fs
->fscid
= FS_CLUSTER_ID_ANONYMOUS
;
257 filesystems
[fs
->fscid
] = fs
;
259 // Created first filesystem? Set it as the one
260 // for legacy clients to use
261 if (filesystems
.size() == 1) {
262 legacy_client_fscid
= fs
->fscid
;
266 void FSMap::reset_filesystem(fs_cluster_id_t fscid
)
268 auto fs
= get_filesystem(fscid
);
269 auto new_fs
= std::make_shared
<Filesystem
>();
271 // Populate rank 0 as existing (so don't go into CREATING)
272 // but failed (so that next available MDS is assigned the rank)
273 new_fs
->mds_map
.in
.insert(mds_rank_t(0));
274 new_fs
->mds_map
.failed
.insert(mds_rank_t(0));
276 // Carry forward what makes sense
277 new_fs
->fscid
= fs
->fscid
;
278 new_fs
->mds_map
.inline_data_enabled
= fs
->mds_map
.inline_data_enabled
;
279 new_fs
->mds_map
.max_mds
= 1;
280 new_fs
->mds_map
.data_pools
= fs
->mds_map
.data_pools
;
281 new_fs
->mds_map
.metadata_pool
= fs
->mds_map
.metadata_pool
;
282 new_fs
->mds_map
.cas_pool
= fs
->mds_map
.cas_pool
;
283 new_fs
->mds_map
.fs_name
= fs
->mds_map
.fs_name
;
284 new_fs
->mds_map
.max_file_size
= g_conf
->mds_max_file_size
;
285 new_fs
->mds_map
.compat
= compat
;
286 new_fs
->mds_map
.created
= ceph_clock_now();
287 new_fs
->mds_map
.modified
= ceph_clock_now();
288 new_fs
->mds_map
.session_timeout
= g_conf
->mds_session_timeout
;
289 new_fs
->mds_map
.session_autoclose
= g_conf
->mds_session_autoclose
;
290 new_fs
->mds_map
.standby_count_wanted
= fs
->mds_map
.standby_count_wanted
;
291 new_fs
->mds_map
.enabled
= true;
293 // Remember mds ranks that have ever started. (They should load old inotable
294 // instead of creating new one if they start again.)
295 new_fs
->mds_map
.stopped
.insert(fs
->mds_map
.in
.begin(), fs
->mds_map
.in
.end());
296 new_fs
->mds_map
.stopped
.insert(fs
->mds_map
.stopped
.begin(), fs
->mds_map
.stopped
.end());
297 new_fs
->mds_map
.stopped
.erase(mds_rank_t(0));
299 // Persist the new FSMap
300 filesystems
[new_fs
->fscid
] = new_fs
;
303 void FSMap::get_health(list
<pair
<health_status_t
,string
> >& summary
,
304 list
<pair
<health_status_t
,string
> > *detail
) const
306 mds_rank_t standby_count_wanted
= 0;
307 for (const auto &i
: filesystems
) {
308 const auto &fs
= i
.second
;
310 // TODO: move get_health up into here so that we can qualify
311 // all the messages with what filesystem they're talking about
312 fs
->mds_map
.get_health(summary
, detail
);
314 standby_count_wanted
= std::max(standby_count_wanted
, fs
->mds_map
.get_standby_count_wanted((mds_rank_t
)standby_daemons
.size()));
317 if (standby_count_wanted
) {
318 std::ostringstream oss
;
319 oss
<< "insufficient standby daemons available: have " << standby_daemons
.size() << "; want " << standby_count_wanted
<< " more";
320 summary
.push_back(make_pair(HEALTH_WARN
, oss
.str()));
324 bool FSMap::check_health(void)
326 bool changed
= false;
327 for (auto &i
: filesystems
) {
328 changed
|= i
.second
->mds_map
.check_health((mds_rank_t
)standby_daemons
.size());
333 void FSMap::get_health_checks(health_check_map_t
*checks
) const
335 mds_rank_t standby_count_wanted
= 0;
336 for (const auto &i
: filesystems
) {
337 const auto &fs
= i
.second
;
338 health_check_map_t fschecks
;
340 fs
->mds_map
.get_health_checks(&fschecks
);
342 // Some of the failed ranks might be transient (i.e. there are standbys
343 // ready to replace them). We will report only on "stuck" failed, i.e.
344 // ranks which are failed and have no standby replacement available.
345 std::set
<mds_rank_t
> stuck_failed
;
347 for (const auto &rank
: fs
->mds_map
.failed
) {
348 const mds_gid_t replacement
= find_replacement_for(
349 {fs
->fscid
, rank
}, {}, g_conf
->mon_force_standby_active
);
350 if (replacement
== MDS_GID_NONE
) {
351 stuck_failed
.insert(rank
);
355 // FS_WITH_FAILED_MDS
356 if (!stuck_failed
.empty()) {
357 health_check_t
& fscheck
= checks
->get_or_add(
358 "FS_WITH_FAILED_MDS", HEALTH_WARN
,
359 "%num% filesystem%plurals% %hasorhave% a failed mds daemon");
361 ss
<< "fs " << fs
->mds_map
.fs_name
<< " has " << stuck_failed
.size()
362 << " failed mds" << (stuck_failed
.size() > 1 ? "s" : "");
363 fscheck
.detail
.push_back(ss
.str()); }
365 checks
->merge(fschecks
);
366 standby_count_wanted
= std::max(
367 standby_count_wanted
,
368 fs
->mds_map
.get_standby_count_wanted((mds_rank_t
)standby_daemons
.size()));
371 // MDS_INSUFFICIENT_STANDBY
372 if (standby_count_wanted
) {
373 std::ostringstream oss
, dss
;
374 oss
<< "insufficient standby MDS daemons available";
375 auto& d
= checks
->get_or_add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN
, oss
.str());
376 dss
<< "have " << standby_daemons
.size() << "; want " << standby_count_wanted
378 d
.detail
.push_back(dss
.str());
382 void FSMap::encode(bufferlist
& bl
, uint64_t features
) const
384 if (features
& CEPH_FEATURE_SERVER_JEWEL
) {
385 ENCODE_START(7, 6, bl
);
387 ::encode(next_filesystem_id
, bl
);
388 ::encode(legacy_client_fscid
, bl
);
389 ::encode(compat
, bl
);
390 ::encode(enable_multiple
, bl
);
391 std::vector
<Filesystem
> fs_list
;
392 for (auto i
: filesystems
) {
393 fs_list
.push_back(*(i
.second
));
395 ::encode(fs_list
, bl
, features
);
396 ::encode(mds_roles
, bl
);
397 ::encode(standby_daemons
, bl
, features
);
398 ::encode(standby_epochs
, bl
);
399 ::encode(ever_enabled_multiple
, bl
);
402 if (filesystems
.empty()) {
404 disabled_map
.epoch
= epoch
;
405 disabled_map
.encode(bl
, features
);
407 // MDSMonitor should never have created multiple filesystems
408 // until the quorum features indicated Jewel
409 assert(filesystems
.size() == 1);
410 auto fs
= filesystems
.begin()->second
;
412 // Take the MDSMap for the enabled filesystem, and populated its
413 // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
414 MDSMap full_mdsmap
= fs
->mds_map
;
415 full_mdsmap
.epoch
= epoch
;
416 for (const auto &p
: standby_daemons
) {
417 full_mdsmap
.mds_info
[p
.first
] = p
.second
;
420 // Old MDSMaps don't set rank on standby replay daemons
421 for (auto &i
: full_mdsmap
.mds_info
) {
422 auto &info
= i
.second
;
423 if (info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
424 info
.rank
= MDS_RANK_NONE
;
428 full_mdsmap
.encode(bl
, features
);
433 void FSMap::decode(bufferlist::iterator
& p
)
435 // The highest MDSMap encoding version before we changed the
436 // MDSMonitor to store an FSMap instead of an MDSMap was
437 // 5, so anything older than 6 is decoded as an MDSMap,
438 // and anything newer is decoded as an FSMap.
439 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p
);
441 // Because the mon used to store an MDSMap where we now
442 // store an FSMap, FSMap knows how to decode the legacy
443 // MDSMap format (it never needs to encode it though).
444 MDSMap legacy_mds_map
;
446 // Decoding an MDSMap (upgrade)
448 ::decode(legacy_mds_map
.flags
, p
);
449 ::decode(legacy_mds_map
.last_failure
, p
);
450 ::decode(legacy_mds_map
.root
, p
);
451 ::decode(legacy_mds_map
.session_timeout
, p
);
452 ::decode(legacy_mds_map
.session_autoclose
, p
);
453 ::decode(legacy_mds_map
.max_file_size
, p
);
454 ::decode(legacy_mds_map
.max_mds
, p
);
455 ::decode(legacy_mds_map
.mds_info
, p
);
462 legacy_mds_map
.data_pools
.push_back(m
);
466 legacy_mds_map
.cas_pool
= s
;
468 ::decode(legacy_mds_map
.data_pools
, p
);
469 ::decode(legacy_mds_map
.cas_pool
, p
);
472 // kclient ignores everything from here
477 ::decode(legacy_mds_map
.compat
, p
);
479 legacy_mds_map
.compat
= get_mdsmap_compat_set_base();
483 legacy_mds_map
.metadata_pool
= n
;
485 ::decode(legacy_mds_map
.metadata_pool
, p
);
487 ::decode(legacy_mds_map
.created
, p
);
488 ::decode(legacy_mds_map
.modified
, p
);
489 ::decode(legacy_mds_map
.tableserver
, p
);
490 ::decode(legacy_mds_map
.in
, p
);
491 std::map
<mds_rank_t
,int32_t> inc
; // Legacy field, parse and drop
493 ::decode(legacy_mds_map
.up
, p
);
494 ::decode(legacy_mds_map
.failed
, p
);
495 ::decode(legacy_mds_map
.stopped
, p
);
497 ::decode(legacy_mds_map
.last_failure_osd_epoch
, p
);
500 // previously this was a bool about snaps, not a flag map
503 legacy_mds_map
.ever_allowed_features
= flag
?
504 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
506 legacy_mds_map
.explicitly_allowed_features
= flag
?
507 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
508 if (legacy_mds_map
.max_mds
> 1) {
509 legacy_mds_map
.set_multimds_allowed();
512 ::decode(legacy_mds_map
.ever_allowed_features
, p
);
513 ::decode(legacy_mds_map
.explicitly_allowed_features
, p
);
516 legacy_mds_map
.ever_allowed_features
= CEPH_MDSMAP_ALLOW_CLASSICS
;
517 legacy_mds_map
.explicitly_allowed_features
= 0;
518 if (legacy_mds_map
.max_mds
> 1) {
519 legacy_mds_map
.set_multimds_allowed();
523 ::decode(legacy_mds_map
.inline_data_enabled
, p
);
526 assert(struct_v
>= 5);
527 ::decode(legacy_mds_map
.enabled
, p
);
528 ::decode(legacy_mds_map
.fs_name
, p
);
530 legacy_mds_map
.fs_name
= "default";
532 // If an MDS has ever been started, epoch will be greater than 1,
533 // assume filesystem is enabled.
534 legacy_mds_map
.enabled
= true;
536 // Upgrading from a cluster that never used an MDS, switch off
537 // filesystem until it's explicitly enabled.
538 legacy_mds_map
.enabled
= false;
543 ::decode(legacy_mds_map
.damaged
, p
);
546 // We're upgrading, populate filesystems from the legacy fields
548 standby_daemons
.clear();
549 standby_epochs
.clear();
551 compat
= legacy_mds_map
.compat
;
552 enable_multiple
= false;
554 // Synthesise a Filesystem from legacy_mds_map, if enabled
555 if (legacy_mds_map
.enabled
) {
556 // Construct a Filesystem from the legacy MDSMap
557 auto migrate_fs
= std::make_shared
<Filesystem
>();
558 migrate_fs
->fscid
= FS_CLUSTER_ID_ANONYMOUS
;
559 migrate_fs
->mds_map
= legacy_mds_map
;
560 migrate_fs
->mds_map
.epoch
= epoch
;
561 filesystems
[migrate_fs
->fscid
] = migrate_fs
;
563 // List of GIDs that had invalid states
564 std::set
<mds_gid_t
> drop_gids
;
566 // Construct mds_roles, standby_daemons, and remove
567 // standbys from the MDSMap in the Filesystem.
568 for (auto &p
: migrate_fs
->mds_map
.mds_info
) {
569 if (p
.second
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
570 // In legacy MDSMap, standby replay daemons don't have
571 // rank set, but since FSMap they do.
572 p
.second
.rank
= p
.second
.standby_for_rank
;
574 if (p
.second
.rank
== MDS_RANK_NONE
) {
575 if (p
.second
.state
!= MDSMap::STATE_STANDBY
) {
576 // Old MDSMaps can have down:dne here, which
577 // is invalid in an FSMap (#17837)
578 drop_gids
.insert(p
.first
);
580 insert(p
.second
); // into standby_daemons
583 mds_roles
[p
.first
] = migrate_fs
->fscid
;
586 for (const auto &p
: standby_daemons
) {
587 // Erase from this Filesystem's MDSMap, because it has
588 // been copied into FSMap::Standby_daemons above
589 migrate_fs
->mds_map
.mds_info
.erase(p
.first
);
591 for (const auto &gid
: drop_gids
) {
592 // Throw away all info for this MDS because it was identified
593 // as having invalid state above.
594 migrate_fs
->mds_map
.mds_info
.erase(gid
);
597 legacy_client_fscid
= migrate_fs
->fscid
;
599 legacy_client_fscid
= FS_CLUSTER_ID_NONE
;
603 ::decode(next_filesystem_id
, p
);
604 ::decode(legacy_client_fscid
, p
);
606 ::decode(enable_multiple
, p
);
607 std::vector
<Filesystem
> fs_list
;
608 ::decode(fs_list
, p
);
610 for (std::vector
<Filesystem
>::const_iterator fs
= fs_list
.begin(); fs
!= fs_list
.end(); ++fs
) {
611 filesystems
[fs
->fscid
] = std::make_shared
<Filesystem
>(*fs
);
614 ::decode(mds_roles
, p
);
615 ::decode(standby_daemons
, p
);
616 ::decode(standby_epochs
, p
);
618 ::decode(ever_enabled_multiple
, p
);
625 void FSMap::sanitize(std::function
<bool(int64_t pool
)> pool_exists
)
627 for (auto &fs
: filesystems
) {
628 fs
.second
->mds_map
.sanitize(pool_exists
);
632 void Filesystem::encode(bufferlist
& bl
, uint64_t features
) const
634 ENCODE_START(1, 1, bl
);
636 bufferlist mdsmap_bl
;
637 mds_map
.encode(mdsmap_bl
, features
);
638 ::encode(mdsmap_bl
, bl
);
642 void Filesystem::decode(bufferlist::iterator
& p
)
646 bufferlist mdsmap_bl
;
647 ::decode(mdsmap_bl
, p
);
648 bufferlist::iterator mdsmap_bl_iter
= mdsmap_bl
.begin();
649 mds_map
.decode(mdsmap_bl_iter
);
653 int FSMap::parse_filesystem(
654 boost::string_view ns_str
,
655 std::shared_ptr
<const Filesystem
> *result
659 std::string
s(ns_str
);
660 fs_cluster_id_t fscid
= strict_strtol(s
.c_str(), 10, &ns_err
);
661 if (!ns_err
.empty() || filesystems
.count(fscid
) == 0) {
662 for (auto &fs
: filesystems
) {
663 if (fs
.second
->mds_map
.fs_name
== s
) {
664 *result
= std::const_pointer_cast
<const Filesystem
>(fs
.second
);
670 *result
= get_filesystem(fscid
);
675 void Filesystem::print(std::ostream
&out
) const
677 out
<< "Filesystem '" << mds_map
.fs_name
678 << "' (" << fscid
<< ")" << std::endl
;
682 mds_gid_t
FSMap::find_standby_for(mds_role_t role
, boost::string_view name
) const
684 mds_gid_t result
= MDS_GID_NONE
;
686 // First see if we have a STANDBY_REPLAY
687 auto fs
= get_filesystem(role
.fscid
);
688 for (const auto &i
: fs
->mds_map
.mds_info
) {
689 const auto &info
= i
.second
;
690 if (info
.rank
== role
.rank
&& info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
691 return info
.global_id
;
695 // See if there are any STANDBY daemons available
696 for (const auto &i
: standby_daemons
) {
697 const auto &gid
= i
.first
;
698 const auto &info
= i
.second
;
699 assert(info
.state
== MDSMap::STATE_STANDBY
);
700 assert(info
.rank
== MDS_RANK_NONE
);
706 // The mds_info_t may or may not tell us exactly which filesystem
707 // the standby_for_rank refers to: lookup via legacy_client_fscid
708 mds_role_t target_role
= {
709 info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
?
710 legacy_client_fscid
: info
.standby_for_fscid
,
711 info
.standby_for_rank
};
713 if ((target_role
.rank
== role
.rank
&& target_role
.fscid
== role
.fscid
)
714 || (name
.length() && info
.standby_for_name
== name
)) {
715 // It's a named standby for *me*, use it.
718 info
.standby_for_rank
< 0 && info
.standby_for_name
.length() == 0 &&
719 (info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
||
720 info
.standby_for_fscid
== role
.fscid
)) {
721 // It's not a named standby for anyone, use it if we don't find
722 // a named standby for me later, unless it targets another FSCID.
730 mds_gid_t
FSMap::find_unused_for(mds_role_t role
,
731 bool force_standby_active
) const {
732 for (const auto &i
: standby_daemons
) {
733 const auto &gid
= i
.first
;
734 const auto &info
= i
.second
;
735 assert(info
.state
== MDSMap::STATE_STANDBY
);
737 if (info
.laggy() || info
.rank
>= 0)
740 if (info
.standby_for_fscid
!= FS_CLUSTER_ID_NONE
&&
741 info
.standby_for_fscid
!= role
.fscid
)
743 if (info
.standby_for_rank
!= MDS_RANK_NONE
&&
744 info
.standby_for_rank
!= role
.rank
)
747 // To be considered 'unused' a daemon must either not
748 // be selected for standby-replay or the force_standby_active
749 // setting must be enabled to use replay daemons anyway.
750 if (!info
.standby_replay
|| force_standby_active
) {
757 mds_gid_t
FSMap::find_replacement_for(mds_role_t role
, boost::string_view name
,
758 bool force_standby_active
) const {
759 const mds_gid_t standby
= find_standby_for(role
, name
);
763 return find_unused_for(role
, force_standby_active
);
766 void FSMap::sanity() const
768 if (legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
769 assert(filesystems
.count(legacy_client_fscid
) == 1);
772 for (const auto &i
: filesystems
) {
774 assert(fs
->mds_map
.compat
.compare(compat
) == 0);
775 assert(fs
->fscid
== i
.first
);
776 for (const auto &j
: fs
->mds_map
.mds_info
) {
777 assert(j
.second
.rank
!= MDS_RANK_NONE
);
778 assert(mds_roles
.count(j
.first
) == 1);
779 assert(standby_daemons
.count(j
.first
) == 0);
780 assert(standby_epochs
.count(j
.first
) == 0);
781 assert(mds_roles
.at(j
.first
) == i
.first
);
782 if (j
.second
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
783 assert(fs
->mds_map
.up
.at(j
.second
.rank
) == j
.first
);
784 assert(fs
->mds_map
.failed
.count(j
.second
.rank
) == 0);
785 assert(fs
->mds_map
.damaged
.count(j
.second
.rank
) == 0);
789 for (const auto &j
: fs
->mds_map
.up
) {
790 mds_rank_t rank
= j
.first
;
791 assert(fs
->mds_map
.in
.count(rank
) == 1);
792 mds_gid_t gid
= j
.second
;
793 assert(fs
->mds_map
.mds_info
.count(gid
) == 1);
797 for (const auto &i
: standby_daemons
) {
798 assert(i
.second
.state
== MDSMap::STATE_STANDBY
);
799 assert(i
.second
.rank
== MDS_RANK_NONE
);
800 assert(i
.second
.global_id
== i
.first
);
801 assert(standby_epochs
.count(i
.first
) == 1);
802 assert(mds_roles
.count(i
.first
) == 1);
803 assert(mds_roles
.at(i
.first
) == FS_CLUSTER_ID_NONE
);
806 for (const auto &i
: standby_epochs
) {
807 assert(standby_daemons
.count(i
.first
) == 1);
810 for (const auto &i
: mds_roles
) {
811 if (i
.second
== FS_CLUSTER_ID_NONE
) {
812 assert(standby_daemons
.count(i
.first
) == 1);
814 assert(filesystems
.count(i
.second
) == 1);
815 assert(filesystems
.at(i
.second
)->mds_map
.mds_info
.count(i
.first
) == 1);
821 mds_gid_t standby_gid
,
822 const std::shared_ptr
<Filesystem
> &filesystem
,
823 mds_rank_t assigned_rank
)
825 assert(gid_exists(standby_gid
));
826 bool is_standby_replay
= mds_roles
.at(standby_gid
) != FS_CLUSTER_ID_NONE
;
827 if (!is_standby_replay
) {
828 assert(standby_daemons
.count(standby_gid
));
829 assert(standby_daemons
.at(standby_gid
).state
== MDSMap::STATE_STANDBY
);
832 MDSMap
&mds_map
= filesystem
->mds_map
;
834 // Insert daemon state to Filesystem
835 if (!is_standby_replay
) {
836 mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
838 assert(mds_map
.mds_info
.count(standby_gid
));
839 assert(mds_map
.mds_info
.at(standby_gid
).state
== MDSMap::STATE_STANDBY_REPLAY
);
840 assert(mds_map
.mds_info
.at(standby_gid
).rank
== assigned_rank
);
842 MDSMap::mds_info_t
&info
= mds_map
.mds_info
[standby_gid
];
844 if (mds_map
.stopped
.erase(assigned_rank
)) {
845 // The cluster is being expanded with a stopped rank
846 info
.state
= MDSMap::STATE_STARTING
;
847 } else if (!mds_map
.is_in(assigned_rank
)) {
848 // The cluster is being expanded with a new rank
849 info
.state
= MDSMap::STATE_CREATING
;
851 // An existing rank is being assigned to a replacement
852 info
.state
= MDSMap::STATE_REPLAY
;
853 mds_map
.failed
.erase(assigned_rank
);
855 info
.rank
= assigned_rank
;
857 mds_roles
[standby_gid
] = filesystem
->fscid
;
859 // Update the rank state in Filesystem
860 mds_map
.in
.insert(assigned_rank
);
861 mds_map
.up
[assigned_rank
] = standby_gid
;
863 // Remove from the list of standbys
864 if (!is_standby_replay
) {
865 standby_daemons
.erase(standby_gid
);
866 standby_epochs
.erase(standby_gid
);
869 // Indicate that Filesystem has been modified
870 mds_map
.epoch
= epoch
;
873 void FSMap::assign_standby_replay(
874 const mds_gid_t standby_gid
,
875 const fs_cluster_id_t leader_ns
,
876 const mds_rank_t leader_rank
)
878 assert(mds_roles
.at(standby_gid
) == FS_CLUSTER_ID_NONE
);
879 assert(gid_exists(standby_gid
));
880 assert(!gid_has_rank(standby_gid
));
881 assert(standby_daemons
.count(standby_gid
));
883 // Insert to the filesystem
884 auto fs
= filesystems
.at(leader_ns
);
885 fs
->mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
886 fs
->mds_map
.mds_info
[standby_gid
].rank
= leader_rank
;
887 fs
->mds_map
.mds_info
[standby_gid
].state
= MDSMap::STATE_STANDBY_REPLAY
;
888 mds_roles
[standby_gid
] = leader_ns
;
890 // Remove from the list of standbys
891 standby_daemons
.erase(standby_gid
);
892 standby_epochs
.erase(standby_gid
);
894 // Indicate that Filesystem has been modified
895 fs
->mds_map
.epoch
= epoch
;
898 void FSMap::erase(mds_gid_t who
, epoch_t blacklist_epoch
)
900 if (mds_roles
.at(who
) == FS_CLUSTER_ID_NONE
) {
901 standby_daemons
.erase(who
);
902 standby_epochs
.erase(who
);
904 auto &fs
= filesystems
.at(mds_roles
.at(who
));
905 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
906 if (info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
907 if (info
.state
== MDSMap::STATE_CREATING
) {
908 // If this gid didn't make it past CREATING, then forget
909 // the rank ever existed so that next time it's handed out
910 // to a gid it'll go back into CREATING.
911 fs
->mds_map
.in
.erase(info
.rank
);
913 // Put this rank into the failed list so that the next available
914 // STANDBY will pick it up.
915 fs
->mds_map
.failed
.insert(info
.rank
);
917 assert(fs
->mds_map
.up
.at(info
.rank
) == info
.global_id
);
918 fs
->mds_map
.up
.erase(info
.rank
);
920 fs
->mds_map
.mds_info
.erase(who
);
921 fs
->mds_map
.last_failure_osd_epoch
= blacklist_epoch
;
922 fs
->mds_map
.epoch
= epoch
;
925 mds_roles
.erase(who
);
928 void FSMap::damaged(mds_gid_t who
, epoch_t blacklist_epoch
)
930 assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
931 auto fs
= filesystems
.at(mds_roles
.at(who
));
932 mds_rank_t rank
= fs
->mds_map
.mds_info
[who
].rank
;
934 erase(who
, blacklist_epoch
);
935 fs
->mds_map
.failed
.erase(rank
);
936 fs
->mds_map
.damaged
.insert(rank
);
938 assert(fs
->mds_map
.epoch
== epoch
);
942 * Update to indicate that the rank `rank` is to be removed
943 * from the damaged list of the filesystem `fscid`
945 bool FSMap::undamaged(const fs_cluster_id_t fscid
, const mds_rank_t rank
)
947 auto fs
= filesystems
.at(fscid
);
949 if (fs
->mds_map
.damaged
.erase(rank
)) {
950 fs
->mds_map
.failed
.insert(rank
);
951 fs
->mds_map
.epoch
= epoch
;
958 void FSMap::insert(const MDSMap::mds_info_t
&new_info
)
960 assert(new_info
.state
== MDSMap::STATE_STANDBY
);
961 assert(new_info
.rank
== MDS_RANK_NONE
);
962 mds_roles
[new_info
.global_id
] = FS_CLUSTER_ID_NONE
;
963 standby_daemons
[new_info
.global_id
] = new_info
;
964 standby_epochs
[new_info
.global_id
] = epoch
;
967 std::list
<mds_gid_t
> FSMap::stop(mds_gid_t who
)
969 assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
970 auto fs
= filesystems
.at(mds_roles
.at(who
));
971 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
972 fs
->mds_map
.up
.erase(info
.rank
);
973 fs
->mds_map
.in
.erase(info
.rank
);
974 fs
->mds_map
.stopped
.insert(info
.rank
);
976 // Also drop any standby replays that were following this rank
977 std::list
<mds_gid_t
> standbys
;
978 for (const auto &i
: fs
->mds_map
.mds_info
) {
979 const auto &other_gid
= i
.first
;
980 const auto &other_info
= i
.second
;
981 if (other_info
.rank
== info
.rank
982 && other_info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
983 standbys
.push_back(other_gid
);
988 fs
->mds_map
.mds_info
.erase(who
);
989 mds_roles
.erase(who
);
991 fs
->mds_map
.epoch
= epoch
;
998 * Given one of the following forms:
1003 * Parse into a mds_role_t. The rank-only form is only valid
1004 * if legacy_client_fscid is set (i.e. is not FS_CLUSTER_ID_NONE).
1006 int FSMap::parse_role(
1007 boost::string_view role_str
,
1009 std::ostream
&ss
) const
1011 size_t colon_pos
= role_str
.find(":");
1013 std::shared_ptr
<const Filesystem
> fs
;
1014 if (colon_pos
== std::string::npos
) {
1015 if (legacy_client_fscid
== FS_CLUSTER_ID_NONE
) {
1016 ss
<< "No filesystem selected";
1019 fs
= get_filesystem(legacy_client_fscid
);
1022 if (parse_filesystem(role_str
.substr(0, colon_pos
), &fs
) < 0) {
1023 ss
<< "Invalid filesystem";
1026 rank_pos
= colon_pos
+1;
1031 std::string
rank_str(role_str
.substr(rank_pos
));
1032 long rank_i
= strict_strtol(rank_str
.c_str(), 10, &err
);
1033 if (rank_i
< 0 || !err
.empty()) {
1034 ss
<< "Invalid rank '" << rank_str
<< "'";
1040 if (fs
->mds_map
.in
.count(rank
) == 0) {
1041 ss
<< "Rank '" << rank
<< "' not found";
1045 *role
= {fs
->fscid
, rank
};