1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
19 using std::stringstream
;
21 #include "mon/health_check.h"
24 void Filesystem::dump(Formatter
*f
) const
26 f
->open_object_section("mdsmap");
29 f
->dump_int("id", fscid
);
32 void FSMap::dump(Formatter
*f
) const
34 f
->dump_int("epoch", epoch
);
36 f
->open_object_section("compat");
40 f
->open_object_section("feature_flags");
41 f
->dump_bool("enable_multiple", enable_multiple
);
42 f
->dump_bool("ever_enabled_multiple", ever_enabled_multiple
);
45 f
->open_array_section("standbys");
46 for (const auto &i
: standby_daemons
) {
47 f
->open_object_section("info");
49 f
->dump_int("epoch", standby_epochs
.at(i
.first
));
54 f
->open_array_section("filesystems");
55 for (const auto &fs
: filesystems
) {
56 f
->open_object_section("filesystem");
63 void FSMap::generate_test_instances(list
<FSMap
*>& ls
)
65 FSMap
*m
= new FSMap();
67 std::list
<MDSMap
*> mds_map_instances
;
68 MDSMap::generate_test_instances(mds_map_instances
);
71 for (auto i
: mds_map_instances
) {
72 auto fs
= std::make_shared
<Filesystem
>();
76 m
->filesystems
[fs
->fscid
] = fs
;
78 mds_map_instances
.clear();
83 void FSMap::print(ostream
& out
) const
85 out
<< "e" << epoch
<< std::endl
;
86 out
<< "enable_multiple, ever_enabled_multiple: " << enable_multiple
<< ","
87 << ever_enabled_multiple
<< std::endl
;
88 out
<< "compat: " << compat
<< std::endl
;
89 out
<< "legacy client fscid: " << legacy_client_fscid
<< std::endl
;
90 out
<< " " << std::endl
;
92 if (filesystems
.empty()) {
93 out
<< "No filesystems configured" << std::endl
;
97 for (const auto &fs
: filesystems
) {
98 fs
.second
->print(out
);
99 out
<< " " << std::endl
<< " " << std::endl
; // Space out a bit
102 if (!standby_daemons
.empty()) {
103 out
<< "Standby daemons:" << std::endl
<< " " << std::endl
;
106 for (const auto &p
: standby_daemons
) {
107 p
.second
.print_summary(out
);
114 void FSMap::print_summary(Formatter
*f
, ostream
*out
) const
116 map
<mds_role_t
,string
> by_rank
;
117 map
<string
,int> by_state
;
120 f
->dump_unsigned("epoch", get_epoch());
121 for (auto i
: filesystems
) {
123 f
->dump_unsigned("id", fs
->fscid
);
124 f
->dump_unsigned("up", fs
->mds_map
.up
.size());
125 f
->dump_unsigned("in", fs
->mds_map
.in
.size());
126 f
->dump_unsigned("max", fs
->mds_map
.max_mds
);
129 if (filesystems
.size() == 1) {
130 auto fs
= filesystems
.begin()->second
;
131 *out
<< fs
->mds_map
.up
.size() << "/" << fs
->mds_map
.in
.size() << "/"
132 << fs
->mds_map
.max_mds
<< " up";
134 for (auto i
: filesystems
) {
136 *out
<< fs
->mds_map
.fs_name
<< "-" << fs
->mds_map
.up
.size() << "/"
137 << fs
->mds_map
.in
.size() << "/" << fs
->mds_map
.max_mds
<< " up ";
143 f
->open_array_section("by_rank");
146 const auto all_info
= get_mds_info();
147 for (const auto &p
: all_info
) {
148 const auto &info
= p
.second
;
149 string s
= ceph_mds_state_name(info
.state
);
151 s
+= "(laggy or crashed)";
154 const fs_cluster_id_t fscid
= mds_roles
.at(info
.global_id
);
156 if (info
.rank
!= MDS_RANK_NONE
&&
157 info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
159 f
->open_object_section("mds");
160 f
->dump_unsigned("filesystem_id", fscid
);
161 f
->dump_unsigned("rank", info
.rank
);
162 f
->dump_string("name", info
.name
);
163 f
->dump_string("status", s
);
166 by_rank
[mds_role_t(fscid
, info
.rank
)] = info
.name
+ "=" + s
;
176 if (!by_rank
.empty()) {
177 if (filesystems
.size() > 1) {
178 // Disambiguate filesystems
179 std::map
<std::string
, std::string
> pretty
;
180 for (auto i
: by_rank
) {
181 const auto &fs_name
= filesystems
.at(i
.first
.fscid
)->mds_map
.fs_name
;
182 std::ostringstream o
;
183 o
<< "[" << fs_name
<< ":" << i
.first
.rank
<< "]";
184 pretty
[o
.str()] = i
.second
;
186 *out
<< " " << pretty
;
188 // Omit FSCID in output when only one filesystem exists
189 std::map
<mds_rank_t
, std::string
> shortened
;
190 for (auto i
: by_rank
) {
191 shortened
[i
.first
.rank
] = i
.second
;
193 *out
<< " " << shortened
;
198 for (map
<string
,int>::reverse_iterator p
= by_state
.rbegin(); p
!= by_state
.rend(); ++p
) {
200 f
->dump_unsigned(p
->first
.c_str(), p
->second
);
202 *out
<< ", " << p
->second
<< " " << p
->first
;
208 for (auto i
: filesystems
) {
210 failed
+= fs
->mds_map
.failed
.size();
211 damaged
+= fs
->mds_map
.damaged
.size();
216 f
->dump_unsigned("failed", failed
);
218 *out
<< ", " << failed
<< " failed";
224 f
->dump_unsigned("damaged", damaged
);
226 *out
<< ", " << damaged
<< " damaged";
229 //if (stopped.size())
230 //out << ", " << stopped.size() << " stopped";
234 void FSMap::create_filesystem(const std::string
&name
,
235 int64_t metadata_pool
, int64_t data_pool
,
238 auto fs
= std::make_shared
<Filesystem
>();
239 fs
->mds_map
.fs_name
= name
;
240 fs
->mds_map
.max_mds
= 1;
241 fs
->mds_map
.data_pools
.push_back(data_pool
);
242 fs
->mds_map
.metadata_pool
= metadata_pool
;
243 fs
->mds_map
.cas_pool
= -1;
244 fs
->mds_map
.max_file_size
= g_conf
->mds_max_file_size
;
245 fs
->mds_map
.compat
= compat
;
246 fs
->mds_map
.created
= ceph_clock_now();
247 fs
->mds_map
.modified
= ceph_clock_now();
248 fs
->mds_map
.session_timeout
= g_conf
->mds_session_timeout
;
249 fs
->mds_map
.session_autoclose
= g_conf
->mds_session_autoclose
;
250 fs
->mds_map
.enabled
= true;
251 if (features
& CEPH_FEATURE_SERVER_JEWEL
) {
252 fs
->fscid
= next_filesystem_id
++;
253 // ANONYMOUS is only for upgrades from legacy mdsmaps, we should
254 // have initialized next_filesystem_id such that it's never used here.
255 assert(fs
->fscid
!= FS_CLUSTER_ID_ANONYMOUS
);
257 // Use anon fscid because this will get thrown away when encoding
258 // as legacy MDSMap for legacy mons.
259 assert(filesystems
.empty());
260 fs
->fscid
= FS_CLUSTER_ID_ANONYMOUS
;
262 filesystems
[fs
->fscid
] = fs
;
264 // Created first filesystem? Set it as the one
265 // for legacy clients to use
266 if (filesystems
.size() == 1) {
267 legacy_client_fscid
= fs
->fscid
;
271 void FSMap::reset_filesystem(fs_cluster_id_t fscid
)
273 auto fs
= get_filesystem(fscid
);
274 auto new_fs
= std::make_shared
<Filesystem
>();
276 // Populate rank 0 as existing (so don't go into CREATING)
277 // but failed (so that next available MDS is assigned the rank)
278 new_fs
->mds_map
.in
.insert(mds_rank_t(0));
279 new_fs
->mds_map
.failed
.insert(mds_rank_t(0));
281 // Carry forward what makes sense
282 new_fs
->fscid
= fs
->fscid
;
283 new_fs
->mds_map
.inline_data_enabled
= fs
->mds_map
.inline_data_enabled
;
284 new_fs
->mds_map
.max_mds
= 1;
285 new_fs
->mds_map
.data_pools
= fs
->mds_map
.data_pools
;
286 new_fs
->mds_map
.metadata_pool
= fs
->mds_map
.metadata_pool
;
287 new_fs
->mds_map
.cas_pool
= fs
->mds_map
.cas_pool
;
288 new_fs
->mds_map
.fs_name
= fs
->mds_map
.fs_name
;
289 new_fs
->mds_map
.max_file_size
= g_conf
->mds_max_file_size
;
290 new_fs
->mds_map
.compat
= compat
;
291 new_fs
->mds_map
.created
= ceph_clock_now();
292 new_fs
->mds_map
.modified
= ceph_clock_now();
293 new_fs
->mds_map
.session_timeout
= g_conf
->mds_session_timeout
;
294 new_fs
->mds_map
.session_autoclose
= g_conf
->mds_session_autoclose
;
295 new_fs
->mds_map
.standby_count_wanted
= fs
->mds_map
.standby_count_wanted
;
296 new_fs
->mds_map
.enabled
= true;
298 // Remember mds ranks that have ever started. (They should load old inotable
299 // instead of creating new one if they start again.)
300 new_fs
->mds_map
.stopped
.insert(fs
->mds_map
.in
.begin(), fs
->mds_map
.in
.end());
301 new_fs
->mds_map
.stopped
.insert(fs
->mds_map
.stopped
.begin(), fs
->mds_map
.stopped
.end());
302 new_fs
->mds_map
.stopped
.erase(mds_rank_t(0));
304 // Persist the new FSMap
305 filesystems
[new_fs
->fscid
] = new_fs
;
308 void FSMap::get_health(list
<pair
<health_status_t
,string
> >& summary
,
309 list
<pair
<health_status_t
,string
> > *detail
) const
311 mds_rank_t standby_count_wanted
= 0;
312 for (const auto &i
: filesystems
) {
313 const auto &fs
= i
.second
;
315 // TODO: move get_health up into here so that we can qualify
316 // all the messages with what filesystem they're talking about
317 fs
->mds_map
.get_health(summary
, detail
);
319 standby_count_wanted
= std::max(standby_count_wanted
, fs
->mds_map
.get_standby_count_wanted((mds_rank_t
)standby_daemons
.size()));
322 if (standby_count_wanted
) {
323 std::ostringstream oss
;
324 oss
<< "insufficient standby daemons available: have " << standby_daemons
.size() << "; want " << standby_count_wanted
<< " more";
325 summary
.push_back(make_pair(HEALTH_WARN
, oss
.str()));
329 bool FSMap::check_health(void)
331 bool changed
= false;
332 for (auto &i
: filesystems
) {
333 changed
|= i
.second
->mds_map
.check_health((mds_rank_t
)standby_daemons
.size());
338 void FSMap::get_health_checks(health_check_map_t
*checks
) const
340 mds_rank_t standby_count_wanted
= 0;
341 for (const auto &i
: filesystems
) {
342 const auto &fs
= i
.second
;
343 health_check_map_t fschecks
;
344 fs
->mds_map
.get_health_checks(&fschecks
);
345 checks
->merge(fschecks
);
346 standby_count_wanted
= std::max(
347 standby_count_wanted
,
348 fs
->mds_map
.get_standby_count_wanted((mds_rank_t
)standby_daemons
.size()));
351 // MDS_INSUFFICIENT_STANDBY
352 if (standby_count_wanted
) {
353 std::ostringstream oss
, dss
;
354 oss
<< "insufficient standby daemons available";
355 auto& d
= checks
->add("MDS_INSUFFICIENT_STANDBY", HEALTH_WARN
, oss
.str());
356 dss
<< "have " << standby_daemons
.size() << "; want " << standby_count_wanted
358 d
.detail
.push_back(dss
.str());
362 void FSMap::encode(bufferlist
& bl
, uint64_t features
) const
364 if (features
& CEPH_FEATURE_SERVER_JEWEL
) {
365 ENCODE_START(7, 6, bl
);
367 ::encode(next_filesystem_id
, bl
);
368 ::encode(legacy_client_fscid
, bl
);
369 ::encode(compat
, bl
);
370 ::encode(enable_multiple
, bl
);
371 std::vector
<Filesystem
> fs_list
;
372 for (auto i
: filesystems
) {
373 fs_list
.push_back(*(i
.second
));
375 ::encode(fs_list
, bl
, features
);
376 ::encode(mds_roles
, bl
);
377 ::encode(standby_daemons
, bl
, features
);
378 ::encode(standby_epochs
, bl
);
379 ::encode(ever_enabled_multiple
, bl
);
382 if (filesystems
.empty()) {
384 disabled_map
.epoch
= epoch
;
385 disabled_map
.encode(bl
, features
);
387 // MDSMonitor should never have created multiple filesystems
388 // until the quorum features indicated Jewel
389 assert(filesystems
.size() == 1);
390 auto fs
= filesystems
.begin()->second
;
392 // Take the MDSMap for the enabled filesystem, and populate its
393 // mds_info with the standbys to get a pre-jewel-style mon MDSMap.
394 MDSMap full_mdsmap
= fs
->mds_map
;
395 full_mdsmap
.epoch
= epoch
;
396 for (const auto &p
: standby_daemons
) {
397 full_mdsmap
.mds_info
[p
.first
] = p
.second
;
400 // Old MDSMaps don't set rank on standby replay daemons
401 for (auto &i
: full_mdsmap
.mds_info
) {
402 auto &info
= i
.second
;
403 if (info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
404 info
.rank
= MDS_RANK_NONE
;
408 full_mdsmap
.encode(bl
, features
);
413 void FSMap::decode(bufferlist::iterator
& p
)
415 // Because the mon used to store an MDSMap where we now
416 // store an FSMap, FSMap knows how to decode the legacy
417 // MDSMap format (it never needs to encode it though).
418 MDSMap legacy_mds_map
;
420 // The highest MDSMap encoding version before we changed the
421 // MDSMonitor to store an FSMap instead of an MDSMap was
422 // 5, so anything older than 6 is decoded as an MDSMap,
423 // and anything newer is decoded as an FSMap.
424 DECODE_START_LEGACY_COMPAT_LEN_16(7, 4, 4, p
);
426 // Decoding an MDSMap (upgrade)
428 ::decode(legacy_mds_map
.flags
, p
);
429 ::decode(legacy_mds_map
.last_failure
, p
);
430 ::decode(legacy_mds_map
.root
, p
);
431 ::decode(legacy_mds_map
.session_timeout
, p
);
432 ::decode(legacy_mds_map
.session_autoclose
, p
);
433 ::decode(legacy_mds_map
.max_file_size
, p
);
434 ::decode(legacy_mds_map
.max_mds
, p
);
435 ::decode(legacy_mds_map
.mds_info
, p
);
442 legacy_mds_map
.data_pools
.push_back(m
);
446 legacy_mds_map
.cas_pool
= s
;
448 ::decode(legacy_mds_map
.data_pools
, p
);
449 ::decode(legacy_mds_map
.cas_pool
, p
);
452 // kclient ignores everything from here
457 ::decode(legacy_mds_map
.compat
, p
);
459 legacy_mds_map
.compat
= get_mdsmap_compat_set_base();
463 legacy_mds_map
.metadata_pool
= n
;
465 ::decode(legacy_mds_map
.metadata_pool
, p
);
467 ::decode(legacy_mds_map
.created
, p
);
468 ::decode(legacy_mds_map
.modified
, p
);
469 ::decode(legacy_mds_map
.tableserver
, p
);
470 ::decode(legacy_mds_map
.in
, p
);
471 std::map
<mds_rank_t
,int32_t> inc
; // Legacy field, parse and drop
473 ::decode(legacy_mds_map
.up
, p
);
474 ::decode(legacy_mds_map
.failed
, p
);
475 ::decode(legacy_mds_map
.stopped
, p
);
477 ::decode(legacy_mds_map
.last_failure_osd_epoch
, p
);
480 // previously this was a bool about snaps, not a flag map
483 legacy_mds_map
.ever_allowed_features
= flag
?
484 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
486 legacy_mds_map
.explicitly_allowed_features
= flag
?
487 CEPH_MDSMAP_ALLOW_SNAPS
: 0;
488 if (legacy_mds_map
.max_mds
> 1) {
489 legacy_mds_map
.set_multimds_allowed();
492 ::decode(legacy_mds_map
.ever_allowed_features
, p
);
493 ::decode(legacy_mds_map
.explicitly_allowed_features
, p
);
496 legacy_mds_map
.ever_allowed_features
= CEPH_MDSMAP_ALLOW_CLASSICS
;
497 legacy_mds_map
.explicitly_allowed_features
= 0;
498 if (legacy_mds_map
.max_mds
> 1) {
499 legacy_mds_map
.set_multimds_allowed();
503 ::decode(legacy_mds_map
.inline_data_enabled
, p
);
506 assert(struct_v
>= 5);
507 ::decode(legacy_mds_map
.enabled
, p
);
508 ::decode(legacy_mds_map
.fs_name
, p
);
510 legacy_mds_map
.fs_name
= "default";
512 // If an MDS has ever been started, epoch will be greater than 1,
513 // assume filesystem is enabled.
514 legacy_mds_map
.enabled
= true;
516 // Upgrading from a cluster that never used an MDS, switch off
517 // filesystem until it's explicitly enabled.
518 legacy_mds_map
.enabled
= false;
523 ::decode(legacy_mds_map
.damaged
, p
);
526 // We're upgrading, populate filesystems from the legacy fields
528 standby_daemons
.clear();
529 standby_epochs
.clear();
531 compat
= legacy_mds_map
.compat
;
532 enable_multiple
= false;
534 // Synthesise a Filesystem from legacy_mds_map, if enabled
535 if (legacy_mds_map
.enabled
) {
536 // Construct a Filesystem from the legacy MDSMap
537 auto migrate_fs
= std::make_shared
<Filesystem
>();
538 migrate_fs
->fscid
= FS_CLUSTER_ID_ANONYMOUS
;
539 migrate_fs
->mds_map
= legacy_mds_map
;
540 migrate_fs
->mds_map
.epoch
= epoch
;
541 filesystems
[migrate_fs
->fscid
] = migrate_fs
;
543 // List of GIDs that had invalid states
544 std::set
<mds_gid_t
> drop_gids
;
546 // Construct mds_roles, standby_daemons, and remove
547 // standbys from the MDSMap in the Filesystem.
548 for (auto &p
: migrate_fs
->mds_map
.mds_info
) {
549 if (p
.second
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
550 // In legacy MDSMap, standby replay daemons don't have
551 // rank set, but since FSMap they do.
552 p
.second
.rank
= p
.second
.standby_for_rank
;
554 if (p
.second
.rank
== MDS_RANK_NONE
) {
555 if (p
.second
.state
!= MDSMap::STATE_STANDBY
) {
556 // Old MDSMaps can have down:dne here, which
557 // is invalid in an FSMap (#17837)
558 drop_gids
.insert(p
.first
);
560 insert(p
.second
); // into standby_daemons
563 mds_roles
[p
.first
] = migrate_fs
->fscid
;
566 for (const auto &p
: standby_daemons
) {
567 // Erase from this Filesystem's MDSMap, because it has
568 // been copied into FSMap::standby_daemons above
569 migrate_fs
->mds_map
.mds_info
.erase(p
.first
);
571 for (const auto &gid
: drop_gids
) {
572 // Throw away all info for this MDS because it was identified
573 // as having invalid state above.
574 migrate_fs
->mds_map
.mds_info
.erase(gid
);
577 legacy_client_fscid
= migrate_fs
->fscid
;
579 legacy_client_fscid
= FS_CLUSTER_ID_NONE
;
583 ::decode(next_filesystem_id
, p
);
584 ::decode(legacy_client_fscid
, p
);
586 ::decode(enable_multiple
, p
);
587 std::vector
<Filesystem
> fs_list
;
588 ::decode(fs_list
, p
);
590 for (std::vector
<Filesystem
>::const_iterator fs
= fs_list
.begin(); fs
!= fs_list
.end(); ++fs
) {
591 filesystems
[fs
->fscid
] = std::make_shared
<Filesystem
>(*fs
);
594 ::decode(mds_roles
, p
);
595 ::decode(standby_daemons
, p
);
596 ::decode(standby_epochs
, p
);
598 ::decode(ever_enabled_multiple
, p
);
606 void Filesystem::encode(bufferlist
& bl
, uint64_t features
) const
608 ENCODE_START(1, 1, bl
);
610 bufferlist mdsmap_bl
;
611 mds_map
.encode(mdsmap_bl
, features
);
612 ::encode(mdsmap_bl
, bl
);
616 void Filesystem::decode(bufferlist::iterator
& p
)
620 bufferlist mdsmap_bl
;
621 ::decode(mdsmap_bl
, p
);
622 bufferlist::iterator mdsmap_bl_iter
= mdsmap_bl
.begin();
623 mds_map
.decode(mdsmap_bl_iter
);
627 int FSMap::parse_filesystem(
628 std::string
const &ns_str
,
629 std::shared_ptr
<const Filesystem
> *result
633 fs_cluster_id_t fscid
= strict_strtol(ns_str
.c_str(), 10, &ns_err
);
634 if (!ns_err
.empty() || filesystems
.count(fscid
) == 0) {
635 for (auto &fs
: filesystems
) {
636 if (fs
.second
->mds_map
.fs_name
== ns_str
) {
637 *result
= std::const_pointer_cast
<const Filesystem
>(fs
.second
);
643 *result
= get_filesystem(fscid
);
648 void Filesystem::print(std::ostream
&out
) const
650 out
<< "Filesystem '" << mds_map
.fs_name
651 << "' (" << fscid
<< ")" << std::endl
;
655 mds_gid_t
FSMap::find_standby_for(mds_role_t role
, const std::string
& name
) const
657 mds_gid_t result
= MDS_GID_NONE
;
659 // First see if we have a STANDBY_REPLAY
660 auto fs
= get_filesystem(role
.fscid
);
661 for (const auto &i
: fs
->mds_map
.mds_info
) {
662 const auto &info
= i
.second
;
663 if (info
.rank
== role
.rank
&& info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
664 return info
.global_id
;
668 // See if there are any STANDBY daemons available
669 for (const auto &i
: standby_daemons
) {
670 const auto &gid
= i
.first
;
671 const auto &info
= i
.second
;
672 assert(info
.state
== MDSMap::STATE_STANDBY
);
673 assert(info
.rank
== MDS_RANK_NONE
);
679 // The mds_info_t may or may not tell us exactly which filesystem
680 // the standby_for_rank refers to: lookup via legacy_client_fscid
681 mds_role_t target_role
= {
682 info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
?
683 legacy_client_fscid
: info
.standby_for_fscid
,
684 info
.standby_for_rank
};
686 if ((target_role
.rank
== role
.rank
&& target_role
.fscid
== role
.fscid
)
687 || (name
.length() && info
.standby_for_name
== name
)) {
688 // It's a named standby for *me*, use it.
691 info
.standby_for_rank
< 0 && info
.standby_for_name
.length() == 0 &&
692 (info
.standby_for_fscid
== FS_CLUSTER_ID_NONE
||
693 info
.standby_for_fscid
== role
.fscid
)) {
694 // It's not a named standby for anyone, use it if we don't find
695 // a named standby for me later, unless it targets another FSCID.
703 mds_gid_t
FSMap::find_unused_for(mds_role_t role
,
704 bool force_standby_active
) const {
705 for (const auto &i
: standby_daemons
) {
706 const auto &gid
= i
.first
;
707 const auto &info
= i
.second
;
708 assert(info
.state
== MDSMap::STATE_STANDBY
);
710 if (info
.laggy() || info
.rank
>= 0)
713 if (info
.standby_for_fscid
!= FS_CLUSTER_ID_NONE
&&
714 info
.standby_for_fscid
!= role
.fscid
)
716 if (info
.standby_for_rank
!= MDS_RANK_NONE
&&
717 info
.standby_for_rank
!= role
.rank
)
720 // To be considered 'unused' a daemon must either not
721 // be selected for standby-replay or the force_standby_active
722 // setting must be enabled to use replay daemons anyway.
723 if (!info
.standby_replay
|| force_standby_active
) {
730 mds_gid_t
FSMap::find_replacement_for(mds_role_t role
, const std::string
& name
,
731 bool force_standby_active
) const {
732 const mds_gid_t standby
= find_standby_for(role
, name
);
736 return find_unused_for(role
, force_standby_active
);
739 void FSMap::sanity() const
741 if (legacy_client_fscid
!= FS_CLUSTER_ID_NONE
) {
742 assert(filesystems
.count(legacy_client_fscid
) == 1);
745 for (const auto &i
: filesystems
) {
747 assert(fs
->mds_map
.compat
.compare(compat
) == 0);
748 assert(fs
->fscid
== i
.first
);
749 for (const auto &j
: fs
->mds_map
.mds_info
) {
750 assert(j
.second
.rank
!= MDS_RANK_NONE
);
751 assert(mds_roles
.count(j
.first
) == 1);
752 assert(standby_daemons
.count(j
.first
) == 0);
753 assert(standby_epochs
.count(j
.first
) == 0);
754 assert(mds_roles
.at(j
.first
) == i
.first
);
755 if (j
.second
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
756 assert(fs
->mds_map
.up
.at(j
.second
.rank
) == j
.first
);
757 assert(fs
->mds_map
.failed
.count(j
.second
.rank
) == 0);
758 assert(fs
->mds_map
.damaged
.count(j
.second
.rank
) == 0);
762 for (const auto &j
: fs
->mds_map
.up
) {
763 mds_rank_t rank
= j
.first
;
764 assert(fs
->mds_map
.in
.count(rank
) == 1);
765 mds_gid_t gid
= j
.second
;
766 assert(fs
->mds_map
.mds_info
.count(gid
) == 1);
770 for (const auto &i
: standby_daemons
) {
771 assert(i
.second
.state
== MDSMap::STATE_STANDBY
);
772 assert(i
.second
.rank
== MDS_RANK_NONE
);
773 assert(i
.second
.global_id
== i
.first
);
774 assert(standby_epochs
.count(i
.first
) == 1);
775 assert(mds_roles
.count(i
.first
) == 1);
776 assert(mds_roles
.at(i
.first
) == FS_CLUSTER_ID_NONE
);
779 for (const auto &i
: standby_epochs
) {
780 assert(standby_daemons
.count(i
.first
) == 1);
783 for (const auto &i
: mds_roles
) {
784 if (i
.second
== FS_CLUSTER_ID_NONE
) {
785 assert(standby_daemons
.count(i
.first
) == 1);
787 assert(filesystems
.count(i
.second
) == 1);
788 assert(filesystems
.at(i
.second
)->mds_map
.mds_info
.count(i
.first
) == 1);
794 mds_gid_t standby_gid
,
795 const std::shared_ptr
<Filesystem
> &filesystem
,
796 mds_rank_t assigned_rank
)
798 assert(gid_exists(standby_gid
));
799 bool is_standby_replay
= mds_roles
.at(standby_gid
) != FS_CLUSTER_ID_NONE
;
800 if (!is_standby_replay
) {
801 assert(standby_daemons
.count(standby_gid
));
802 assert(standby_daemons
.at(standby_gid
).state
== MDSMap::STATE_STANDBY
);
805 MDSMap
&mds_map
= filesystem
->mds_map
;
807 // Insert daemon state to Filesystem
808 if (!is_standby_replay
) {
809 mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
811 assert(mds_map
.mds_info
.count(standby_gid
));
812 assert(mds_map
.mds_info
.at(standby_gid
).state
== MDSMap::STATE_STANDBY_REPLAY
);
813 assert(mds_map
.mds_info
.at(standby_gid
).rank
== assigned_rank
);
815 MDSMap::mds_info_t
&info
= mds_map
.mds_info
[standby_gid
];
817 if (mds_map
.stopped
.erase(assigned_rank
)) {
818 // The cluster is being expanded with a stopped rank
819 info
.state
= MDSMap::STATE_STARTING
;
820 } else if (!mds_map
.is_in(assigned_rank
)) {
821 // The cluster is being expanded with a new rank
822 info
.state
= MDSMap::STATE_CREATING
;
824 // An existing rank is being assigned to a replacement
825 info
.state
= MDSMap::STATE_REPLAY
;
826 mds_map
.failed
.erase(assigned_rank
);
828 info
.rank
= assigned_rank
;
830 mds_roles
[standby_gid
] = filesystem
->fscid
;
832 // Update the rank state in Filesystem
833 mds_map
.in
.insert(assigned_rank
);
834 mds_map
.up
[assigned_rank
] = standby_gid
;
836 // Remove from the list of standbys
837 if (!is_standby_replay
) {
838 standby_daemons
.erase(standby_gid
);
839 standby_epochs
.erase(standby_gid
);
842 // Indicate that Filesystem has been modified
843 mds_map
.epoch
= epoch
;
846 void FSMap::assign_standby_replay(
847 const mds_gid_t standby_gid
,
848 const fs_cluster_id_t leader_ns
,
849 const mds_rank_t leader_rank
)
851 assert(mds_roles
.at(standby_gid
) == FS_CLUSTER_ID_NONE
);
852 assert(gid_exists(standby_gid
));
853 assert(!gid_has_rank(standby_gid
));
854 assert(standby_daemons
.count(standby_gid
));
856 // Insert to the filesystem
857 auto fs
= filesystems
.at(leader_ns
);
858 fs
->mds_map
.mds_info
[standby_gid
] = standby_daemons
.at(standby_gid
);
859 fs
->mds_map
.mds_info
[standby_gid
].rank
= leader_rank
;
860 fs
->mds_map
.mds_info
[standby_gid
].state
= MDSMap::STATE_STANDBY_REPLAY
;
861 mds_roles
[standby_gid
] = leader_ns
;
863 // Remove from the list of standbys
864 standby_daemons
.erase(standby_gid
);
865 standby_epochs
.erase(standby_gid
);
867 // Indicate that Filesystem has been modified
868 fs
->mds_map
.epoch
= epoch
;
871 void FSMap::erase(mds_gid_t who
, epoch_t blacklist_epoch
)
873 if (mds_roles
.at(who
) == FS_CLUSTER_ID_NONE
) {
874 standby_daemons
.erase(who
);
875 standby_epochs
.erase(who
);
877 auto &fs
= filesystems
.at(mds_roles
.at(who
));
878 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
879 if (info
.state
!= MDSMap::STATE_STANDBY_REPLAY
) {
880 if (info
.state
== MDSMap::STATE_CREATING
) {
881 // If this gid didn't make it past CREATING, then forget
882 // the rank ever existed so that next time it's handed out
883 // to a gid it'll go back into CREATING.
884 fs
->mds_map
.in
.erase(info
.rank
);
886 // Put this rank into the failed list so that the next available
887 // STANDBY will pick it up.
888 fs
->mds_map
.failed
.insert(info
.rank
);
890 assert(fs
->mds_map
.up
.at(info
.rank
) == info
.global_id
);
891 fs
->mds_map
.up
.erase(info
.rank
);
893 fs
->mds_map
.mds_info
.erase(who
);
894 fs
->mds_map
.last_failure_osd_epoch
= blacklist_epoch
;
895 fs
->mds_map
.epoch
= epoch
;
898 mds_roles
.erase(who
);
901 void FSMap::damaged(mds_gid_t who
, epoch_t blacklist_epoch
)
903 assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
904 auto fs
= filesystems
.at(mds_roles
.at(who
));
905 mds_rank_t rank
= fs
->mds_map
.mds_info
[who
].rank
;
907 erase(who
, blacklist_epoch
);
908 fs
->mds_map
.failed
.erase(rank
);
909 fs
->mds_map
.damaged
.insert(rank
);
911 assert(fs
->mds_map
.epoch
== epoch
);
915 * Update to indicate that the rank `rank` is to be removed
916 * from the damaged list of the filesystem `fscid`
918 bool FSMap::undamaged(const fs_cluster_id_t fscid
, const mds_rank_t rank
)
920 auto fs
= filesystems
.at(fscid
);
922 if (fs
->mds_map
.damaged
.erase(rank
)) {
923 fs
->mds_map
.failed
.insert(rank
);
924 fs
->mds_map
.epoch
= epoch
;
931 void FSMap::insert(const MDSMap::mds_info_t
&new_info
)
933 assert(new_info
.state
== MDSMap::STATE_STANDBY
);
934 assert(new_info
.rank
== MDS_RANK_NONE
);
935 mds_roles
[new_info
.global_id
] = FS_CLUSTER_ID_NONE
;
936 standby_daemons
[new_info
.global_id
] = new_info
;
937 standby_epochs
[new_info
.global_id
] = epoch
;
940 std::list
<mds_gid_t
> FSMap::stop(mds_gid_t who
)
942 assert(mds_roles
.at(who
) != FS_CLUSTER_ID_NONE
);
943 auto fs
= filesystems
.at(mds_roles
.at(who
));
944 const auto &info
= fs
->mds_map
.mds_info
.at(who
);
945 fs
->mds_map
.up
.erase(info
.rank
);
946 fs
->mds_map
.in
.erase(info
.rank
);
947 fs
->mds_map
.stopped
.insert(info
.rank
);
949 // Also drop any standby replays that were following this rank
950 std::list
<mds_gid_t
> standbys
;
951 for (const auto &i
: fs
->mds_map
.mds_info
) {
952 const auto &other_gid
= i
.first
;
953 const auto &other_info
= i
.second
;
954 if (other_info
.rank
== info
.rank
955 && other_info
.state
== MDSMap::STATE_STANDBY_REPLAY
) {
956 standbys
.push_back(other_gid
);
961 fs
->mds_map
.mds_info
.erase(who
);
962 mds_roles
.erase(who
);
964 fs
->mds_map
.epoch
= epoch
;
971 * Given one of the following forms:
976 * Parse into a mds_role_t. The rank-only form is only valid
977 * if legacy_client_fscid is set.
979 int FSMap::parse_role(
980 const std::string
&role_str
,
982 std::ostream
&ss
) const
984 size_t colon_pos
= role_str
.find(":");
986 std::shared_ptr
<const Filesystem
> fs
;
987 if (colon_pos
== std::string::npos
) {
988 if (legacy_client_fscid
== FS_CLUSTER_ID_NONE
) {
989 ss
<< "No filesystem selected";
992 fs
= get_filesystem(legacy_client_fscid
);
995 if (parse_filesystem(role_str
.substr(0, colon_pos
), &fs
) < 0) {
996 ss
<< "Invalid filesystem";
999 rank_pos
= colon_pos
+1;
1004 std::string rank_str
= role_str
.substr(rank_pos
);
1005 long rank_i
= strict_strtol(rank_str
.c_str(), 10, &err
);
1006 if (rank_i
< 0 || !err
.empty()) {
1007 ss
<< "Invalid rank '" << rank_str
<< "'";
1013 if (fs
->mds_map
.in
.count(rank
) == 0) {
1014 ss
<< "Rank '" << rank
<< "' not found";
1018 *role
= {fs
->fscid
, rank
};