1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
#include <utility>

#include "include/types.h"
#include "common/Clock.h"
#include "msg/Message.h"
#include "common/config.h"
#include "include/CompatSet.h"
#include "include/ceph_features.h"
#include "common/Formatter.h"
#include "mds/mdstypes.h"
38 boot --> standby, creating, or starting.
41 dne ----> creating -----> active*
47 stopped <---- stopping* <-/ / |
49 ----- starting* ----/ |
53 \--> replay* --> reconnect* --> rejoin*
61 extern CompatSet
get_mdsmap_compat_set_all();
62 extern CompatSet
get_mdsmap_compat_set_default();
63 extern CompatSet
get_mdsmap_compat_set_base(); // pre v0.20
// Incompat features known to the MDS.  Each entry pairs a numeric feature id
// with a human-readable name, so every id must be unique.
#define MDS_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "base v0.20")
#define MDS_FEATURE_INCOMPAT_CLIENTRANGES CompatSet::Feature(2, "client writeable ranges")
#define MDS_FEATURE_INCOMPAT_FILELAYOUT CompatSet::Feature(3, "default file layouts on dirs")
#define MDS_FEATURE_INCOMPAT_DIRINODE CompatSet::Feature(4, "dir inode in separate object")
#define MDS_FEATURE_INCOMPAT_ENCODING CompatSet::Feature(5, "mds uses versioned encoding")
#define MDS_FEATURE_INCOMPAT_OMAPDIRFRAG CompatSet::Feature(6, "dirfrag is stored in omap")
#define MDS_FEATURE_INCOMPAT_INLINE CompatSet::Feature(7, "mds uses inline data")
#define MDS_FEATURE_INCOMPAT_NOANCHOR CompatSet::Feature(8, "no anchor table")
// This used to reuse id 8, which made it indistinguishable from NOANCHOR.
// NOTE(review): maps persisted while both features shared id 8 may need a
// fix-up when loaded -- confirm against the decode path.
#define MDS_FEATURE_INCOMPAT_FILE_LAYOUT_V2 CompatSet::Feature(9, "file layout v2")

// Default file system name.
#define MDS_FS_NAME_DEFAULT "cephfs"
79 /* These states are the union of the set of possible states of an MDS daemon,
80 * and the set of possible states of an MDS rank */
82 // States of an MDS daemon not currently holding a rank
83 // ====================================================
84 STATE_NULL
= CEPH_MDS_STATE_NULL
, // null value for fns returning this type.
85 STATE_BOOT
= CEPH_MDS_STATE_BOOT
, // up, boot announcement. destiny unknown.
86 STATE_STANDBY
= CEPH_MDS_STATE_STANDBY
, // up, idle. waiting for assignment by monitor.
87 STATE_STANDBY_REPLAY
= CEPH_MDS_STATE_STANDBY_REPLAY
, // up, replaying active node, ready to take over.
89 // States of an MDS rank, and of any MDS daemon holding that rank
90 // ==============================================================
91 STATE_STOPPED
= CEPH_MDS_STATE_STOPPED
, // down, once existed, but no subtrees. empty log. may not be held by a daemon.
93 STATE_CREATING
= CEPH_MDS_STATE_CREATING
, // up, creating MDS instance (new journal, idalloc..).
94 STATE_STARTING
= CEPH_MDS_STATE_STARTING
, // up, starting prior stopped MDS instance.
96 STATE_REPLAY
= CEPH_MDS_STATE_REPLAY
, // up, starting prior failed instance. scanning journal.
97 STATE_RESOLVE
= CEPH_MDS_STATE_RESOLVE
, // up, disambiguating distributed operations (import, rename, etc.)
98 STATE_RECONNECT
= CEPH_MDS_STATE_RECONNECT
, // up, reconnect to clients
99 STATE_REJOIN
= CEPH_MDS_STATE_REJOIN
, // up, replayed journal, rejoining distributed cache
100 STATE_CLIENTREPLAY
= CEPH_MDS_STATE_CLIENTREPLAY
, // up, replaying client operations
101 STATE_ACTIVE
= CEPH_MDS_STATE_ACTIVE
, // up, active
102 STATE_STOPPING
= CEPH_MDS_STATE_STOPPING
, // up, exporting metadata (-> standby or out)
103 STATE_DNE
= CEPH_MDS_STATE_DNE
, // down, rank does not exist
105 // State which a daemon may send to MDSMonitor in its beacon
106 // to indicate that offline repair is required. Daemon must stop
107 // immediately after indicating this state.
108 STATE_DAMAGED
= CEPH_MDS_STATE_DAMAGED
 * In addition to explicit states, an MDS rank is implicitly in state:
 * - STOPPED if it is not currently associated with an MDS daemon gid but it
 *   is in MDSMap::stopped
 * - FAILED if it is not currently associated with an MDS daemon gid but it
 *   is in MDSMap::failed
 * - DNE if it is not currently associated with an MDS daemon gid and it is
 *   missing from both MDSMap::failed and MDSMap::stopped
126 MDSMap::DaemonState state
;
130 mds_rank_t standby_for_rank
;
131 std::string standby_for_name
;
132 fs_cluster_id_t standby_for_fscid
;
134 std::set
<mds_rank_t
> export_targets
;
135 uint64_t mds_features
;
137 mds_info_t() : global_id(MDS_GID_NONE
), rank(MDS_RANK_NONE
), inc(0),
138 state(STATE_STANDBY
), state_seq(0),
139 standby_for_rank(MDS_RANK_NONE
),
140 standby_for_fscid(FS_CLUSTER_ID_NONE
),
141 standby_replay(false)
144 bool laggy() const { return !(laggy_since
== utime_t()); }
145 void clear_laggy() { laggy_since
= utime_t(); }
147 entity_inst_t
get_inst() const { return entity_inst_t(entity_name_t::MDS(rank
), addr
); }
149 void encode(bufferlist
& bl
, uint64_t features
) const {
150 if ((features
& CEPH_FEATURE_MDSENC
) == 0 ) encode_unversioned(bl
);
151 else encode_versioned(bl
, features
);
153 void decode(bufferlist::iterator
& p
);
154 void dump(Formatter
*f
) const;
155 void print_summary(ostream
&out
) const;
156 static void generate_test_instances(list
<mds_info_t
*>& ls
);
158 void encode_versioned(bufferlist
& bl
, uint64_t features
) const;
159 void encode_unversioned(bufferlist
& bl
) const;
168 uint32_t flags
; // flags
169 epoch_t last_failure
; // mds epoch of last failure
170 epoch_t last_failure_osd_epoch
; // osd epoch of last failure; any mds entering replay needs
171 // at least this osdmap to ensure the blacklist propagates.
172 utime_t created
, modified
;
174 mds_rank_t tableserver
; // which MDS has snaptable
175 mds_rank_t root
; // which MDS has root directory
177 __u32 session_timeout
;
178 __u32 session_autoclose
;
179 uint64_t max_file_size
;
181 std::set
<int64_t> data_pools
; // file data pools available to clients (via an ioctl). first is the default.
182 int64_t cas_pool
; // where CAS objects go
183 int64_t metadata_pool
; // where fs metadata objects go
186 * in: the set of logical mds #'s that define the cluster. this is the set
187 * of mds's the metadata may be distributed over.
188 * up: map from logical mds #'s to the addrs filling those roles.
189 * failed: subset of @in that are failed.
190 * stopped: set of nodes that have been initialized, but are not active.
192 * @up + @failed = @in. @in * @stopped = {}.
195 mds_rank_t max_mds
; /* The maximum number of active MDSes. Also, the maximum rank. */
196 mds_rank_t standby_count_wanted
;
197 string balancer
; /* The name/version of the mantle balancer (i.e. the rados obj name) */
199 std::set
<mds_rank_t
> in
; // currently defined cluster
201 // which ranks are failed, stopped, damaged (i.e. not held by a daemon)
202 std::set
<mds_rank_t
> failed
, stopped
, damaged
;
203 std::map
<mds_rank_t
, mds_gid_t
> up
; // who is in those roles
204 std::map
<mds_gid_t
, mds_info_t
> mds_info
;
206 uint8_t ever_allowed_features
; //< bitmap of features the cluster has allowed
207 uint8_t explicitly_allowed_features
; //< bitmap of features explicitly enabled
209 bool inline_data_enabled
;
211 uint64_t cached_up_features
;
216 friend class MDSMonitor
;
217 friend class Filesystem
;
222 : epoch(0), enabled(false), fs_name(MDS_FS_NAME_DEFAULT
),
223 flags(CEPH_MDSMAP_DEFAULTS
), last_failure(0),
224 last_failure_osd_epoch(0),
225 tableserver(0), root(0),
227 session_autoclose(0),
232 standby_count_wanted(-1),
233 ever_allowed_features(0),
234 explicitly_allowed_features(0),
235 inline_data_enabled(false),
236 cached_up_features(0)
239 bool get_inline_data_enabled() const { return inline_data_enabled
; }
240 void set_inline_data_enabled(bool enabled
) { inline_data_enabled
= enabled
; }
242 utime_t
get_session_timeout() const {
243 return utime_t(session_timeout
,0);
245 uint64_t get_max_filesize() const { return max_file_size
; }
246 void set_max_filesize(uint64_t m
) { max_file_size
= m
; }
248 int get_flags() const { return flags
; }
249 bool test_flag(int f
) const { return flags
& f
; }
250 void set_flag(int f
) { flags
|= f
; }
251 void clear_flag(int f
) { flags
&= ~f
; }
253 const std::string
&get_fs_name() const {return fs_name
;}
255 void set_snaps_allowed() {
256 set_flag(CEPH_MDSMAP_ALLOW_SNAPS
);
257 ever_allowed_features
|= CEPH_MDSMAP_ALLOW_SNAPS
;
258 explicitly_allowed_features
|= CEPH_MDSMAP_ALLOW_SNAPS
;
260 void clear_snaps_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_SNAPS
); }
261 bool allows_snaps() const { return test_flag(CEPH_MDSMAP_ALLOW_SNAPS
); }
263 void set_multimds_allowed() {
264 set_flag(CEPH_MDSMAP_ALLOW_MULTIMDS
);
265 ever_allowed_features
|= CEPH_MDSMAP_ALLOW_MULTIMDS
;
266 explicitly_allowed_features
|= CEPH_MDSMAP_ALLOW_MULTIMDS
;
268 void clear_multimds_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_MULTIMDS
); }
269 bool allows_multimds() const { return test_flag(CEPH_MDSMAP_ALLOW_MULTIMDS
); }
271 void set_dirfrags_allowed() {
272 set_flag(CEPH_MDSMAP_ALLOW_DIRFRAGS
);
273 ever_allowed_features
|= CEPH_MDSMAP_ALLOW_DIRFRAGS
;
274 explicitly_allowed_features
|= CEPH_MDSMAP_ALLOW_DIRFRAGS
;
276 void clear_dirfrags_allowed() { clear_flag(CEPH_MDSMAP_ALLOW_DIRFRAGS
); }
277 bool allows_dirfrags() const { return test_flag(CEPH_MDSMAP_ALLOW_DIRFRAGS
); }
279 epoch_t
get_epoch() const { return epoch
; }
280 void inc_epoch() { epoch
++; }
282 bool get_enabled() const { return enabled
; }
284 const utime_t
& get_created() const { return created
; }
285 void set_created(utime_t ct
) { modified
= created
= ct
; }
286 const utime_t
& get_modified() const { return modified
; }
287 void set_modified(utime_t mt
) { modified
= mt
; }
289 epoch_t
get_last_failure() const { return last_failure
; }
290 epoch_t
get_last_failure_osd_epoch() const { return last_failure_osd_epoch
; }
292 mds_rank_t
get_max_mds() const { return max_mds
; }
293 void set_max_mds(mds_rank_t m
) { max_mds
= m
; }
295 mds_rank_t
get_standby_count_wanted(mds_rank_t standby_daemon_count
) const {
296 assert(standby_daemon_count
>= 0);
297 std::set
<mds_rank_t
> s
;
298 get_standby_replay_mds_set(s
);
299 mds_rank_t standbys_avail
= (mds_rank_t
)s
.size()+standby_daemon_count
;
300 mds_rank_t wanted
= std::max(0, standby_count_wanted
);
301 return wanted
> standbys_avail
? wanted
- standbys_avail
: 0;
303 void set_standby_count_wanted(mds_rank_t n
) { standby_count_wanted
= n
; }
304 bool check_health(mds_rank_t standby_daemon_count
);
306 const std::string
get_balancer() const { return balancer
; }
307 void set_balancer(std::string val
) { balancer
.assign(val
); }
309 mds_rank_t
get_tableserver() const { return tableserver
; }
310 mds_rank_t
get_root() const { return root
; }
312 const std::set
<int64_t> &get_data_pools() const { return data_pools
; }
313 int64_t get_first_data_pool() const { return *data_pools
.begin(); }
314 int64_t get_metadata_pool() const { return metadata_pool
; }
315 bool is_data_pool(int64_t poolid
) const {
316 return data_pools
.count(poolid
);
319 bool pool_in_use(int64_t poolid
) const {
320 return get_enabled() && (is_data_pool(poolid
) || metadata_pool
== poolid
);
323 const std::map
<mds_gid_t
,mds_info_t
>& get_mds_info() const { return mds_info
; }
324 const mds_info_t
& get_mds_info_gid(mds_gid_t gid
) const {
325 return mds_info
.at(gid
);
327 const mds_info_t
& get_mds_info(mds_rank_t m
) const {
328 assert(up
.count(m
) && mds_info
.count(up
.at(m
)));
329 return mds_info
.at(up
.at(m
));
331 mds_gid_t
find_mds_gid_by_name(const std::string
& s
) const {
332 for (std::map
<mds_gid_t
,mds_info_t
>::const_iterator p
= mds_info
.begin();
335 if (p
->second
.name
== s
) {
343 unsigned get_num_in_mds() const {
346 unsigned get_num_up_mds() const {
349 int get_num_failed_mds() const {
350 return failed
.size();
352 unsigned get_num_mds(int state
) const {
354 for (std::map
<mds_gid_t
,mds_info_t
>::const_iterator p
= mds_info
.begin();
357 if (p
->second
.state
== state
) ++n
;
362 void add_data_pool(int64_t poolid
) {
363 data_pools
.insert(poolid
);
365 int remove_data_pool(int64_t poolid
) {
366 std::set
<int64_t>::iterator p
= data_pools
.find(poolid
);
367 if (p
== data_pools
.end())
374 void get_mds_set(std::set
<mds_rank_t
>& s
) const {
377 void get_up_mds_set(std::set
<mds_rank_t
>& s
) const {
378 for (std::map
<mds_rank_t
, mds_gid_t
>::const_iterator p
= up
.begin();
383 void get_active_mds_set(std::set
<mds_rank_t
>& s
) const {
384 get_mds_set(s
, MDSMap::STATE_ACTIVE
);
386 void get_standby_replay_mds_set(std::set
<mds_rank_t
>& s
) const {
387 get_mds_set(s
, MDSMap::STATE_STANDBY_REPLAY
);
389 void get_failed_mds_set(std::set
<mds_rank_t
>& s
) const {
394 uint64_t get_up_features() {
395 if (!cached_up_features
) {
397 for (std::map
<mds_rank_t
, mds_gid_t
>::const_iterator p
= up
.begin();
400 std::map
<mds_gid_t
, mds_info_t
>::const_iterator q
=
401 mds_info
.find(p
->second
);
402 assert(q
!= mds_info
.end());
404 cached_up_features
= q
->second
.mds_features
;
407 cached_up_features
&= q
->second
.mds_features
;
411 return cached_up_features
;
415 * Get MDS ranks which are in but not up.
417 void get_down_mds_set(std::set
<mds_rank_t
> *s
) const
420 s
->insert(failed
.begin(), failed
.end());
421 s
->insert(damaged
.begin(), damaged
.end());
424 int get_failed() const {
425 if (!failed
.empty()) return *failed
.begin();
428 void get_stopped_mds_set(std::set
<mds_rank_t
>& s
) const {
431 void get_recovery_mds_set(std::set
<mds_rank_t
>& s
) const {
433 for (const auto& p
: damaged
)
435 for (const auto& p
: mds_info
)
436 if (p
.second
.state
>= STATE_REPLAY
&& p
.second
.state
<= STATE_STOPPING
)
437 s
.insert(p
.second
.rank
);
441 get_clientreplay_or_active_or_stopping_mds_set(std::set
<mds_rank_t
>& s
) const {
442 for (std::map
<mds_gid_t
, mds_info_t
>::const_iterator p
= mds_info
.begin();
445 if (p
->second
.state
>= STATE_CLIENTREPLAY
&& p
->second
.state
<= STATE_STOPPING
)
446 s
.insert(p
->second
.rank
);
448 void get_mds_set(std::set
<mds_rank_t
>& s
, DaemonState state
) const {
449 for (std::map
<mds_gid_t
, mds_info_t
>::const_iterator p
= mds_info
.begin();
452 if (p
->second
.state
== state
)
453 s
.insert(p
->second
.rank
);
456 void get_health(list
<pair
<health_status_t
,std::string
> >& summary
,
457 list
<pair
<health_status_t
,std::string
> > *detail
) const;
462 TRANSIENT_UNAVAILABLE
= 1,
463 STUCK_UNAVAILABLE
= 2
468 * Return indication of whether cluster is available. This is a
469 * heuristic for clients to see if they should bother waiting to talk to
470 * MDSs, or whether they should error out at startup/mount.
 * A TRANSIENT_UNAVAILABLE result indicates that the cluster is in a
 * transition state like replaying, or is potentially about to fail over.
 * Clients should wait for an updated map before making a final decision
 * about whether the filesystem is mountable.
 * A STUCK_UNAVAILABLE result indicates that we can't see a way that
 * the cluster is about to recover on its own, so it'll probably require
 * administrator intervention: clients should probably not bother trying
482 availability_t
is_cluster_available() const;
485 bool is_down(mds_rank_t m
) const { return up
.count(m
) == 0; }
486 bool is_up(mds_rank_t m
) const { return up
.count(m
); }
487 bool is_in(mds_rank_t m
) const { return up
.count(m
) || failed
.count(m
); }
488 bool is_out(mds_rank_t m
) const { return !is_in(m
); }
490 bool is_failed(mds_rank_t m
) const { return failed
.count(m
); }
491 bool is_stopped(mds_rank_t m
) const { return stopped
.count(m
); }
493 bool is_dne(mds_rank_t m
) const { return in
.count(m
) == 0; }
494 bool is_dne_gid(mds_gid_t gid
) const { return mds_info
.count(gid
) == 0; }
497 * Get MDS rank state if the rank is up, else STATE_NULL
499 DaemonState
get_state(mds_rank_t m
) const {
500 std::map
<mds_rank_t
, mds_gid_t
>::const_iterator u
= up
.find(m
);
503 return get_state_gid(u
->second
);
507 * Get MDS daemon status by GID
509 DaemonState
get_state_gid(mds_gid_t gid
) const {
510 std::map
<mds_gid_t
,mds_info_t
>::const_iterator i
= mds_info
.find(gid
);
511 if (i
== mds_info
.end())
513 return i
->second
.state
;
516 const mds_info_t
& get_info(const mds_rank_t m
) const {
517 return mds_info
.at(up
.at(m
));
519 const mds_info_t
& get_info_gid(const mds_gid_t gid
) const {
520 return mds_info
.at(gid
);
523 bool is_boot(mds_rank_t m
) const { return get_state(m
) == STATE_BOOT
; }
524 bool is_creating(mds_rank_t m
) const { return get_state(m
) == STATE_CREATING
; }
525 bool is_starting(mds_rank_t m
) const { return get_state(m
) == STATE_STARTING
; }
526 bool is_replay(mds_rank_t m
) const { return get_state(m
) == STATE_REPLAY
; }
527 bool is_resolve(mds_rank_t m
) const { return get_state(m
) == STATE_RESOLVE
; }
528 bool is_reconnect(mds_rank_t m
) const { return get_state(m
) == STATE_RECONNECT
; }
529 bool is_rejoin(mds_rank_t m
) const { return get_state(m
) == STATE_REJOIN
; }
530 bool is_clientreplay(mds_rank_t m
) const { return get_state(m
) == STATE_CLIENTREPLAY
; }
531 bool is_active(mds_rank_t m
) const { return get_state(m
) == STATE_ACTIVE
; }
532 bool is_stopping(mds_rank_t m
) const { return get_state(m
) == STATE_STOPPING
; }
533 bool is_active_or_stopping(mds_rank_t m
) const {
534 return is_active(m
) || is_stopping(m
);
536 bool is_clientreplay_or_active_or_stopping(mds_rank_t m
) const {
537 return is_clientreplay(m
) || is_active(m
) || is_stopping(m
);
540 bool is_followable(mds_rank_t m
) const {
541 return (is_resolve(m
) ||
544 is_clientreplay(m
) ||
549 bool is_laggy_gid(mds_gid_t gid
) const {
550 if (!mds_info
.count(gid
))
552 std::map
<mds_gid_t
,mds_info_t
>::const_iterator p
= mds_info
.find(gid
);
553 return p
->second
.laggy();
556 // degraded = some recovery in process. fixes active membership and
558 bool is_degraded() const {
559 if (!failed
.empty() || !damaged
.empty())
561 for (std::map
<mds_gid_t
,mds_info_t
>::const_iterator p
= mds_info
.begin();
564 if (p
->second
.state
>= STATE_REPLAY
&& p
->second
.state
<= STATE_CLIENTREPLAY
)
568 bool is_any_failed() const {
569 return failed
.size();
571 bool is_resolving() const {
573 get_num_mds(STATE_RESOLVE
) > 0 &&
574 get_num_mds(STATE_REPLAY
) == 0 &&
575 failed
.empty() && damaged
.empty();
577 bool is_rejoining() const {
578 // nodes are rejoining cache state
580 get_num_mds(STATE_REJOIN
) > 0 &&
581 get_num_mds(STATE_REPLAY
) == 0 &&
582 get_num_mds(STATE_RECONNECT
) == 0 &&
583 get_num_mds(STATE_RESOLVE
) == 0 &&
584 failed
.empty() && damaged
.empty();
586 bool is_stopped() const {
591 * Get whether a rank is 'up', i.e. has
592 * an MDS daemon's entity_inst_t associated
595 bool have_inst(mds_rank_t m
) const {
600 * Get the MDS daemon entity_inst_t for a rank
603 const entity_inst_t
get_inst(mds_rank_t m
) {
605 return mds_info
[up
[m
]].get_inst();
607 const entity_addr_t
get_addr(mds_rank_t m
) {
609 return mds_info
[up
[m
]].addr
;
613 * Get the MDS daemon entity_inst_t for a rank,
616 * @return true if the rank was up and the inst
617 * was populated, else false.
619 bool get_inst(mds_rank_t m
, entity_inst_t
& inst
) {
627 mds_rank_t
get_rank_gid(mds_gid_t gid
) const {
628 if (mds_info
.count(gid
)) {
629 return mds_info
.at(gid
).rank
;
631 return MDS_RANK_NONE
;
635 int get_inc_gid(mds_gid_t gid
) const {
636 auto mds_info_entry
= mds_info
.find(gid
);
637 if (mds_info_entry
!= mds_info
.end())
638 return mds_info_entry
->second
.inc
;
641 void encode(bufferlist
& bl
, uint64_t features
) const;
642 void decode(bufferlist::iterator
& p
);
643 void decode(bufferlist
& bl
) {
644 bufferlist::iterator p
= bl
.begin();
649 void print(ostream
& out
) const;
650 void print_summary(Formatter
*f
, ostream
*out
) const;
652 void dump(Formatter
*f
) const;
653 static void generate_test_instances(list
<MDSMap
*>& ls
);
655 static bool state_transition_valid(DaemonState prev
, DaemonState next
);
657 WRITE_CLASS_ENCODER_FEATURES(MDSMap::mds_info_t
)
658 WRITE_CLASS_ENCODER_FEATURES(MDSMap
)
660 inline ostream
& operator<<(ostream
&out
, const MDSMap
&m
) {
661 m
.print_summary(NULL
, &out
);