1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2015 Red Hat
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
18 #include <boost/utility/string_view.hpp>
20 #include "common/DecayCounter.h"
21 #include "common/LogClient.h"
22 #include "common/Timer.h"
23 #include "common/TrackedOp.h"
25 #include "messages/MCommand.h"
28 #include "DamageTable.h"
30 #include "SessionMap.h"
34 #include "PurgeQueue.h"
35 #include "osdc/Journaler.h"
37 // Full .h import instead of forward declaration for PerfCounter, for the
38 // benefit of those including this header and using MDSRank::logger
39 #include "common/perf_counters.h"
// Fragment of the l_mds_* perfcounter-index enum: these values index
// individual counters on MDSRank::logger (see the perf_counters.h include
// note above).  The enum's opening/closing and many other enumerators
// fall outside this view / were dropped by extraction.
55 l_mds_inodes_pin_tail
,
58 l_mds_inodes_with_caps
,
63 l_mds_traverse_forward
,
64 l_mds_traverse_discover
,
65 l_mds_traverse_dir_fetch
,
66 l_mds_traverse_remote_ino
,
69 l_mds_dispatch_queue_len
,
71 l_mds_exported_inodes
,
73 l_mds_imported_inodes
,
// Forward declarations; full definitions live in other headers.
99 struct heartbeat_handle_d
;
110 class MDSTableServer
;
111 class MDSTableClient
;
// Tail of the MDSRank class doc comment (its opening was dropped by
// extraction):
120 * The public part of this class's interface is what's exposed to all
121 * the various subsystems (server, mdcache, etc), such as pointers
122 * to the other subsystems, and message-sending calls.
// This daemon's rank within the MDS cluster; const for the object's life.
126 const mds_rank_t whoami
;
128 // Incarnation as seen in MDSMap at the point where a rank is
133 mds_rank_t
get_nodeid() const { return whoami
; }
134 int64_t get_metadata_pool();
136 // Reference to global MDS::mds_lock, so that users of MDSRank don't
137 // carry around references to the outer MDS, and we can substitute
138 // a separate lock here in future potentially.
// NOTE(review): the body of get_starttime() was dropped by extraction
// (gap in the original numbering); presumably it returns the `starttime`
// member used by get_uptime() below -- confirm against the full header.
141 mono_time
get_starttime() const {
// Elapsed monotonic time since `starttime`, as a double-second duration.
144 chrono::duration
<double> get_uptime() const {
145 mono_time now
= mono_clock::now();
146 return chrono::duration
<double>(now
-starttime
);
149 class CephContext
*cct
;
151 bool is_daemon_stopping() const;
153 // Reference to global cluster log client, just to avoid initialising
154 // a separate one here.
157 // Reference to global timer utility, because MDSRank and MDSDaemon
158 // currently both use the same mds_lock, so it makes sense for them
// ---- Subsystem instances held by the rank ----------------------------
171 MDBalancer
*balancer
;
172 ScrubStack
*scrubstack
;
173 DamageTable damage_table
;
178 SnapServer
*snapserver
;
179 SnapClient
*snapclient
;
// Map a table id to the corresponding table client/server instance.
181 MDSTableClient
*get_table_client(int t
);
182 MDSTableServer
*get_table_server(int t
);
184 SessionMap sessionmap
;
// Look up a client session via entity_name_t::CLIENT(client.v).
// NOTE(review): the closing brace of this inline body was dropped by
// extraction (gap in the original numbering).
185 Session
*get_session(client_t client
) {
186 return sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
// Resolve the session associated with a message's sender.
188 Session
*get_session(Message
*m
);
// Perf counters; `logger` holds the l_mds_* counters (see get_num_requests()
// below) -- the exact logger/mlogger split is not visible here.
190 PerfCounters
*logger
, *mlogger
;
191 OpTracker op_tracker
;
193 // The last different state I held before current
194 MDSMap::DaemonState last_state
;
195 // The state assigned to me by the MDSMap
196 MDSMap::DaemonState state
;
198 bool cluster_degraded
;
200 MDSMap::DaemonState
get_state() const { return state
; }
201 MDSMap::DaemonState
get_want_state() const { return beacon
.get_want_state(); }
203 bool is_creating() const { return state
== MDSMap::STATE_CREATING
; }
204 bool is_starting() const { return state
== MDSMap::STATE_STARTING
; }
205 bool is_standby() const { return state
== MDSMap::STATE_STANDBY
; }
206 bool is_replay() const { return state
== MDSMap::STATE_REPLAY
; }
207 bool is_standby_replay() const { return state
== MDSMap::STATE_STANDBY_REPLAY
; }
208 bool is_resolve() const { return state
== MDSMap::STATE_RESOLVE
; }
209 bool is_reconnect() const { return state
== MDSMap::STATE_RECONNECT
; }
210 bool is_rejoin() const { return state
== MDSMap::STATE_REJOIN
; }
211 bool is_clientreplay() const { return state
== MDSMap::STATE_CLIENTREPLAY
; }
212 bool is_active() const { return state
== MDSMap::STATE_ACTIVE
; }
213 bool is_stopping() const { return state
== MDSMap::STATE_STOPPING
; }
214 bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
215 bool is_stopped() const { return mdsmap
->is_stopped(whoami
); }
216 bool is_cluster_degraded() const { return cluster_degraded
; }
218 void handle_write_error(int err
);
220 void handle_conf_change(const struct md_config_t
*conf
,
221 const std::set
<std::string
> &changed
)
223 mdcache
->migrator
->handle_conf_change(conf
, changed
, *mdsmap
);
224 purge_queue
.handle_conf_change(conf
, changed
, *mdsmap
);
227 void update_mlogger();
229 // Flag to indicate we entered shutdown: anyone seeing this to be true
230 // after taking mds_lock must drop out.
233 // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
234 // because its init/shutdown happens at the top level.
235 PurgeQueue purge_queue
;
// Helper thread for the rank.  NOTE(review): several members of this
// nested class (and its closing brace) were dropped by extraction; the
// entry() body is not visible here, so its exact duties are defined in
// the .cc.
237 class ProgressThread
: public Thread
{
241 explicit ProgressThread(MDSRank
*mds_
) : mds(mds_
) {}
242 void * entry() override
;
// Wake the thread via its condition variable.
244 void signal() {cond
.Signal();}
247 list
<Message
*> waiting_for_nolaggy
;
248 list
<MDSInternalContextBase
*> finished_queue
;
249 // Dispatch, retry, queues
251 void inc_dispatch_depth() { ++dispatch_depth
; }
252 void dec_dispatch_depth() { --dispatch_depth
; }
253 void retry_dispatch(Message
*m
);
254 bool handle_deferrable_message(Message
*m
);
255 void _advance_queues();
256 bool _dispatch(Message
*m
, bool new_msg
);
258 ceph::heartbeat_handle_d
*hb
; // Heartbeat for threads using mds_lock
260 bool is_stale_message(Message
*m
) const;
262 map
<mds_rank_t
, version_t
> peer_mdsmap_epoch
;
264 ceph_tid_t last_tid
; // for mds-initiated requests (e.g. stray rename)
// Contexts parked until the rank reaches the corresponding state; filled
// by the wait_for_*() helpers further down.
266 list
<MDSInternalContextBase
*> waiting_for_active
, waiting_for_replay
, waiting_for_reconnect
, waiting_for_resolve
;
267 list
<MDSInternalContextBase
*> waiting_for_any_client_connection
;
268 list
<MDSInternalContextBase
*> replay_queue
;
269 map
<mds_rank_t
, list
<MDSInternalContextBase
*> > waiting_for_active_peer
;
270 map
<epoch_t
, list
<MDSInternalContextBase
*> > waiting_for_mdsmap
;
272 epoch_t osd_epoch_barrier
;
274 // Const reference to the beacon so that we can behave differently
279 * Emit clog warnings for any ops reported as warnings by optracker
281 void check_ops_in_flight();
283 int mds_slow_req_count
;
286 * Share MDSMap with clients
288 void bcast_mds_map(); // to mounted clients
289 epoch_t last_client_mdsmap_bcast
;
291 map
<mds_rank_t
,DecayCounter
> export_targets
; /* targets this MDS is exporting to or wants/tries to */
293 void create_logger();
296 void queue_waiter(MDSInternalContextBase
*c
) {
297 finished_queue
.push_back(c
);
298 progress_thread
.signal();
300 void queue_waiters(std::list
<MDSInternalContextBase
*>& ls
) {
301 finished_queue
.splice( finished_queue
.end(), ls
);
302 progress_thread
.signal();
// NOTE(review): lines were dropped by extraction here; the parameters
// below are the tail of a constructor/function parameter list whose head
// is not visible.
308 LogChannelRef
&clog_
,
314 Context
*respawn_hook_
,
315 Context
*suicide_hook_
);
322 // Daemon lifetime functions: these guys break the abstraction
323 // and call up into the parent MDSDaemon instance. It's kind
324 // of unavoidable: if we want any depth into our calls
325 // to be able to e.g. tear down the whole process, we have to
326 // have a reference going all the way down.
333 * Call this periodically if inside a potentially long running piece
334 * of code while holding the mds_lock
336 void heartbeat_reset();
339 * Report state DAMAGED to the mon, and then pass on to respawn(). Call
340 * this when an unrecoverable error is encountered while attempting
341 * to load an MDS rank's data structures. This is *not* for use with
342 * errors affecting normal dirfrag/inode objects -- they should be handled
343 * through cleaner scrub/repair mechanisms.
345 * Callers must already hold mds_lock.
350 * Wrapper around `damaged` for users who are not
351 * already holding mds_lock.
353 * Callers must not already hold mds_lock.
355 void damaged_unlocked();
357 utime_t
get_laggy_until() const;
359 void send_message_mds(Message
*m
, mds_rank_t mds
);
360 void forward_message_mds(Message
*req
, mds_rank_t mds
);
362 void send_message_client_counted(Message
*m
, client_t client
);
363 void send_message_client_counted(Message
*m
, Session
*session
);
364 void send_message_client_counted(Message
*m
, Connection
*connection
);
365 void send_message_client_counted(Message
*m
, const ConnectionRef
& con
) {
366 send_message_client_counted(m
, con
.get());
368 void send_message_client(Message
*m
, Session
*session
);
369 void send_message(Message
*m
, Connection
*c
);
370 void send_message(Message
*m
, const ConnectionRef
& c
) {
371 send_message(m
, c
.get());
374 void wait_for_active_peer(mds_rank_t who
, MDSInternalContextBase
*c
) {
375 waiting_for_active_peer
[who
].push_back(c
);
377 void wait_for_cluster_recovered(MDSInternalContextBase
*c
) {
378 assert(cluster_degraded
);
379 waiting_for_active_peer
[MDS_RANK_NONE
].push_back(c
);
382 void wait_for_any_client_connection(MDSInternalContextBase
*c
) {
383 waiting_for_any_client_connection
.push_back(c
);
385 void kick_waiters_for_any_client_connection(void) {
386 finish_contexts(g_ceph_context
, waiting_for_any_client_connection
);
388 void wait_for_active(MDSInternalContextBase
*c
) {
389 waiting_for_active
.push_back(c
);
391 void wait_for_replay(MDSInternalContextBase
*c
) {
392 waiting_for_replay
.push_back(c
);
394 void wait_for_reconnect(MDSInternalContextBase
*c
) {
395 waiting_for_reconnect
.push_back(c
);
397 void wait_for_resolve(MDSInternalContextBase
*c
) {
398 waiting_for_resolve
.push_back(c
);
400 void wait_for_mdsmap(epoch_t e
, MDSInternalContextBase
*c
) {
401 waiting_for_mdsmap
[e
].push_back(c
);
403 void enqueue_replay(MDSInternalContextBase
*c
) {
404 replay_queue
.push_back(c
);
407 bool queue_one_replay();
409 void set_osd_epoch_barrier(epoch_t e
);
410 epoch_t
get_osd_epoch_barrier() const {return osd_epoch_barrier
;}
411 epoch_t
get_osd_epoch() const;
413 ceph_tid_t
issue_tid() { return ++last_tid
; }
417 MDSMap
*get_mds_map() { return mdsmap
; }
419 uint64_t get_num_requests() const { return logger
->get(l_mds_request
); }
421 int get_mds_slow_req_count() const { return mds_slow_req_count
; }
423 void dump_status(Formatter
*f
) const;
425 void hit_export_target(utime_t now
, mds_rank_t rank
, double amount
=-1.0);
426 bool is_export_target(mds_rank_t rank
) {
427 const set
<mds_rank_t
>& map_targets
= mdsmap
->get_mds_info(get_nodeid()).export_targets
;
428 return map_targets
.count(rank
);
// Evict a client session, optionally waiting and/or blacklisting.
431 bool evict_client(int64_t session_id
, bool wait
, bool blacklist
,
432 std::stringstream
& ss
, Context
*on_killed
=nullptr);
435 void dump_clientreplay_status(Formatter
*f
) const;
// Command handlers -- presumably driven by the asok/command dispatch
// (see handle_asok_command on MDSRankDispatcher below); confirm in .cc.
436 void command_scrub_path(Formatter
*f
, boost::string_view path
, vector
<string
>& scrubop_vec
);
437 void command_tag_path(Formatter
*f
, boost::string_view path
,
438 boost::string_view tag
);
439 void command_flush_path(Formatter
*f
, boost::string_view path
);
440 void command_flush_journal(Formatter
*f
);
441 void command_get_subtrees(Formatter
*f
);
442 void command_export_dir(Formatter
*f
,
443 boost::string_view path
, mds_rank_t dest
);
// NOTE(review): the parameter lists of the three command_dirfrag_*
// declarations below were dropped by extraction.
444 bool command_dirfrag_split(
447 bool command_dirfrag_merge(
450 bool command_dirfrag_ls(
// Internal helpers backing the command_* entry points above.
454 int _command_export_dir(boost::string_view path
, mds_rank_t dest
);
455 int _command_flush_journal(std::stringstream
*ss
);
// NOTE(review): remainder of this declaration's parameter list was
// dropped by extraction.
456 CDir
*_command_dirfrag_get(
457 const cmdmap_t
&cmdmap
,
461 Messenger
*messenger
;
// Hooks handed in by the owning daemon (cf. the "Daemon lifetime"
// comment above); names suggest process respawn/termination -- confirm
// where they are completed.
464 Context
*respawn_hook
;
465 Context
*suicide_hook
;
467 // Friended to access retry_dispatch
468 friend class C_MDS_RetryMessage
;
467 // Friended to access retry_dispatch
468 friend class C_MDS_RetryMessage
;
470 // FIXME the state machine logic should be separable from the dispatch
471 // logic that calls it.
473 void calc_recovery_set();
474 void request_state(MDSMap::DaemonState s
);
476 bool standby_replaying
; // true if current replay pass is in standby-replay mode
479 // The MDSMap is available, configure default layouts and structures
480 MDS_BOOT_INITIAL
= 0,
481 // We are ready to open some inodes
483 // We are ready to do a replay if needed
484 MDS_BOOT_PREPARE_LOG
,
485 // Replay is complete
488 friend class C_MDS_BootStart
;
489 friend class C_MDS_InternalBootStart
;
490 void boot_create(); // i am new mds.
491 void boot_start(BootStep step
=MDS_BOOT_INITIAL
, int r
=0); // starting|replay
494 void creating_done();
495 void starting_done();
497 void standby_replay_restart();
498 void _standby_replay_restart_finish(int r
, uint64_t old_read_pos
);
499 class C_MDS_StandbyReplayRestart
;
500 class C_MDS_StandbyReplayRestartFinish
;
504 void resolve_start();
506 void reconnect_start();
507 void reconnect_done();
508 void rejoin_joint_start();
511 void recovery_done(int oldstate
);
512 void clientreplay_start();
513 void clientreplay_done();
515 void stopping_start();
516 void stopping_done();
518 void validate_sessions();
522 void handle_mds_recovery(mds_rank_t who
);
523 void handle_mds_failure(mds_rank_t who
);
526 /* Update MDSMap export_targets for this rank. Called on ::tick(). */
527 void update_targets(utime_t now
);
530 mono_time starttime
= mono_clock::zero();
533 /* This expects to be given a reference which it is responsible for.
534 * The finish function calls functions which
535 * will put the Message exactly once.*/
// NOTE(review): extraction dropped part of this class (member
// declarations, constructor body, and closing braces); the fragment
// below is incomplete.
536 class C_MDS_RetryMessage
: public MDSInternalContext
{
540 C_MDS_RetryMessage(MDSRank
*mds
, Message
*m
)
541 : MDSInternalContext(mds
)
// Re-dispatch the captured message via MDSRank::retry_dispatch().
546 void finish(int r
) override
{
547 mds
->retry_dispatch(m
);
552 * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
553 * the service/dispatcher stuff like init/shutdown that subsystems should
// NOTE(review): this class fragment is heavily truncated by extraction
// (opening brace, access specifiers and several members are missing).
556 class MDSRankDispatcher
: public MDSRank
// Admin-socket command entry point.
562 bool handle_asok_command(std::string command
, cmdmap_t
& cmdmap
,
563 Formatter
*f
, std::ostream
& ss
);
564 void handle_mds_map(MMDSMap
*m
, MDSMap
*oldmap
);
565 void handle_osd_map();
566 void update_log_config();
// NOTE(review): the enclosing declaration for the parameters below was
// dropped by extraction.
569 const cmdmap_t
&cmdmap
,
572 std::stringstream
*ds
,
573 std::stringstream
*ss
,
576 void dump_sessions(const SessionFilter
&filter
, Formatter
*f
) const;
577 void evict_clients(const SessionFilter
&filter
, MCommand
*m
);
579 // Call into me from MDS::ms_dispatch
580 bool ms_dispatch(Message
*m
);
// NOTE(review): tail of a constructor parameter list; its head was
// dropped by extraction.
585 LogChannelRef
&clog_
,
591 Context
*respawn_hook_
,
592 Context
*suicide_hook_
);
595 // This utility for MDS and MDSRank dispatchers.
// Filters messages by peer entity type: when the sender's type is not in
// `peers`, logs the message at debug level 0.  (The remainder of the
// macro body is truncated here by extraction.)
596 #define ALLOW_MESSAGES_FROM(peers) \
598 if (m->get_connection() && (m->get_connection()->get_peer_type() & (peers)) == 0) { \
599 dout(0) << __FILE__ << "." << __LINE__ << ": filtered out request, peer=" << m->get_connection()->get_peer_type() \
600 << " allowing=" << #peers << " message=" << *m << dendl; \
606 #endif // MDS_RANK_H_