// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
// vim: ts=8 sw=2 smarttab
/*
 * Ceph - scalable distributed file system
 *
 * Copyright (C) 2015 Red Hat
 *
 * This is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License version 2.1, as published by the Free Software
 * Foundation.  See file COPYING.
 *
 */
18 #include <string_view>
20 #include <boost/asio/io_context.hpp>
22 #include "common/DecayCounter.h"
23 #include "common/LogClient.h"
24 #include "common/Timer.h"
25 #include "common/fair_mutex.h"
26 #include "common/TrackedOp.h"
27 #include "common/ceph_mutex.h"
29 #include "include/common_fwd.h"
31 #include "messages/MClientRequest.h"
32 #include "messages/MCommand.h"
33 #include "messages/MMDSMap.h"
36 #include "DamageTable.h"
38 #include "SessionMap.h"
41 #include "MDSContext.h"
42 #include "PurgeQueue.h"
44 #include "MetricsHandler.h"
45 #include "osdc/Journaler.h"
// Full .h import instead of forward declaration for PerfCounter, for the
// benefit of those including this header and using MDSRank::logger
49 #include "common/perf_counters.h"
58 l_mds_dir_fetch_complete
,
66 l_mds_inodes_pin_tail
,
69 l_mds_inodes_with_caps
,
74 l_mds_traverse_forward
,
75 l_mds_traverse_discover
,
76 l_mds_traverse_dir_fetch
,
77 l_mds_traverse_remote_ino
,
80 l_mds_dispatch_queue_len
,
82 l_mds_exported_inodes
,
84 l_mds_imported_inodes
,
85 l_mds_openino_dir_fetch
,
86 l_mds_openino_backtrace_fetch
,
87 l_mds_openino_peer_discover
,
91 l_mds_scrub_backtrace_fetch
,
93 l_mds_scrub_backtrace_repaired
,
94 l_mds_scrub_inotable_repaired
,
95 l_mds_scrub_dir_inodes
,
96 l_mds_scrub_dir_base_inodes
,
97 l_mds_scrub_dirfrag_rstats
,
98 l_mds_scrub_file_inodes
,
99 l_mdss_handle_inode_file_caps
,
100 l_mdss_ceph_cap_op_revoke
,
101 l_mdss_ceph_cap_op_grant
,
102 l_mdss_ceph_cap_op_trunc
,
103 l_mdss_ceph_cap_op_flushsnap_ack
,
104 l_mdss_ceph_cap_op_flush_ack
,
105 l_mdss_handle_client_caps
,
106 l_mdss_handle_client_caps_dirty
,
107 l_mdss_handle_client_cap_release
,
108 l_mdss_process_request_cap_release
,
112 // memory utilization
133 struct heartbeat_handle_d
;
143 class MDSTableServer
;
144 class MDSTableClient
;
146 class MetricAggregator
;
152 class C_ExecAndReply
;
154 struct MDSMetaRequest
{
160 explicit MDSMetaRequest(int op
, CDentry
*dn
, ceph_tid_t tid
) :
161 _op(op
), _dentry(dn
), _tid(tid
) {
163 _dentry
->get(CDentry::PIN_PURGING
);
168 _dentry
->put(CDentry::PIN_PURGING
);
172 CDentry
*get_dentry() { return _dentry
; }
173 int get_op() { return _op
; }
174 ceph_tid_t
get_tid() { return _tid
; }
 * The public part of this class's interface is what's exposed to all
 * the various subsystems (server, mdcache, etc), such as pointers
 * to the other subsystems, and message-sending calls.
184 friend class C_Flush_Journal
;
185 friend class C_Drop_Cache
;
186 friend class C_CacheDropExecAndReply
;
187 friend class C_ScrubExecAndReply
;
188 friend class C_ScrubControlExecAndReply
;
194 ceph::fair_mutex
&mds_lock_
,
195 LogChannelRef
&clog_
,
196 CommonSafeTimer
<ceph::fair_mutex
> &timer_
,
198 std::unique_ptr
<MDSMap
> & mdsmap_
,
202 Context
*respawn_hook_
,
203 Context
*suicide_hook_
,
204 boost::asio::io_context
& ioc
);
206 mds_rank_t
get_nodeid() const { return whoami
; }
207 int64_t get_metadata_pool() const
209 return metadata_pool
;
212 mono_time
get_starttime() const {
215 std::chrono::duration
<double> get_uptime() const {
216 mono_time now
= mono_clock::now();
217 return std::chrono::duration
<double>(now
-starttime
);
220 bool is_daemon_stopping() const;
222 MDSTableClient
*get_table_client(int t
);
223 MDSTableServer
*get_table_server(int t
);
225 Session
*get_session(client_t client
) {
226 return sessionmap
.get_session(entity_name_t::CLIENT(client
.v
));
228 Session
*get_session(const cref_t
<Message
> &m
);
230 MDSMap::DaemonState
get_state() const { return state
; }
231 MDSMap::DaemonState
get_want_state() const { return beacon
.get_want_state(); }
233 bool is_creating() const { return state
== MDSMap::STATE_CREATING
; }
234 bool is_starting() const { return state
== MDSMap::STATE_STARTING
; }
235 bool is_standby() const { return state
== MDSMap::STATE_STANDBY
; }
236 bool is_replay() const { return state
== MDSMap::STATE_REPLAY
; }
237 bool is_standby_replay() const { return state
== MDSMap::STATE_STANDBY_REPLAY
; }
238 bool is_resolve() const { return state
== MDSMap::STATE_RESOLVE
; }
239 bool is_reconnect() const { return state
== MDSMap::STATE_RECONNECT
; }
240 bool is_rejoin() const { return state
== MDSMap::STATE_REJOIN
; }
241 bool is_clientreplay() const { return state
== MDSMap::STATE_CLIENTREPLAY
; }
242 bool is_active() const { return state
== MDSMap::STATE_ACTIVE
; }
243 bool is_stopping() const { return state
== MDSMap::STATE_STOPPING
; }
244 bool is_any_replay() const { return (is_replay() || is_standby_replay()); }
245 bool is_stopped() const { return mdsmap
->is_stopped(whoami
); }
246 bool is_cluster_degraded() const { return cluster_degraded
; }
247 bool allows_multimds_snaps() const { return mdsmap
->allows_multimds_snaps(); }
249 bool is_cache_trimmable() const {
250 return is_standby_replay() || is_clientreplay() || is_active() || is_stopping();
253 void handle_write_error(int err
);
254 void handle_write_error_with_lock(int err
);
256 void update_mlogger();
258 void queue_waiter(MDSContext
*c
) {
259 finished_queue
.push_back(c
);
260 progress_thread
.signal();
262 void queue_waiter_front(MDSContext
*c
) {
263 finished_queue
.push_front(c
);
264 progress_thread
.signal();
266 void queue_waiters(MDSContext::vec
& ls
) {
269 std::copy(v
.begin(), v
.end(), std::back_inserter(finished_queue
));
270 progress_thread
.signal();
272 void queue_waiters_front(MDSContext::vec
& ls
) {
275 std::copy(v
.rbegin(), v
.rend(), std::front_inserter(finished_queue
));
276 progress_thread
.signal();
279 // Daemon lifetime functions: these guys break the abstraction
280 // and call up into the parent MDSDaemon instance. It's kind
281 // of unavoidable: if we want any depth into our calls
282 // to be able to e.g. tear down the whole process, we have to
283 // have a reference going all the way down.
290 * Call this periodically if inside a potentially long running piece
291 * of code while holding the mds_lock
293 void heartbeat_reset();
294 int heartbeat_reset_grace(int count
=1) {
295 return count
* _heartbeat_reset_grace
;
299 * Report state DAMAGED to the mon, and then pass on to respawn(). Call
300 * this when an unrecoverable error is encountered while attempting
301 * to load an MDS rank's data structures. This is *not* for use with
302 * errors affecting normal dirfrag/inode objects -- they should be handled
303 * through cleaner scrub/repair mechanisms.
305 * Callers must already hold mds_lock.
310 * Wrapper around `damaged` for users who are not
311 * already holding mds_lock.
313 * Callers must not already hold mds_lock.
315 void damaged_unlocked();
317 double last_cleared_laggy() const {
318 return beacon
.last_cleared_laggy();
321 double get_dispatch_queue_max_age(utime_t now
) const;
323 void send_message_mds(const ref_t
<Message
>& m
, mds_rank_t mds
);
324 void send_message_mds(const ref_t
<Message
>& m
, const entity_addrvec_t
&addr
);
325 void forward_message_mds(const cref_t
<MClientRequest
>& req
, mds_rank_t mds
);
326 void send_message_client_counted(const ref_t
<Message
>& m
, client_t client
);
327 void send_message_client_counted(const ref_t
<Message
>& m
, Session
* session
);
328 void send_message_client_counted(const ref_t
<Message
>& m
, const ConnectionRef
& connection
);
329 void send_message_client(const ref_t
<Message
>& m
, Session
* session
);
330 void send_message(const ref_t
<Message
>& m
, const ConnectionRef
& c
);
332 void wait_for_bootstrapped_peer(mds_rank_t who
, MDSContext
*c
) {
333 waiting_for_bootstrapping_peer
[who
].push_back(c
);
335 void wait_for_active_peer(mds_rank_t who
, MDSContext
*c
) {
336 waiting_for_active_peer
[who
].push_back(c
);
338 void wait_for_cluster_recovered(MDSContext
*c
) {
339 ceph_assert(cluster_degraded
);
340 waiting_for_active_peer
[MDS_RANK_NONE
].push_back(c
);
343 void wait_for_any_client_connection(MDSContext
*c
) {
344 waiting_for_any_client_connection
.push_back(c
);
346 void kick_waiters_for_any_client_connection(void) {
347 finish_contexts(g_ceph_context
, waiting_for_any_client_connection
);
349 void wait_for_active(MDSContext
*c
) {
350 waiting_for_active
.push_back(c
);
352 void wait_for_replay(MDSContext
*c
) {
353 waiting_for_replay
.push_back(c
);
355 void wait_for_rejoin(MDSContext
*c
) {
356 waiting_for_rejoin
.push_back(c
);
358 void wait_for_reconnect(MDSContext
*c
) {
359 waiting_for_reconnect
.push_back(c
);
361 void wait_for_resolve(MDSContext
*c
) {
362 waiting_for_resolve
.push_back(c
);
364 void wait_for_mdsmap(epoch_t e
, MDSContext
*c
) {
365 waiting_for_mdsmap
[e
].push_back(c
);
367 void enqueue_replay(MDSContext
*c
) {
368 replay_queue
.push_back(c
);
371 bool queue_one_replay();
372 void maybe_clientreplay_done();
374 void set_osd_epoch_barrier(epoch_t e
);
375 epoch_t
get_osd_epoch_barrier() const {return osd_epoch_barrier
;}
376 epoch_t
get_osd_epoch() const;
378 ceph_tid_t
issue_tid() { return ++last_tid
; }
380 MDSMap
*get_mds_map() { return mdsmap
.get(); }
382 uint64_t get_num_requests() const { return logger
->get(l_mds_request
); }
384 int get_mds_slow_req_count() const { return mds_slow_req_count
; }
386 void dump_status(Formatter
*f
) const;
388 void hit_export_target(mds_rank_t rank
, double amount
=-1.0);
389 bool is_export_target(mds_rank_t rank
) {
390 const std::set
<mds_rank_t
>& map_targets
= mdsmap
->get_mds_info(get_nodeid()).export_targets
;
391 return map_targets
.count(rank
);
394 bool evict_client(int64_t session_id
, bool wait
, bool blocklist
,
395 std::ostream
& ss
, Context
*on_killed
=nullptr);
396 int config_client(int64_t session_id
, bool remove
,
397 const std::string
& option
, const std::string
& value
,
399 void schedule_inmemory_logger();
401 double get_inject_journal_corrupt_dentry_first() const {
402 return inject_journal_corrupt_dentry_first
;
405 // Reference to global MDS::mds_lock, so that users of MDSRank don't
406 // carry around references to the outer MDS, and we can substitute
407 // a separate lock here in future potentially.
408 ceph::fair_mutex
&mds_lock
;
410 // Reference to global cluster log client, just to avoid initialising
411 // a separate one here.
414 // Reference to global timer utility, because MDSRank and MDSDaemon
415 // currently both use the same mds_lock, so it makes sense for them
417 CommonSafeTimer
<ceph::fair_mutex
> &timer
;
419 std::unique_ptr
<MDSMap
> &mdsmap
; /* MDSDaemon::mdsmap */
424 Server
*server
= nullptr;
425 MDCache
*mdcache
= nullptr;
426 Locker
*locker
= nullptr;
427 MDLog
*mdlog
= nullptr;
428 MDBalancer
*balancer
= nullptr;
429 ScrubStack
*scrubstack
= nullptr;
430 DamageTable damage_table
;
432 InoTable
*inotable
= nullptr;
434 SnapServer
*snapserver
= nullptr;
435 SnapClient
*snapclient
= nullptr;
437 SessionMap sessionmap
;
439 PerfCounters
*logger
= nullptr, *mlogger
= nullptr;
440 OpTracker op_tracker
;
442 std::map
<ceph_tid_t
, MDSMetaRequest
> internal_client_requests
;
444 // The last different state I held before current
445 MDSMap::DaemonState last_state
= MDSMap::STATE_BOOT
;
446 // The state assigned to me by the MDSMap
447 MDSMap::DaemonState state
= MDSMap::STATE_STANDBY
;
449 bool cluster_degraded
= false;
454 // The MDSMap is available, configure default layouts and structures
455 MDS_BOOT_INITIAL
= 0,
456 // We are ready to open some inodes
458 // We are ready to do a replay if needed
459 MDS_BOOT_PREPARE_LOG
,
460 // Replay is complete
464 class ProgressThread
: public Thread
{
466 explicit ProgressThread(MDSRank
*mds_
) : mds(mds_
) {}
467 void * entry() override
;
469 void signal() {cond
.notify_all();}
472 std::condition_variable_any cond
;
475 class C_MDS_StandbyReplayRestart
;
476 class C_MDS_StandbyReplayRestartFinish
;
477 // Friended to access retry_dispatch
478 friend class C_MDS_RetryMessage
;
479 friend class C_MDS_BootStart
;
480 friend class C_MDS_InternalBootStart
;
481 friend class C_MDS_MonCommand
;
483 const mds_rank_t whoami
;
487 void inc_dispatch_depth() { ++dispatch_depth
; }
488 void dec_dispatch_depth() { --dispatch_depth
; }
489 void retry_dispatch(const cref_t
<Message
> &m
);
490 bool is_valid_message(const cref_t
<Message
> &m
);
491 void handle_message(const cref_t
<Message
> &m
);
492 void _advance_queues();
493 bool _dispatch(const cref_t
<Message
> &m
, bool new_msg
);
494 bool is_stale_message(const cref_t
<Message
> &m
) const;
497 * Emit clog warnings for any ops reported as warnings by optracker
499 void check_ops_in_flight();
502 * Share MDSMap with clients
504 void create_logger();
506 void dump_clientreplay_status(Formatter
*f
) const;
507 void command_scrub_start(Formatter
*f
,
508 std::string_view path
, std::string_view tag
,
509 const std::vector
<std::string
>& scrubop_vec
, Context
*on_finish
);
510 void command_tag_path(Formatter
*f
, std::string_view path
,
511 std::string_view tag
);
512 // scrub control commands
513 void command_scrub_abort(Formatter
*f
, Context
*on_finish
);
514 void command_scrub_pause(Formatter
*f
, Context
*on_finish
);
515 void command_scrub_resume(Formatter
*f
);
516 void command_scrub_status(Formatter
*f
);
518 void command_flush_path(Formatter
*f
, std::string_view path
);
519 void command_flush_journal(Formatter
*f
);
520 void command_get_subtrees(Formatter
*f
);
521 void command_export_dir(Formatter
*f
,
522 std::string_view path
, mds_rank_t dest
);
523 bool command_dirfrag_split(
526 bool command_dirfrag_merge(
529 bool command_dirfrag_ls(
533 int _command_export_dir(std::string_view path
, mds_rank_t dest
);
534 CDir
*_command_dirfrag_get(
535 const cmdmap_t
&cmdmap
,
537 void command_openfiles_ls(Formatter
*f
);
538 void command_dump_tree(const cmdmap_t
&cmdmap
, std::ostream
&ss
, Formatter
*f
);
539 void command_dump_inode(Formatter
*f
, const cmdmap_t
&cmdmap
, std::ostream
&ss
);
540 void command_cache_drop(uint64_t timeout
, Formatter
*f
, Context
*on_finish
);
542 // FIXME the state machine logic should be separable from the dispatch
543 // logic that calls it.
545 void calc_recovery_set();
546 void request_state(MDSMap::DaemonState s
);
548 void boot_create(); // i am new mds.
549 void boot_start(BootStep step
=MDS_BOOT_INITIAL
, int r
=0); // starting|replay
552 void creating_done();
553 void starting_done();
555 void standby_replay_restart();
556 void _standby_replay_restart_finish(int r
, uint64_t old_read_pos
);
560 void resolve_start();
562 void reconnect_start();
563 void reconnect_done();
564 void rejoin_joint_start();
567 void recovery_done(int oldstate
);
568 void clientreplay_start();
569 void clientreplay_done();
571 void stopping_start();
572 void stopping_done();
574 void validate_sessions();
576 void handle_mds_recovery(mds_rank_t who
);
577 void handle_mds_failure(mds_rank_t who
);
579 /* Update MDSMap export_targets for this rank. Called on ::tick(). */
580 void update_targets();
582 void _mon_command_finish(int r
, std::string_view cmd
, std::string_view outs
);
583 void set_mdsmap_multimds_snaps_allowed();
585 Context
*create_async_exec_context(C_ExecAndReply
*ctx
);
587 // blocklist the provided addrs and set OSD epoch barrier
588 // with the provided epoch.
589 void apply_blocklist(const std::set
<entity_addr_t
> &addrs
, epoch_t epoch
);
591 void reset_event_flags();
593 // Incarnation as seen in MDSMap at the point where a rank is
597 // Flag to indicate we entered shutdown: anyone seeing this to be true
598 // after taking mds_lock must drop out.
599 bool stopping
= false;
601 // PurgeQueue is only used by StrayManager, but it is owned by MDSRank
602 // because its init/shutdown happens at the top level.
603 PurgeQueue purge_queue
;
605 MetricsHandler metrics_handler
;
606 std::unique_ptr
<MetricAggregator
> metric_aggregator
;
608 std::list
<cref_t
<Message
>> waiting_for_nolaggy
;
609 MDSContext::que finished_queue
;
610 // Dispatch, retry, queues
611 int dispatch_depth
= 0;
613 ceph::heartbeat_handle_d
*hb
= nullptr; // Heartbeat for threads using mds_lock
614 double heartbeat_grace
;
615 int _heartbeat_reset_grace
;
617 std::map
<mds_rank_t
, version_t
> peer_mdsmap_epoch
;
619 ceph_tid_t last_tid
= 0; // for mds-initiated requests (e.g. stray rename)
621 MDSContext::vec waiting_for_active
, waiting_for_replay
, waiting_for_rejoin
,
622 waiting_for_reconnect
, waiting_for_resolve
;
623 MDSContext::vec waiting_for_any_client_connection
;
624 MDSContext::que replay_queue
;
625 bool replaying_requests_done
= false;
627 std::map
<mds_rank_t
, MDSContext::vec
> waiting_for_active_peer
;
628 std::map
<mds_rank_t
, MDSContext::vec
> waiting_for_bootstrapping_peer
;
629 std::map
<epoch_t
, MDSContext::vec
> waiting_for_mdsmap
;
631 epoch_t osd_epoch_barrier
= 0;
633 // Const reference to the beacon so that we can behave differently
637 int mds_slow_req_count
= 0;
639 std::map
<mds_rank_t
,DecayCounter
> export_targets
; /* targets this MDS is exporting to or wants/tries to */
641 Messenger
*messenger
;
645 Context
*respawn_hook
;
646 Context
*suicide_hook
;
648 bool standby_replaying
= false; // true if current replay pass is in standby-replay mode
649 uint64_t extraordinary_events_dump_interval
= 0;
650 double inject_journal_corrupt_dentry_first
= 0.0;
652 bool send_status
= true;
654 // The metadata pool won't change in the whole life time of the fs,
655 // with this we can get rid of the mds_lock in many places too.
656 int64_t metadata_pool
= -1;
658 // "task" string that gets displayed in ceph status
659 inline static const std::string SCRUB_STATUS_KEY
= "scrub status";
661 bool client_eviction_dump
= false;
663 void get_task_status(std::map
<std::string
, std::string
> *status
);
664 void schedule_update_timer_task();
665 void send_task_status();
667 void inmemory_logger();
668 bool is_rank0() const {
669 return whoami
== (mds_rank_t
)0;
672 mono_time starttime
= mono_clock::zero();
673 boost::asio::io_context
& ioc
;
676 class C_MDS_RetryMessage
: public MDSInternalContext
{
678 C_MDS_RetryMessage(MDSRank
*mds
, const cref_t
<Message
> &m
)
679 : MDSInternalContext(mds
), m(m
) {}
680 void finish(int r
) override
{
681 get_mds()->retry_dispatch(m
);
687 class CF_MDS_RetryMessageFactory
: public MDSContextFactory
{
689 CF_MDS_RetryMessageFactory(MDSRank
*mds
, const cref_t
<Message
> &m
)
692 MDSContext
*build() {
693 return new C_MDS_RetryMessage(mds
, m
);
701 * The aspect of MDSRank exposed to MDSDaemon but not subsystems: i.e.
702 * the service/dispatcher stuff like init/shutdown that subsystems should
705 class MDSRankDispatcher
: public MDSRank
, public md_config_obs_t
710 ceph::fair_mutex
&mds_lock_
,
711 LogChannelRef
&clog_
,
712 CommonSafeTimer
<ceph::fair_mutex
> &timer_
,
714 std::unique_ptr
<MDSMap
> &mdsmap_
,
718 Context
*respawn_hook_
,
719 Context
*suicide_hook_
,
720 boost::asio::io_context
& ioc
);
725 void handle_asok_command(
726 std::string_view command
,
727 const cmdmap_t
& cmdmap
,
729 const bufferlist
&inbl
,
730 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
);
731 void handle_mds_map(const cref_t
<MMDSMap
> &m
, const MDSMap
&oldmap
);
732 void handle_osd_map();
733 void update_log_config();
735 const char** get_tracked_conf_keys() const override final
;
736 void handle_conf_change(const ConfigProxy
& conf
, const std::set
<std::string
>& changed
) override
;
738 void dump_sessions(const SessionFilter
&filter
, Formatter
*f
, bool cap_dump
=false) const;
739 void evict_clients(const SessionFilter
&filter
,
740 std::function
<void(int,const std::string
&,bufferlist
&)> on_finish
);
742 // Call into me from MDS::ms_dispatch
743 bool ms_dispatch(const cref_t
<Message
> &m
);
746 #endif // MDS_RANK_H_