1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
4 * Ceph - scalable distributed file system
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
16 * This is the top level monitor. It runs on each machine in the Monitor
17 * Cluster. The election of a leader for the paxos algorithm only happens
18 * once per machine via the elector. There is a separate paxos instance (state)
19 * kept for each of the system components: Object Store Device (OSD) Monitor,
20 * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
23 #ifndef CEPH_MONITOR_H
24 #define CEPH_MONITOR_H
31 #include "include/types.h"
32 #include "include/health.h"
33 #include "msg/Messenger.h"
35 #include "common/Timer.h"
37 #include "health_check.h"
42 #include "MonCommand.h"
45 #include "common/config_obs.h"
46 #include "common/LogClient.h"
47 #include "auth/AuthClient.h"
48 #include "auth/AuthServer.h"
49 #include "auth/cephx/CephxKeyServer.h"
50 #include "auth/AuthMethodList.h"
51 #include "auth/KeyRing.h"
52 #include "include/common_fwd.h"
53 #include "messages/MMonCommand.h"
54 #include "mon/MonitorDBStore.h"
55 #include "mgr/MgrClient.h"
57 #include "mon/MonOpRequest.h"
58 #include "common/WorkQueue.h"
// NOTE(review): this using-directive sits at namespace scope in a header
// (this file is wrapped in the CEPH_MONITOR_H include guard), so every
// translation unit that includes it also sees the TOPNSPC::common names
// unqualified. Kept as-is because code elsewhere may rely on it -- confirm
// before removing.
60 using namespace TOPNSPC::common
;
62 #define CEPH_MON_PROTOCOL 13 /* cluster internal */
66 l_cluster_first
= 555000,
68 l_cluster_num_mon_quorum
,
74 l_cluster_osd_bytes_used
,
75 l_cluster_osd_bytes_avail
,
78 l_cluster_num_pg_active_clean
,
79 l_cluster_num_pg_active
,
80 l_cluster_num_pg_peering
,
82 l_cluster_num_object_degraded
,
83 l_cluster_num_object_misplaced
,
84 l_cluster_num_object_unfound
,
104 class AdminSocketHook
;
106 #define COMPAT_SET_LOC "feature_set"
108 class Monitor
: public Dispatcher
,
111 public md_config_obs_t
{
114 const char **orig_argv
= nullptr;
119 Messenger
*messenger
;
120 ConnectionRef con_self
;
121 ceph::mutex lock
= ceph::make_mutex("Monitor::lock");
124 ThreadPool cpu_tp
; ///< threadpool for CPU intensive work
126 ceph::mutex auth_lock
= ceph::make_mutex("Monitor::auth_lock");
128 /// true if we have ever joined a quorum. if false, we are either a
129 /// new cluster, a newly joining monitor, or a just-upgraded
131 bool has_ever_joined
;
133 PerfCounters
*logger
, *cluster_logger
;
134 bool cluster_logger_registered
;
136 void register_cluster_logger();
137 void unregister_cluster_logger();
142 std::set
<entity_addrvec_t
> extra_probe_peers
;
144 LogClient log_client
;
146 LogChannelRef audit_clog
;
148 KeyServer key_server
;
150 AuthMethodList auth_cluster_required
;
151 AuthMethodList auth_service_required
;
155 std::vector
<MonCommand
> leader_mon_commands
; // quorum leader's commands
156 std::vector
<MonCommand
> local_mon_commands
; // commands i support
157 ceph::buffer::list local_mon_commands_bl
; // encoded version of above
159 std::vector
<MonCommand
> prenautilus_local_mon_commands
;
160 ceph::buffer::list prenautilus_local_mon_commands_bl
;
162 Messenger
*mgr_messenger
;
163 MgrClient mgr_client
;
164 uint64_t mgr_proxy_bytes
= 0; // in-flight proxied mgr command message bytes
165 std::string gss_ktfile_client
{};
170 // -- local storage --
172 MonitorDBStore
*store
;
173 static const std::string MONITOR_NAME
;
174 static const std::string MONITOR_STORE_PREFIX
;
176 // -- monitor state --
187 int state
= STATE_INIT
;
190 static const char *get_state_name(int s
) {
192 case STATE_PROBING
: return "probing";
193 case STATE_SYNCHRONIZING
: return "synchronizing";
194 case STATE_ELECTING
: return "electing";
195 case STATE_LEADER
: return "leader";
196 case STATE_PEON
: return "peon";
197 case STATE_SHUTDOWN
: return "shutdown";
198 default: return "???";
201 const char *get_state_name() const {
202 return get_state_name(state
);
205 bool is_init() const { return state
== STATE_INIT
; }
206 bool is_shutdown() const { return state
== STATE_SHUTDOWN
; }
207 bool is_probing() const { return state
== STATE_PROBING
; }
208 bool is_synchronizing() const { return state
== STATE_SYNCHRONIZING
; }
209 bool is_electing() const { return state
== STATE_ELECTING
; }
210 bool is_leader() const { return state
== STATE_LEADER
; }
211 bool is_peon() const { return state
== STATE_PEON
; }
213 const utime_t
&get_leader_since() const;
215 void prepare_new_fingerprint(MonitorDBStore::TransactionRef t
);
217 std::vector
<DaemonHealthMetric
> get_health_metrics();
219 int quorum_age() const {
220 auto age
= std::chrono::duration_cast
<std::chrono::seconds
>(
221 ceph::mono_clock::now() - quorum_since
);
225 bool is_mon_down() const {
226 int max
= monmap
->size();
227 int actual
= get_quorum().size();
228 auto now
= ceph::real_clock::now();
229 return actual
< max
&& now
> monmap
->created
.to_real_time();
234 std::unique_ptr
<Paxos
> paxos
;
236 friend class Elector
;
238 /// features we require of peers (based on on-disk compatset)
239 uint64_t required_features
;
241 int leader
; // current leader (to best of knowledge)
242 std::set
<int> quorum
; // current active set of monitors (if !starting)
243 ceph::mono_clock::time_point quorum_since
; // when quorum formed
244 utime_t leader_since
; // when this monitor became the leader, if it is the leader
245 utime_t exited_quorum
; // time detected as not in quorum; 0 if in
247 // map of counts of connected clients, by type and features, for
249 std::map
<int,FeatureMap
> quorum_feature_map
;
252 * Intersection of quorum members' connection feature bits.
254 uint64_t quorum_con_features
;
256 * Intersection of quorum members' mon-specific feature bits.
258 mon_feature_t quorum_mon_features
;
260 ceph_release_t quorum_min_mon_release
{ceph_release_t::unknown
};
262 std::set
<std::string
> outside_quorum
;
264 bool stretch_mode_engaged
{false};
265 bool degraded_stretch_mode
{false};
266 bool recovering_stretch_mode
{false};
267 std::string stretch_bucket_divider
;
268 std::map
<std::string
, std::set
<std::string
>> dead_mon_buckets
; // bucket->mon ranks, locations with no live mons
269 std::set
<std::string
> up_mon_buckets
; // locations with a live mon
270 void do_stretch_mode_election_work();
272 bool session_stretch_allowed(MonSession
*s
, MonOpRequestRef
& op
);
273 void disconnect_disallowed_stretch_sessions();
274 void set_elector_disallowed_leaders(bool allow_election
);
276 std::map
<std::string
,std::string
> crush_loc
;
277 bool need_set_crush_loc
{false};
279 bool is_stretch_mode() { return stretch_mode_engaged
; }
280 bool is_degraded_stretch_mode() { return degraded_stretch_mode
; }
281 bool is_recovering_stretch_mode() { return recovering_stretch_mode
; }
284 * This set of functions maintains the in-memory stretch state
285 * and sets up transitions of the map states by calling in to
286 * MonmapMonitor and OSDMonitor.
288 * The [maybe_]go_* functions are called on the leader to
289 * decide if transitions should happen; the trigger_* functions
290 * set up the map transitions; and the set_* functions actually
291 * change the memory state -- but these are only called
292 * via OSDMonitor::update_from_paxos, to guarantee consistent
293 * updates across the entire cluster.
295 void try_engage_stretch_mode();
296 void maybe_go_degraded_stretch_mode();
297 void trigger_degraded_stretch_mode(const std::set
<std::string
>& dead_mons
,
298 const std::set
<int>& dead_buckets
);
299 void set_degraded_stretch_mode();
300 void go_recovery_stretch_mode();
301 void set_recovery_stretch_mode();
302 void trigger_healthy_stretch_mode();
303 void set_healthy_stretch_mode();
304 void enable_stretch_mode();
305 void set_mon_crush_location(const std::string
& loc
);
311 * @defgroup Monitor_h_scrub
314 version_t scrub_version
; ///< paxos version we are scrubbing
315 std::map
<int,ScrubResult
> scrub_result
; ///< results so far
318 * trigger a cross-mon scrub
320 * Verify all mons are storing identical content
324 void handle_scrub(MonOpRequestRef op
);
325 bool _scrub(ScrubResult
*r
,
326 std::pair
<std::string
,std::string
> *start
,
328 void scrub_check_results();
329 void scrub_timeout();
332 void scrub_update_interval(ceph::timespan interval
);
334 Context
*scrub_event
; ///< periodic event to trigger scrub (leader)
335 Context
*scrub_timeout_event
; ///< scrub round timeout (leader)
336 void scrub_event_start();
337 void scrub_event_cancel();
338 void scrub_reset_timeout();
339 void scrub_cancel_timeout();
342 std::pair
<std::string
,std::string
> last_key
; ///< last scrubbed key
345 ScrubState() : finished(false) { }
346 virtual ~ScrubState() { }
348 std::shared_ptr
<ScrubState
> scrub_state
; ///< keeps track of current scrub
351 * @defgroup Monitor_h_sync Synchronization
355 * @} // provider state
357 struct SyncProvider
{
358 entity_addrvec_t addrs
;
359 uint64_t cookie
; ///< unique cookie for this sync attempt
360 utime_t timeout
; ///< when we give up and expire this attempt
361 version_t last_committed
; ///< last paxos version on peer
362 std::pair
<std::string
,std::string
> last_key
; ///< last key sent to (or on) peer
363 bool full
; ///< full scan?
364 MonitorDBStore::Synchronizer synchronizer
; ///< iterator
366 SyncProvider() : cookie(0), last_committed(0), full(false) {}
368 void reset_timeout(CephContext
*cct
, int grace
) {
369 timeout
= ceph_clock_now();
374 std::map
<std::uint64_t, SyncProvider
> sync_providers
; ///< cookie -> SyncProvider for those syncing from us
375 uint64_t sync_provider_count
; ///< counter for issued cookies to keep them unique
378 * @} // requester state
380 entity_addrvec_t sync_provider
; ///< who we are syncing from
381 uint64_t sync_cookie
; ///< 0 if we are starting, non-zero otherwise
382 bool sync_full
; ///< true if we are a full sync, false for recent catch-up
383 version_t sync_start_version
; ///< last_committed at sync start
384 Context
*sync_timeout_event
; ///< timeout event
387 * floor for sync source
389 * When we sync we forget about our old last_committed value which
390 * can be dangerous. For example, if we have a cluster of:
396 * If something forces us to sync (say, corruption, or manual
397 * intervention, or bug), we forget last_committed, and might abort.
398 * If mon.a happens to be down when we come back, we will see:
403 * and sync from mon.b, at which point a+b will both have lc 80 and
404 * come online with a majority holding out of date commits.
406 * Avoid this by preserving our old last_committed value prior to
407 * sync and never going backwards.
409 version_t sync_last_committed_floor
;
412 * Obtain the synchronization target prefixes in set form.
414 * We consider a target prefix all those that are relevant when
415 * synchronizing two stores. That is, all those that hold paxos service's
416 * versions, as well as paxos versions, or any control keys such as the
417 * first or last committed version.
419 * Given the current design, this function should return the names of all
420 * available paxos services, plus the paxos name.
422 * @returns a set of strings referring to the prefixes being synchronized
424 std::set
<std::string
> get_sync_targets_names();
427 * Reset the monitor's sync-related data structures for syncing *from* a peer
429 void sync_reset_requester();
432 * Reset sync state related to allowing others to sync from us
434 void sync_reset_provider();
437 * Called when a sync attempt times out (requester-side)
442 * Get the latest monmap for backup purposes during sync
444 void sync_obtain_latest_monmap(ceph::buffer::list
&bl
);
449 * Start pulling committed state from another monitor.
451 * @param entity where to pull committed state from
452 * @param full whether to do a full sync or just catch up on recent paxos
454 void sync_start(entity_addrvec_t
&addrs
, bool full
);
458 * force a sync on next mon restart
460 void sync_force(ceph::Formatter
*f
);
464 * store critical state for safekeeping during sync
466 * We store a few things on the side that we don't want to get clobbered by sync. This
467 * includes the latest monmap and a lower bound on last_committed.
469 void sync_stash_critical_state(MonitorDBStore::TransactionRef tx
);
472 * reset the sync timeout
474 * This is used on the client to restart if things aren't progressing
476 void sync_reset_timeout();
479 * trim stale sync provider state
481 * If someone is syncing from us and hasn't talked to us recently, expire their state.
483 void sync_trim_providers();
488 * Finish up a sync after we've gotten all of the chunks.
490 * @param last_committed final last_committed value from provider
492 void sync_finish(version_t last_committed
);
495 * request the next chunk from the provider
497 void sync_get_next_chunk();
500 * handle sync message
502 * @param m Sync message with operation type MMonSync::OP_START_CHUNKS
504 void handle_sync(MonOpRequestRef op
);
506 void _sync_reply_no_cookie(MonOpRequestRef op
);
508 void handle_sync_get_cookie(MonOpRequestRef op
);
509 void handle_sync_get_chunk(MonOpRequestRef op
);
510 void handle_sync_finish(MonOpRequestRef op
);
512 void handle_sync_cookie(MonOpRequestRef op
);
513 void handle_sync_forward(MonOpRequestRef op
);
514 void handle_sync_chunk(MonOpRequestRef op
);
515 void handle_sync_no_cookie(MonOpRequestRef op
);
518 * @} // Synchronization
521 std::list
<Context
*> waitfor_quorum
;
522 std::list
<Context
*> maybe_wait_for_quorum
;
525 * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
528 * We use time checks to keep track of any clock drifting going on in the
529 * cluster. This is accomplished by periodically pinging each monitor in the
530 * quorum and registering its response time on a map, assessing how much its
531 * clock has drifted. We also take this opportunity to assess the latency
534 * This mechanism works as follows:
536 * - Leader sends out a 'PING' message to each other monitor in the quorum.
537 * The message is timestamped with the leader's current time. The leader's
538 * current time is recorded in a map, associated with each peon's
540 * - The peon replies to the leader with a timestamped 'PONG' message.
541 * - The leader calculates a delta between the peon's timestamp and its
542 * current time and stashes it.
543 * - The leader also calculates the time it took to receive the 'PONG'
544 * since the 'PING' was sent, and stashes an approximate latency estimate.
545 * - Once all the quorum members have pong'ed, the leader will share the
546 * clock skew and latency maps with all the monitors in the quorum.
548 std::map
<int, utime_t
> timecheck_waiting
;
549 std::map
<int, double> timecheck_skews
;
550 std::map
<int, double> timecheck_latencies
;
551 // odd value means we are mid-round; even value means the round has
553 version_t timecheck_round
;
554 unsigned int timecheck_acks
;
555 utime_t timecheck_round_start
;
556 friend class HealthMonitor
;
557 /* When we hit a skew we will start a new round based off of
558 * 'mon_timecheck_skew_interval'. Each new round will be backed off
559 * until we hit 'mon_timecheck_interval' -- which is the typical
560 * interval when not in the presence of a skew.
562 * This variable tracks the number of rounds with skews since last clean
563 * so that we can report to the user and properly adjust the backoff.
565 uint64_t timecheck_rounds_since_clean
;
569 Context
*timecheck_event
;
571 void timecheck_start();
572 void timecheck_finish();
573 void timecheck_start_round();
574 void timecheck_finish_round(bool success
= true);
575 void timecheck_cancel_round();
576 void timecheck_cleanup();
577 void timecheck_reset_event();
578 void timecheck_check_skews();
579 void timecheck_report();
581 health_status_t
timecheck_status(std::ostringstream
&ss
,
582 const double skew_bound
,
583 const double latency
);
584 void handle_timecheck_leader(MonOpRequestRef op
);
585 void handle_timecheck_peon(MonOpRequestRef op
);
586 void handle_timecheck(MonOpRequestRef op
);
589 * Returns 'true' if this is considered to be a skew; 'false' otherwise.
591 bool timecheck_has_skew(const double skew_bound
, double *abs
) const {
592 double abs_skew
= std::fabs(skew_bound
);
595 return (abs_skew
> g_conf()->mon_clock_drift_allowed
);
602 * Handle ping messages from others.
604 void handle_ping(MonOpRequestRef op
);
606 Context
*probe_timeout_event
= nullptr; // for probing
608 void reset_probe_timeout();
609 void cancel_probe_timeout();
610 void probe_timeout(int r
);
612 void _apply_compatset_features(CompatSet
&new_features
);
616 int get_leader() const { return leader
; }
617 std::string
get_leader_name() {
618 return quorum
.empty() ? std::string() : monmap
->get_name(leader
);
620 const std::set
<int>& get_quorum() const { return quorum
; }
621 std::list
<std::string
> get_quorum_names() {
622 std::list
<std::string
> q
;
623 for (auto p
= quorum
.begin(); p
!= quorum
.end(); ++p
)
624 q
.push_back(monmap
->get_name(*p
));
627 uint64_t get_quorum_con_features() const {
628 return quorum_con_features
;
630 mon_feature_t
get_quorum_mon_features() const {
631 return quorum_mon_features
;
633 uint64_t get_required_features() const {
634 return required_features
;
636 mon_feature_t
get_required_mon_features() const {
637 return monmap
->get_required_features();
639 void apply_quorum_to_compatset_features();
640 void apply_monmap_to_compatset_features();
641 void calc_quorum_requirements();
643 void get_combined_feature_map(FeatureMap
*fm
);
646 void _reset(); ///< called from bootstrap, start_, or join_election
647 void wait_for_paxos_write();
648 void _finish_svc_election(); ///< called by {win,lose}_election
652 void join_election();
653 void start_election();
654 void win_standalone_election();
655 // end election (called by Elector)
656 void win_election(epoch_t epoch
, const std::set
<int>& q
,
658 const mon_feature_t
& mon_features
,
659 ceph_release_t min_mon_release
,
660 const std::map
<int,Metadata
>& metadata
);
661 void lose_election(epoch_t epoch
, std::set
<int>& q
, int l
,
663 const mon_feature_t
& mon_features
,
664 ceph_release_t min_mon_release
);
665 // end election (called by Elector)
666 void finish_election();
668 void update_logger();
671 * Array holding the Paxos services serviced by this Monitor.
673 std::array
<std::unique_ptr
<PaxosService
>, PAXOS_NUM
> paxos_service
;
675 class MDSMonitor
*mdsmon() {
676 return (class MDSMonitor
*)paxos_service
[PAXOS_MDSMAP
].get();
679 class MonmapMonitor
*monmon() {
680 return (class MonmapMonitor
*)paxos_service
[PAXOS_MONMAP
].get();
683 class OSDMonitor
*osdmon() {
684 return (class OSDMonitor
*)paxos_service
[PAXOS_OSDMAP
].get();
687 class AuthMonitor
*authmon() {
688 return (class AuthMonitor
*)paxos_service
[PAXOS_AUTH
].get();
691 class LogMonitor
*logmon() {
692 return (class LogMonitor
*) paxos_service
[PAXOS_LOG
].get();
695 class MgrMonitor
*mgrmon() {
696 return (class MgrMonitor
*) paxos_service
[PAXOS_MGR
].get();
699 class MgrStatMonitor
*mgrstatmon() {
700 return (class MgrStatMonitor
*) paxos_service
[PAXOS_MGRSTAT
].get();
703 class HealthMonitor
*healthmon() {
704 return (class HealthMonitor
*) paxos_service
[PAXOS_HEALTH
].get();
707 class ConfigMonitor
*configmon() {
708 return (class ConfigMonitor
*) paxos_service
[PAXOS_CONFIG
].get();
711 class KVMonitor
*kvmon() {
712 return (class KVMonitor
*) paxos_service
[PAXOS_KV
].get();
716 friend class OSDMonitor
;
717 friend class MDSMonitor
;
718 friend class MonmapMonitor
;
719 friend class LogMonitor
;
720 friend class KVMonitor
;
723 MonSessionMap session_map
;
724 ceph::mutex session_map_lock
= ceph::make_mutex("Monitor::session_map_lock");
725 AdminSocketHook
*admin_hook
;
727 template<typename Func
, typename
...Args
>
728 void with_session_map(Func
&& func
) {
729 std::lock_guard
l(session_map_lock
);
730 std::forward
<Func
>(func
)(session_map
);
732 void send_latest_monmap(Connection
*con
);
735 void handle_get_version(MonOpRequestRef op
);
736 void handle_subscribe(MonOpRequestRef op
);
737 void handle_mon_get_map(MonOpRequestRef op
);
739 static void _generate_command_map(cmdmap_t
& cmdmap
,
740 std::map
<std::string
,std::string
> ¶m_str_map
);
741 static const MonCommand
*_get_moncommand(
742 const std::string
&cmd_prefix
,
743 const std::vector
<MonCommand
>& cmds
);
744 bool _allowed_command(MonSession
*s
, const std::string
& module
,
745 const std::string
& prefix
,
746 const cmdmap_t
& cmdmap
,
747 const std::map
<std::string
,std::string
>& param_str_map
,
748 const MonCommand
*this_cmd
);
749 void get_mon_status(ceph::Formatter
*f
);
750 void _quorum_status(ceph::Formatter
*f
, std::ostream
& ss
);
751 bool _add_bootstrap_peer_hint(std::string_view cmd
, const cmdmap_t
& cmdmap
,
753 void handle_tell_command(MonOpRequestRef op
);
754 void handle_command(MonOpRequestRef op
);
755 void handle_route(MonOpRequestRef op
);
757 int get_mon_metadata(int mon
, ceph::Formatter
*f
, std::ostream
& err
);
758 int print_nodes(ceph::Formatter
*f
, std::ostream
& err
);
760 // track metadata reported by win_election()
761 std::map
<int, Metadata
> mon_metadata
;
762 std::map
<int, Metadata
> pending_metadata
;
767 struct health_cache_t
{
768 health_status_t overall
;
772 // health_status_t doesn't really have a NONE value and we're not
773 // okay with setting something else (say, HEALTH_ERR). so just
777 } health_status_cache
;
779 Context
*health_tick_event
= nullptr;
780 Context
*health_interval_event
= nullptr;
782 void health_tick_start();
783 void health_tick_stop();
784 ceph::real_clock::time_point
health_interval_calc_next_update();
785 void health_interval_start();
786 void health_interval_stop();
787 void health_events_cleanup();
789 void health_to_clog_update_conf(const std::set
<std::string
> &changed
);
791 void do_health_to_clog_interval();
792 void do_health_to_clog(bool force
= false);
795 const health_check_map_t
& updated
,
796 const health_check_map_t
& previous
,
797 MonitorDBStore::TransactionRef t
);
799 void update_pending_metadata();
803 class HealthCheckLogStatus
{
805 health_status_t severity
;
806 std::string last_message
;
807 utime_t updated_at
= 0;
808 HealthCheckLogStatus(health_status_t severity_
,
809 const std::string
&last_message_
,
811 : severity(severity_
),
812 last_message(last_message_
),
813 updated_at(updated_at_
)
816 std::map
<std::string
, HealthCheckLogStatus
> health_check_log_times
;
820 void get_cluster_status(std::stringstream
&ss
, ceph::Formatter
*f
,
821 MonSession
*session
);
823 void reply_command(MonOpRequestRef op
, int rc
, const std::string
&rs
, version_t version
);
824 void reply_command(MonOpRequestRef op
, int rc
, const std::string
&rs
, ceph::buffer::list
& rdata
, version_t version
);
826 void reply_tell_command(MonOpRequestRef op
, int rc
, const std::string
&rs
);
830 void handle_probe(MonOpRequestRef op
);
832 * Handle a Probe Operation, replying with our name, quorum and known versions.
834 * We use the MMonProbe message class for anything and everything related with
835 * Monitor probing. One of the operations relates directly with the probing
836 * itself, in which we receive a probe request and to which we reply with
837 * our name, our quorum and the known versions for each Paxos service. Thus the
838 * redundant function name. This reply will obviously be sent to the one
839 * probing/requesting these infos.
841 * @todo Add @pre and @post
843 * @param m A Probe message, with an operation of type Probe.
845 void handle_probe_probe(MonOpRequestRef op
);
846 void handle_probe_reply(MonOpRequestRef op
);
849 struct RoutedRequest
{
851 ceph::buffer::list request_bl
;
854 uint64_t con_features
;
857 RoutedRequest() : tid(0), session(NULL
), con_features(0) {}
863 uint64_t routed_request_tid
;
864 std::map
<uint64_t, RoutedRequest
*> routed_requests
;
866 void forward_request_leader(MonOpRequestRef op
);
867 void handle_forward(MonOpRequestRef op
);
868 void send_reply(MonOpRequestRef op
, Message
*reply
);
869 void no_reply(MonOpRequestRef op
);
870 void resend_routed_requests();
871 void remove_session(MonSession
*s
);
872 void remove_all_sessions();
873 void waitlist_or_zap_client(MonOpRequestRef op
);
875 void send_mon_message(Message
*m
, int rank
);
876 /** can_change_external_state if we can do things like
877 * call elections as a result of the new map.
879 void notify_new_monmap(bool can_change_external_state
=false, bool remove_rank_elector
=true);
882 struct C_Command
: public C_MonOp
{
886 ceph::buffer::list rdata
;
888 C_Command(Monitor
&_mm
, MonOpRequestRef _op
, int r
, std::string s
, version_t v
) :
889 C_MonOp(_op
), mon(_mm
), rc(r
), rs(s
), version(v
){}
890 C_Command(Monitor
&_mm
, MonOpRequestRef _op
, int r
, std::string s
, ceph::buffer::list rd
, version_t v
) :
891 C_MonOp(_op
), mon(_mm
), rc(r
), rs(s
), rdata(rd
), version(v
){}
893 void _finish(int r
) override
{
894 auto m
= op
->get_req
<MMonCommand
>();
896 std::ostringstream ss
;
897 if (!op
->get_req()->get_connection()) {
898 ss
<< "connection dropped for command ";
900 MonSession
*s
= op
->get_session();
902 // if client drops we may not have a session to draw information from.
904 ss
<< "from='" << s
->name
<< " " << s
->addrs
<< "' "
905 << "entity='" << s
->entity_name
<< "' ";
907 ss
<< "session dropped for command ";
911 std::ostringstream ds
;
913 cmdmap_from_json(m
->cmd
, &cmdmap
, ds
);
914 cmd_getval(cmdmap
, "prefix", prefix
);
915 if (prefix
!= "config set" && prefix
!= "config-key set")
916 ss
<< "cmd='" << m
->cmd
<< "': finished";
918 mon
.audit_clog
->info() << ss
.str();
919 mon
.reply_command(op
, rc
, rs
, rdata
, version
);
921 else if (r
== -ECANCELED
)
923 else if (r
== -EAGAIN
)
926 ceph_abort_msg("bad C_Command return value");
931 class C_RetryMessage
: public C_MonOp
{
934 C_RetryMessage(Monitor
*m
, MonOpRequestRef op
) :
935 C_MonOp(op
), mon(m
) { }
937 void _finish(int r
) override
{
938 if (r
== -EAGAIN
|| r
>= 0)
939 mon
->dispatch_op(op
);
940 else if (r
== -ECANCELED
)
943 ceph_abort_msg("bad C_RetryMessage return value");
947 //ms_dispatch handles a lot of logic and we want to reuse it
948 //on forwarded messages, so we create a non-locking version for this class
949 void _ms_dispatch(Message
*m
);
950 bool ms_dispatch(Message
*m
) override
{
951 std::lock_guard l
{lock
};
955 void dispatch_op(MonOpRequestRef op
);
956 //mon_caps is used for un-connected messages from monitors
958 bool get_authorizer(int dest_type
, AuthAuthorizer
**authorizer
);
959 public: // for AuthMonitor msgr1:
960 int ms_handle_authentication(Connection
*con
) override
;
962 void ms_handle_accept(Connection
*con
) override
;
963 bool ms_handle_reset(Connection
*con
) override
;
964 void ms_handle_remote_reset(Connection
*con
) override
{}
965 bool ms_handle_refused(Connection
*con
) override
;
968 int get_auth_request(
970 AuthConnectionMeta
*auth_meta
,
972 std::vector
<uint32_t> *preferred_modes
,
973 ceph::buffer::list
*out
) override
;
974 int handle_auth_reply_more(
976 AuthConnectionMeta
*auth_meta
,
977 const ceph::buffer::list
& bl
,
978 ceph::buffer::list
*reply
) override
;
979 int handle_auth_done(
981 AuthConnectionMeta
*auth_meta
,
984 const ceph::buffer::list
& bl
,
985 CryptoKey
*session_key
,
986 std::string
*connection_secret
) override
;
987 int handle_auth_bad_method(
989 AuthConnectionMeta
*auth_meta
,
990 uint32_t old_auth_method
,
992 const std::vector
<uint32_t>& allowed_methods
,
993 const std::vector
<uint32_t>& allowed_modes
) override
;
996 int handle_auth_request(
998 AuthConnectionMeta
*auth_meta
,
1000 uint32_t auth_method
,
1001 const ceph::buffer::list
& bl
,
1002 ceph::buffer::list
*reply
) override
;
1005 int write_default_keyring(ceph::buffer::list
& bl
);
1006 void extract_save_mon_key(KeyRing
& keyring
);
1008 void collect_metadata(Metadata
*m
);
1009 int load_metadata();
1010 void count_metadata(const std::string
& field
, ceph::Formatter
*f
);
1011 void count_metadata(const std::string
& field
, std::map
<std::string
,int> *out
);
1012 // get_all_versions() gathers version information from daemons for health check
1013 void get_all_versions(std::map
<std::string
, std::list
<std::string
>> &versions
);
1014 void get_versions(std::map
<std::string
, std::list
<std::string
>> &versions
);
1017 static CompatSet
get_initial_supported_features();
1018 static CompatSet
get_supported_features();
1019 static CompatSet
get_legacy_features();
1020 /// read the ondisk features into the CompatSet pointed to by read_features
1021 static void read_features_off_disk(MonitorDBStore
*store
, CompatSet
*read_features
);
1022 void read_features();
1023 void write_features(MonitorDBStore::TransactionRef t
);
1025 OpTracker op_tracker
;
1028 Monitor(CephContext
*cct_
, std::string nm
, MonitorDBStore
*s
,
1029 Messenger
*m
, Messenger
*mgr_m
, MonMap
*map
);
1030 ~Monitor() override
;
1032 static int check_features(MonitorDBStore
*store
);
1035 const char** get_tracked_conf_keys() const override
;
1036 void handle_conf_change(const ConfigProxy
& conf
,
1037 const std::set
<std::string
> &changed
) override
;
1039 void update_log_clients();
1040 int sanitize_options();
1044 void refresh_from_paxos(bool *need_bootstrap
);
1048 void handle_signal(int sig
);
1050 int mkfs(ceph::buffer::list
& osdmapbl
);
1053 * check cluster_fsid file
1055 * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
1060 * write cluster_fsid file
1062 * @return 0 on success, or negative error code
1065 int write_fsid(MonitorDBStore::TransactionRef t
);
1067 int do_admin_command(std::string_view command
, const cmdmap_t
& cmdmap
,
1073 // don't allow copying
1074 Monitor(const Monitor
& rhs
);
1075 Monitor
& operator=(const Monitor
&rhs
);
1078 static void format_command_descriptions(const std::vector
<MonCommand
> &commands
,
1081 ceph::buffer::list
*rdata
);
1083 const std::vector
<MonCommand
> &get_local_commands(mon_feature_t f
) {
1084 if (f
.contains_all(ceph::features::mon::FEATURE_NAUTILUS
)) {
1085 return local_mon_commands
;
1087 return prenautilus_local_mon_commands
;
1090 const ceph::buffer::list
& get_local_commands_bl(mon_feature_t f
) {
1091 if (f
.contains_all(ceph::features::mon::FEATURE_NAUTILUS
)) {
1092 return local_mon_commands_bl
;
1094 return prenautilus_local_mon_commands_bl
;
1097 void set_leader_commands(const std::vector
<MonCommand
>& cmds
) {
1098 leader_mon_commands
= cmds
;
1101 bool is_keyring_required();
// Monitor "incompat" feature definitions. Each macro expands to a
// CompatSet::Feature constructed from a numeric id and a human-readable
// description string. Ids are assigned consecutively, from 1 (BASE) through
// 14 (QUINCY); a new feature takes the next unused id.
1104 #define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)")
1105 #define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)")
1106 #define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)")
1107 #define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
1108 #define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
1109 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
1110 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
1111 #define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
1112 #define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS CompatSet::Feature(9, "luminous ondisk layout")
1113 #define CEPH_MON_FEATURE_INCOMPAT_MIMIC CompatSet::Feature(10, "mimic ondisk layout")
1114 #define CEPH_MON_FEATURE_INCOMPAT_NAUTILUS CompatSet::Feature(11, "nautilus ondisk layout")
1115 #define CEPH_MON_FEATURE_INCOMPAT_OCTOPUS CompatSet::Feature(12, "octopus ondisk layout")
1116 #define CEPH_MON_FEATURE_INCOMPAT_PACIFIC CompatSet::Feature(13, "pacific ondisk layout")
1117 #define CEPH_MON_FEATURE_INCOMPAT_QUINCY CompatSet::Feature(14, "quincy ondisk layout")
1118 // make sure you add your feature to Monitor::get_supported_features
1123 * new C_MonContext{...}
1127 * new C_MonContext(...)
1129 * because of gcc bug [1].
1131 * [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85883
1133 template<typename T
>
1134 class C_MonContext
: public LambdaContext
<T
> {
1136 C_MonContext(const Monitor
* m
, T
&& f
) :
1137 LambdaContext
<T
>(std::forward
<T
>(f
)),
1140 void finish(int r
) override
{
1141 if (mon
->is_shutdown())
1143 LambdaContext
<T
>::finish(r
);