]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/Monitor.h
import ceph quincy 17.2.6
[ceph.git] / ceph / src / mon / Monitor.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 /*
16 * This is the top level monitor. It runs on each machine in the Monitor
17 * Cluster. The election of a leader for the paxos algorithm only happens
18 * once per machine via the elector. There is a separate paxos instance (state)
19 * kept for each of the system components: Object Store Device (OSD) Monitor,
20 * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
21 */
22
23 #ifndef CEPH_MONITOR_H
24 #define CEPH_MONITOR_H
25
26 #include <errno.h>
27 #include <cmath>
28 #include <string>
29 #include <array>
30
31 #include "include/types.h"
32 #include "include/health.h"
33 #include "msg/Messenger.h"
34
35 #include "common/Timer.h"
36
37 #include "health_check.h"
38 #include "MonMap.h"
39 #include "Elector.h"
40 #include "Paxos.h"
41 #include "Session.h"
42 #include "MonCommand.h"
43
44
45 #include "common/config_obs.h"
46 #include "common/LogClient.h"
47 #include "auth/AuthClient.h"
48 #include "auth/AuthServer.h"
49 #include "auth/cephx/CephxKeyServer.h"
50 #include "auth/AuthMethodList.h"
51 #include "auth/KeyRing.h"
52 #include "include/common_fwd.h"
53 #include "messages/MMonCommand.h"
54 #include "mon/MonitorDBStore.h"
55 #include "mgr/MgrClient.h"
56
57 #include "mon/MonOpRequest.h"
58 #include "common/WorkQueue.h"
59
60 using namespace TOPNSPC::common;
61
62 #define CEPH_MON_PROTOCOL 13 /* cluster internal */
63
64
// Identifiers for the cluster-wide statistics group.  Only the first entry has
// an explicit value; the rest follow consecutively, so new entries belong
// between l_cluster_first and l_cluster_last.
// NOTE(review): presumably these are the PerfCounters indices behind
// Monitor::cluster_logger -- confirm in Monitor.cc.
enum {
  l_cluster_first = 555000,          // start of range

  // monitors
  l_cluster_num_mon,
  l_cluster_num_mon_quorum,

  // OSDs
  l_cluster_num_osd,
  l_cluster_num_osd_up,
  l_cluster_num_osd_in,
  l_cluster_osd_epoch,
  l_cluster_osd_bytes,
  l_cluster_osd_bytes_used,
  l_cluster_osd_bytes_avail,

  // pools and placement groups
  l_cluster_num_pool,
  l_cluster_num_pg,
  l_cluster_num_pg_active_clean,
  l_cluster_num_pg_active,
  l_cluster_num_pg_peering,

  // objects
  l_cluster_num_object,
  l_cluster_num_object_degraded,
  l_cluster_num_object_misplaced,
  l_cluster_num_object_unfound,
  l_cluster_num_bytes,

  l_cluster_last,                    // end of range
};
88
// Identifiers for the per-monitor statistics group.  Values run consecutively
// from l_mon_first, so new entries belong before l_mon_last.
// NOTE(review): presumably these are the PerfCounters indices behind
// Monitor::logger -- confirm in Monitor.cc.
enum {
  l_mon_first = 456000,     // start of range

  // sessions
  l_mon_num_sessions,
  l_mon_session_add,
  l_mon_session_rm,
  l_mon_session_trim,

  // elections
  l_mon_num_elections,
  l_mon_election_call,
  l_mon_election_win,
  l_mon_election_lose,

  l_mon_last,               // end of range
};
101
102 class PaxosService;
103
104 class AdminSocketHook;
105
106 #define COMPAT_SET_LOC "feature_set"
107
108 class Monitor : public Dispatcher,
109 public AuthClient,
110 public AuthServer,
111 public md_config_obs_t {
112 public:
113 int orig_argc = 0;
114 const char **orig_argv = nullptr;
115
116 // me
117 std::string name;
118 int rank;
119 Messenger *messenger;
120 ConnectionRef con_self;
121 ceph::mutex lock = ceph::make_mutex("Monitor::lock");
122 SafeTimer timer;
123 Finisher finisher;
124 ThreadPool cpu_tp; ///< threadpool for CPU intensive work
125
126 ceph::mutex auth_lock = ceph::make_mutex("Monitor::auth_lock");
127
128 /// true if we have ever joined a quorum. if false, we are either a
129 /// new cluster, a newly joining monitor, or a just-upgraded
130 /// monitor.
131 bool has_ever_joined;
132
133 PerfCounters *logger, *cluster_logger;
134 bool cluster_logger_registered;
135
136 void register_cluster_logger();
137 void unregister_cluster_logger();
138
139 MonMap *monmap;
140 uuid_d fingerprint;
141
142 std::set<entity_addrvec_t> extra_probe_peers;
143
144 LogClient log_client;
145 LogChannelRef clog;
146 LogChannelRef audit_clog;
147 KeyRing keyring;
148 KeyServer key_server;
149
150 AuthMethodList auth_cluster_required;
151 AuthMethodList auth_service_required;
152
153 CompatSet features;
154
155 std::vector<MonCommand> leader_mon_commands; // quorum leader's commands
156 std::vector<MonCommand> local_mon_commands; // commands i support
157 ceph::buffer::list local_mon_commands_bl; // encoded version of above
158
159 std::vector<MonCommand> prenautilus_local_mon_commands;
160 ceph::buffer::list prenautilus_local_mon_commands_bl;
161
162 Messenger *mgr_messenger;
163 MgrClient mgr_client;
164 uint64_t mgr_proxy_bytes = 0; // in-flight proxied mgr command message bytes
165 std::string gss_ktfile_client{};
166
167 private:
168 void new_tick();
169
170 // -- local storage --
171 public:
172 MonitorDBStore *store;
173 static const std::string MONITOR_NAME;
174 static const std::string MONITOR_STORE_PREFIX;
175
176 // -- monitor state --
177 private:
178 enum {
179 STATE_INIT = 1,
180 STATE_PROBING,
181 STATE_SYNCHRONIZING,
182 STATE_ELECTING,
183 STATE_LEADER,
184 STATE_PEON,
185 STATE_SHUTDOWN
186 };
187 int state = STATE_INIT;
188
189 public:
190 static const char *get_state_name(int s) {
191 switch (s) {
192 case STATE_PROBING: return "probing";
193 case STATE_SYNCHRONIZING: return "synchronizing";
194 case STATE_ELECTING: return "electing";
195 case STATE_LEADER: return "leader";
196 case STATE_PEON: return "peon";
197 case STATE_SHUTDOWN: return "shutdown";
198 default: return "???";
199 }
200 }
201 const char *get_state_name() const {
202 return get_state_name(state);
203 }
204
205 bool is_init() const { return state == STATE_INIT; }
206 bool is_shutdown() const { return state == STATE_SHUTDOWN; }
207 bool is_probing() const { return state == STATE_PROBING; }
208 bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
209 bool is_electing() const { return state == STATE_ELECTING; }
210 bool is_leader() const { return state == STATE_LEADER; }
211 bool is_peon() const { return state == STATE_PEON; }
212
213 const utime_t &get_leader_since() const;
214
215 void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
216
217 std::vector<DaemonHealthMetric> get_health_metrics();
218
219 int quorum_age() const {
220 auto age = std::chrono::duration_cast<std::chrono::seconds>(
221 ceph::mono_clock::now() - quorum_since);
222 return age.count();
223 }
224
225 bool is_mon_down() const {
226 int max = monmap->size();
227 int actual = get_quorum().size();
228 auto now = ceph::real_clock::now();
229 return actual < max && now > monmap->created.to_real_time();
230 }
231
232 // -- elector --
233 private:
234 std::unique_ptr<Paxos> paxos;
235 Elector elector;
236 friend class Elector;
237
238 /// features we require of peers (based on on-disk compatset)
239 uint64_t required_features;
240
241 int leader; // current leader (to best of knowledge)
242 std::set<int> quorum; // current active set of monitors (if !starting)
243 ceph::mono_clock::time_point quorum_since; // when quorum formed
244 utime_t leader_since; // when this monitor became the leader, if it is the leader
245 utime_t exited_quorum; // time detected as not in quorum; 0 if in
246
247 // map of counts of connected clients, by type and features, for
248 // each quorum mon
249 std::map<int,FeatureMap> quorum_feature_map;
250
251 /**
252 * Intersection of quorum members' connection feature bits.
253 */
254 uint64_t quorum_con_features;
255 /**
256 * Intersection of quorum members mon-specific feature bits
257 */
258 mon_feature_t quorum_mon_features;
259
260 ceph_release_t quorum_min_mon_release{ceph_release_t::unknown};
261
262 std::set<std::string> outside_quorum;
263
264 bool stretch_mode_engaged{false};
265 bool degraded_stretch_mode{false};
266 bool recovering_stretch_mode{false};
267 std::string stretch_bucket_divider;
268 std::map<std::string, std::set<std::string>> dead_mon_buckets; // bucket->mon ranks, locations with no live mons
269 std::set<std::string> up_mon_buckets; // locations with a live mon
270 void do_stretch_mode_election_work();
271
272 bool session_stretch_allowed(MonSession *s, MonOpRequestRef& op);
273 void disconnect_disallowed_stretch_sessions();
274 void set_elector_disallowed_leaders(bool allow_election);
275
276 std::map<std::string,std::string> crush_loc;
277 bool need_set_crush_loc{false};
278 public:
279 bool is_stretch_mode() { return stretch_mode_engaged; }
280 bool is_degraded_stretch_mode() { return degraded_stretch_mode; }
281 bool is_recovering_stretch_mode() { return recovering_stretch_mode; }
282
283 /**
284 * This set of functions maintains the in-memory stretch state
285 * and sets up transitions of the map states by calling in to
286 * MonmapMonitor and OSDMonitor.
287 *
288 * The [maybe_]go_* functions are called on the leader to
289 * decide if transitions should happen; the trigger_* functions
290 * set up the map transitions; and the set_* functions actually
291 * change the memory state -- but these are only called
292 * via OSDMonitor::update_from_paxos, to guarantee consistent
293 * updates across the entire cluster.
294 */
295 void try_engage_stretch_mode();
296 void maybe_go_degraded_stretch_mode();
297 void trigger_degraded_stretch_mode(const std::set<std::string>& dead_mons,
298 const std::set<int>& dead_buckets);
299 void set_degraded_stretch_mode();
300 void go_recovery_stretch_mode();
301 void set_recovery_stretch_mode();
302 void trigger_healthy_stretch_mode();
303 void set_healthy_stretch_mode();
304 void enable_stretch_mode();
305 void set_mon_crush_location(const std::string& loc);
306
307
308 private:
309
310 /**
311 * @defgroup Monitor_h_scrub
312 * @{
313 */
314 version_t scrub_version; ///< paxos version we are scrubbing
315 std::map<int,ScrubResult> scrub_result; ///< results so far
316
317 /**
318 * trigger a cross-mon scrub
319 *
320 * Verify all mons are storing identical content
321 */
322 int scrub_start();
323 int scrub();
324 void handle_scrub(MonOpRequestRef op);
325 bool _scrub(ScrubResult *r,
326 std::pair<std::string,std::string> *start,
327 int *num_keys);
328 void scrub_check_results();
329 void scrub_timeout();
330 void scrub_finish();
331 void scrub_reset();
332 void scrub_update_interval(ceph::timespan interval);
333
334 Context *scrub_event; ///< periodic event to trigger scrub (leader)
335 Context *scrub_timeout_event; ///< scrub round timeout (leader)
336 void scrub_event_start();
337 void scrub_event_cancel();
338 void scrub_reset_timeout();
339 void scrub_cancel_timeout();
340
/// Position and completion flag of the scrub currently being tracked.
struct ScrubState {
  std::pair<std::string,std::string> last_key; ///< last scrubbed key
  bool finished = false;                       ///< false until set by the scrub code

  ScrubState() = default;
  virtual ~ScrubState() = default;
};
348 std::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
349
350 /**
351 * @defgroup Monitor_h_sync Synchronization
352 * @{
353 */
354 /**
355 * @} // provider state
356 */
357 struct SyncProvider {
358 entity_addrvec_t addrs;
359 uint64_t cookie; ///< unique cookie for this sync attempt
360 utime_t timeout; ///< when we give up and expire this attempt
361 version_t last_committed; ///< last paxos version on peer
362 std::pair<std::string,std::string> last_key; ///< last key sent to (or on) peer
363 bool full; ///< full scan?
364 MonitorDBStore::Synchronizer synchronizer; ///< iterator
365
366 SyncProvider() : cookie(0), last_committed(0), full(false) {}
367
368 void reset_timeout(CephContext *cct, int grace) {
369 timeout = ceph_clock_now();
370 timeout += grace;
371 }
372 };
373
374 std::map<std::uint64_t, SyncProvider> sync_providers; ///< cookie -> SyncProvider for those syncing from us
375 uint64_t sync_provider_count; ///< counter for issued cookies to keep them unique
376
377 /**
378 * @} // requester state
379 */
380 entity_addrvec_t sync_provider; ///< who we are syncing from
381 uint64_t sync_cookie; ///< 0 if we are starting, non-zero otherwise
382 bool sync_full; ///< true if we are a full sync, false for recent catch-up
383 version_t sync_start_version; ///< last_committed at sync start
384 Context *sync_timeout_event; ///< timeout event
385
386 /**
387 * floor for sync source
388 *
389 * When we sync we forget about our old last_committed value which
390 * can be dangerous. For example, if we have a cluster of:
391 *
392 * mon.a: lc 100
393 * mon.b: lc 80
394 * mon.c: lc 100 (us)
395 *
396 * If something forces us to sync (say, corruption, or manual
397 * intervention, or bug), we forget last_committed, and might abort.
398 * If mon.a happens to be down when we come back, we will see:
399 *
400 * mon.b: lc 80
401 * mon.c: lc 0 (us)
402 *
403 * and sync from mon.b, at which point a+b will both have lc 80 and
404 * come online with a majority holding out of date commits.
405 *
406 * Avoid this by preserving our old last_committed value prior to
407 * sync and never going backwards.
408 */
409 version_t sync_last_committed_floor;
410
411 /**
412 * Obtain the synchronization target prefixes in set form.
413 *
414 * We consider a target prefix all those that are relevant when
415 * synchronizing two stores. That is, all those that hold paxos service's
416 * versions, as well as paxos versions, or any control keys such as the
417 * first or last committed version.
418 *
419 * Given the current design, this function should return the name of all and
420 * any available paxos service, plus the paxos name.
421 *
422 * @returns a set of strings referring to the prefixes being synchronized
423 */
424 std::set<std::string> get_sync_targets_names();
425
426 /**
427 * Reset the monitor's sync-related data structures for syncing *from* a peer
428 */
429 void sync_reset_requester();
430
431 /**
432 * Reset sync state related to allowing others to sync from us
433 */
434 void sync_reset_provider();
435
436 /**
437 * Called when a sync attempt times out (requester-side)
438 */
439 void sync_timeout();
440
441 /**
442 * Get the latest monmap for backup purposes during sync
443 */
444 void sync_obtain_latest_monmap(ceph::buffer::list &bl);
445
446 /**
447 * Start sync process
448 *
449 * Start pulling committed state from another monitor.
450 *
451 * @param addrs where to pull committed state from
452 * @param full whether to do a full sync or just catch up on recent paxos
453 */
454 void sync_start(entity_addrvec_t &addrs, bool full);
455
456 public:
457 /**
458 * force a sync on next mon restart
459 */
460 void sync_force(ceph::Formatter *f);
461
462 private:
463 /**
464 * store critical state for safekeeping during sync
465 *
466 * We store a few things on the side that we don't want to get clobbered by sync. This
467 * includes the latest monmap and a lower bound on last_committed.
468 */
469 void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
470
471 /**
472 * reset the sync timeout
473 *
474 * This is used on the client to restart if things aren't progressing
475 */
476 void sync_reset_timeout();
477
478 /**
479 * trim stale sync provider state
480 *
481 * If someone is syncing from us and hasn't talked to us recently, expire their state.
482 */
483 void sync_trim_providers();
484
485 /**
486 * Complete a sync
487 *
488 * Finish up a sync after we've gotten all of the chunks.
489 *
490 * @param last_committed final last_committed value from provider
491 */
492 void sync_finish(version_t last_committed);
493
494 /**
495 * request the next chunk from the provider
496 */
497 void sync_get_next_chunk();
498
499 /**
500 * handle sync message
501 *
502 * @param op Sync message with operation type MMonSync::OP_START_CHUNKS
503 */
504 void handle_sync(MonOpRequestRef op);
505
506 void _sync_reply_no_cookie(MonOpRequestRef op);
507
508 void handle_sync_get_cookie(MonOpRequestRef op);
509 void handle_sync_get_chunk(MonOpRequestRef op);
510 void handle_sync_finish(MonOpRequestRef op);
511
512 void handle_sync_cookie(MonOpRequestRef op);
513 void handle_sync_forward(MonOpRequestRef op);
514 void handle_sync_chunk(MonOpRequestRef op);
515 void handle_sync_no_cookie(MonOpRequestRef op);
516
517 /**
518 * @} // Synchronization
519 */
520
521 std::list<Context*> waitfor_quorum;
522 std::list<Context*> maybe_wait_for_quorum;
523
524 /**
525 * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
526 * @{
527 *
528 * We use time checks to keep track of any clock drifting going on in the
529 * cluster. This is accomplished by periodically pinging each monitor in the
530 * quorum and registering its response time on a map, assessing how much its
531 * clock has drifted. We also take this opportunity to assess the latency
532 * on response.
533 *
534 * This mechanism works as follows:
535 *
536 * - Leader sends out a 'PING' message to each other monitor in the quorum.
537 * The message is timestamped with the leader's current time. The leader's
538 * current time is recorded in a map, associated with each peon's
539 * instance.
540 * - The peon replies to the leader with a timestamped 'PONG' message.
541 * - The leader calculates a delta between the peon's timestamp and its
542 * current time and stashes it.
543 * - The leader also calculates the time it took to receive the 'PONG'
544 * since the 'PING' was sent, and stashes an approximate latency estimate.
545 * - Once all the quorum members have pong'ed, the leader will share the
546 * clock skew and latency maps with all the monitors in the quorum.
547 */
548 std::map<int, utime_t> timecheck_waiting;
549 std::map<int, double> timecheck_skews;
550 std::map<int, double> timecheck_latencies;
551 // odd value means we are mid-round; even value means the round has
552 // finished.
553 version_t timecheck_round;
554 unsigned int timecheck_acks;
555 utime_t timecheck_round_start;
556 friend class HealthMonitor;
557 /* When we hit a skew we will start a new round based off of
558 * 'mon_timecheck_skew_interval'. Each new round will be backed off
559 * until we hit 'mon_timecheck_interval' -- which is the typical
560 * interval when not in the presence of a skew.
561 *
562 * This variable tracks the number of rounds with skews since last clean
563 * so that we can report to the user and properly adjust the backoff.
564 */
565 uint64_t timecheck_rounds_since_clean;
566 /**
567 * Time Check event.
568 */
569 Context *timecheck_event;
570
571 void timecheck_start();
572 void timecheck_finish();
573 void timecheck_start_round();
574 void timecheck_finish_round(bool success = true);
575 void timecheck_cancel_round();
576 void timecheck_cleanup();
577 void timecheck_reset_event();
578 void timecheck_check_skews();
579 void timecheck_report();
580 void timecheck();
581 health_status_t timecheck_status(std::ostringstream &ss,
582 const double skew_bound,
583 const double latency);
584 void handle_timecheck_leader(MonOpRequestRef op);
585 void handle_timecheck_peon(MonOpRequestRef op);
586 void handle_timecheck(MonOpRequestRef op);
587
588 /**
589 * Returns 'true' if this is considered to be a skew; 'false' otherwise.
590 */
591 bool timecheck_has_skew(const double skew_bound, double *abs) const {
592 double abs_skew = std::fabs(skew_bound);
593 if (abs)
594 *abs = abs_skew;
595 return (abs_skew > g_conf()->mon_clock_drift_allowed);
596 }
597
598 /**
599 * @}
600 */
601 /**
602 * Handle ping messages from others.
603 */
604 void handle_ping(MonOpRequestRef op);
605
606 Context *probe_timeout_event = nullptr; // for probing
607
608 void reset_probe_timeout();
609 void cancel_probe_timeout();
610 void probe_timeout(int r);
611
612 void _apply_compatset_features(CompatSet &new_features);
613
614 public:
615 epoch_t get_epoch();
616 int get_leader() const { return leader; }
617 std::string get_leader_name() {
618 return quorum.empty() ? std::string() : monmap->get_name(leader);
619 }
620 const std::set<int>& get_quorum() const { return quorum; }
621 std::list<std::string> get_quorum_names() {
622 std::list<std::string> q;
623 for (auto p = quorum.begin(); p != quorum.end(); ++p)
624 q.push_back(monmap->get_name(*p));
625 return q;
626 }
627 uint64_t get_quorum_con_features() const {
628 return quorum_con_features;
629 }
630 mon_feature_t get_quorum_mon_features() const {
631 return quorum_mon_features;
632 }
633 uint64_t get_required_features() const {
634 return required_features;
635 }
636 mon_feature_t get_required_mon_features() const {
637 return monmap->get_required_features();
638 }
639 void apply_quorum_to_compatset_features();
640 void apply_monmap_to_compatset_features();
641 void calc_quorum_requirements();
642
643 void get_combined_feature_map(FeatureMap *fm);
644
645 private:
646 void _reset(); ///< called from bootstrap, start_, or join_election
647 void wait_for_paxos_write();
648 void _finish_svc_election(); ///< called by {win,lose}_election
649 void respawn();
650 public:
651 void bootstrap();
652 void join_election();
653 void start_election();
654 void win_standalone_election();
655 // end election (called by Elector)
656 void win_election(epoch_t epoch, const std::set<int>& q,
657 uint64_t features,
658 const mon_feature_t& mon_features,
659 ceph_release_t min_mon_release,
660 const std::map<int,Metadata>& metadata);
661 void lose_election(epoch_t epoch, std::set<int>& q, int l,
662 uint64_t features,
663 const mon_feature_t& mon_features,
664 ceph_release_t min_mon_release);
665 // end election (called by Elector)
666 void finish_election();
667
668 void update_logger();
669
670 /**
671 * Array holding the PaxosServices serviced by this Monitor, indexed by PAXOS_* id.
672 */
673 std::array<std::unique_ptr<PaxosService>, PAXOS_NUM> paxos_service;
674
675 class MDSMonitor *mdsmon() {
676 return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP].get();
677 }
678
679 class MonmapMonitor *monmon() {
680 return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP].get();
681 }
682
683 class OSDMonitor *osdmon() {
684 return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP].get();
685 }
686
687 class AuthMonitor *authmon() {
688 return (class AuthMonitor *)paxos_service[PAXOS_AUTH].get();
689 }
690
691 class LogMonitor *logmon() {
692 return (class LogMonitor*) paxos_service[PAXOS_LOG].get();
693 }
694
695 class MgrMonitor *mgrmon() {
696 return (class MgrMonitor*) paxos_service[PAXOS_MGR].get();
697 }
698
699 class MgrStatMonitor *mgrstatmon() {
700 return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT].get();
701 }
702
703 class HealthMonitor *healthmon() {
704 return (class HealthMonitor*) paxos_service[PAXOS_HEALTH].get();
705 }
706
707 class ConfigMonitor *configmon() {
708 return (class ConfigMonitor*) paxos_service[PAXOS_CONFIG].get();
709 }
710
711 class KVMonitor *kvmon() {
712 return (class KVMonitor*) paxos_service[PAXOS_KV].get();
713 }
714
715 friend class Paxos;
716 friend class OSDMonitor;
717 friend class MDSMonitor;
718 friend class MonmapMonitor;
719 friend class LogMonitor;
720 friend class KVMonitor;
721
722 // -- sessions --
723 MonSessionMap session_map;
724 ceph::mutex session_map_lock = ceph::make_mutex("Monitor::session_map_lock");
725 AdminSocketHook *admin_hook;
726
727 template<typename Func, typename...Args>
728 void with_session_map(Func&& func) {
729 std::lock_guard l(session_map_lock);
730 std::forward<Func>(func)(session_map);
731 }
732 void send_latest_monmap(Connection *con);
733
734 // messages
735 void handle_get_version(MonOpRequestRef op);
736 void handle_subscribe(MonOpRequestRef op);
737 void handle_mon_get_map(MonOpRequestRef op);
738
739 static void _generate_command_map(cmdmap_t& cmdmap,
740 std::map<std::string,std::string> &param_str_map);
741 static const MonCommand *_get_moncommand(
742 const std::string &cmd_prefix,
743 const std::vector<MonCommand>& cmds);
744 bool _allowed_command(MonSession *s, const std::string& module,
745 const std::string& prefix,
746 const cmdmap_t& cmdmap,
747 const std::map<std::string,std::string>& param_str_map,
748 const MonCommand *this_cmd);
749 void get_mon_status(ceph::Formatter *f);
750 void _quorum_status(ceph::Formatter *f, std::ostream& ss);
751 bool _add_bootstrap_peer_hint(std::string_view cmd, const cmdmap_t& cmdmap,
752 std::ostream& ss);
753 void handle_tell_command(MonOpRequestRef op);
754 void handle_command(MonOpRequestRef op);
755 void handle_route(MonOpRequestRef op);
756
757 int get_mon_metadata(int mon, ceph::Formatter *f, std::ostream& err);
758 int print_nodes(ceph::Formatter *f, std::ostream& err);
759
760 // track metadata reported by win_election()
761 std::map<int, Metadata> mon_metadata;
762 std::map<int, Metadata> pending_metadata;
763
764 /**
765 *
766 */
767 struct health_cache_t {
768 health_status_t overall;
769 std::string summary;
770
771 void reset() {
772 // health_status_t doesn't really have a NONE value and we're not
773 // okay with setting something else (say, HEALTH_ERR). so just
774 // leave it be.
775 summary.clear();
776 }
777 } health_status_cache;
778
779 Context *health_tick_event = nullptr;
780 Context *health_interval_event = nullptr;
781
782 void health_tick_start();
783 void health_tick_stop();
784 ceph::real_clock::time_point health_interval_calc_next_update();
785 void health_interval_start();
786 void health_interval_stop();
787 void health_events_cleanup();
788
789 void health_to_clog_update_conf(const std::set<std::string> &changed);
790
791 void do_health_to_clog_interval();
792 void do_health_to_clog(bool force = false);
793
794 void log_health(
795 const health_check_map_t& updated,
796 const health_check_map_t& previous,
797 MonitorDBStore::TransactionRef t);
798
799 void update_pending_metadata();
800
801 protected:
802
803 class HealthCheckLogStatus {
804 public:
805 health_status_t severity;
806 std::string last_message;
807 utime_t updated_at = 0;
808 HealthCheckLogStatus(health_status_t severity_,
809 const std::string &last_message_,
810 utime_t updated_at_)
811 : severity(severity_),
812 last_message(last_message_),
813 updated_at(updated_at_)
814 {}
815 };
816 std::map<std::string, HealthCheckLogStatus> health_check_log_times;
817
818 public:
819
820 void get_cluster_status(std::stringstream &ss, ceph::Formatter *f,
821 MonSession *session);
822
823 void reply_command(MonOpRequestRef op, int rc, const std::string &rs, version_t version);
824 void reply_command(MonOpRequestRef op, int rc, const std::string &rs, ceph::buffer::list& rdata, version_t version);
825
826 void reply_tell_command(MonOpRequestRef op, int rc, const std::string &rs);
827
828
829
830 void handle_probe(MonOpRequestRef op);
831 /**
832 * Handle a Probe Operation, replying with our name, quorum and known versions.
833 *
834 * We use the MMonProbe message class for anything and everything related with
835 * Monitor probing. One of the operations relates directly with the probing
836 * itself, in which we receive a probe request and to which we reply with
837 * our name, our quorum and the known versions for each Paxos service. Thus the
838 * redundant function name. This reply will obviously be sent to the one
839 * probing/requesting these infos.
840 *
841 * @todo Add @pre and @post
842 *
843 * @param op A Probe message, with an operation of type Probe.
844 */
845 void handle_probe_probe(MonOpRequestRef op);
846 void handle_probe_reply(MonOpRequestRef op);
847
848 // request routing
849 struct RoutedRequest {
850 uint64_t tid;
851 ceph::buffer::list request_bl;
852 MonSession *session;
853 ConnectionRef con;
854 uint64_t con_features;
855 MonOpRequestRef op;
856
857 RoutedRequest() : tid(0), session(NULL), con_features(0) {}
858 ~RoutedRequest() {
859 if (session)
860 session->put();
861 }
862 };
863 uint64_t routed_request_tid;
864 std::map<uint64_t, RoutedRequest*> routed_requests;
865
866 void forward_request_leader(MonOpRequestRef op);
867 void handle_forward(MonOpRequestRef op);
868 void send_reply(MonOpRequestRef op, Message *reply);
869 void no_reply(MonOpRequestRef op);
870 void resend_routed_requests();
871 void remove_session(MonSession *s);
872 void remove_all_sessions();
873 void waitlist_or_zap_client(MonOpRequestRef op);
874
875 void send_mon_message(Message *m, int rank);
876 /** can_change_external_state if we can do things like
877 * call elections as a result of the new map.
878 */
879 void notify_new_monmap(bool can_change_external_state=false, bool remove_rank_elector=true);
880
881 public:
882 struct C_Command : public C_MonOp {
883 Monitor &mon;
884 int rc;
885 std::string rs;
886 ceph::buffer::list rdata;
887 version_t version;
888 C_Command(Monitor &_mm, MonOpRequestRef _op, int r, std::string s, version_t v) :
889 C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){}
890 C_Command(Monitor &_mm, MonOpRequestRef _op, int r, std::string s, ceph::buffer::list rd, version_t v) :
891 C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){}
892
893 void _finish(int r) override {
894 auto m = op->get_req<MMonCommand>();
895 if (r >= 0) {
896 std::ostringstream ss;
897 if (!op->get_req()->get_connection()) {
898 ss << "connection dropped for command ";
899 } else {
900 MonSession *s = op->get_session();
901
902 // if client drops we may not have a session to draw information from.
903 if (s) {
904 ss << "from='" << s->name << " " << s->addrs << "' "
905 << "entity='" << s->entity_name << "' ";
906 } else {
907 ss << "session dropped for command ";
908 }
909 }
910 cmdmap_t cmdmap;
911 std::ostringstream ds;
912 std::string prefix;
913 cmdmap_from_json(m->cmd, &cmdmap, ds);
914 cmd_getval(cmdmap, "prefix", prefix);
915 if (prefix != "config set" && prefix != "config-key set")
916 ss << "cmd='" << m->cmd << "': finished";
917
918 mon.audit_clog->info() << ss.str();
919 mon.reply_command(op, rc, rs, rdata, version);
920 }
921 else if (r == -ECANCELED)
922 return;
923 else if (r == -EAGAIN)
924 mon.dispatch_op(op);
925 else
926 ceph_abort_msg("bad C_Command return value");
927 }
928 };
929
930 private:
931 class C_RetryMessage : public C_MonOp {
932 Monitor *mon;
933 public:
934 C_RetryMessage(Monitor *m, MonOpRequestRef op) :
935 C_MonOp(op), mon(m) { }
936
937 void _finish(int r) override {
938 if (r == -EAGAIN || r >= 0)
939 mon->dispatch_op(op);
940 else if (r == -ECANCELED)
941 return;
942 else
943 ceph_abort_msg("bad C_RetryMessage return value");
944 }
945 };
946
947 //ms_dispatch handles a lot of logic and we want to reuse it
948 //on forwarded messages, so we create a non-locking version for this class
949 void _ms_dispatch(Message *m);
950 bool ms_dispatch(Message *m) override {
951 std::lock_guard l{lock};
952 _ms_dispatch(m);
953 return true;
954 }
955 void dispatch_op(MonOpRequestRef op);
956 //mon_caps is used for un-connected messages from monitors
957 MonCap mon_caps;
958 bool get_authorizer(int dest_type, AuthAuthorizer **authorizer);
959 public: // for AuthMonitor msgr1:
960 int ms_handle_authentication(Connection *con) override;
961 private:
962 void ms_handle_accept(Connection *con) override;
963 bool ms_handle_reset(Connection *con) override;
964 void ms_handle_remote_reset(Connection *con) override {}
965 bool ms_handle_refused(Connection *con) override;
966
// AuthClient
// Overrides grouped under the AuthClient marker.
// NOTE(review): the exact contract of each hook (return codes, ownership of
// the out-parameters) is defined by the interface being overridden, which is
// not visible here -- confirm in auth/AuthClient.h.
int get_auth_request(
  Connection *con,
  AuthConnectionMeta *auth_meta,
  uint32_t *method,
  std::vector<uint32_t> *preferred_modes,
  ceph::buffer::list *out) override;
int handle_auth_reply_more(
  Connection *con,
  AuthConnectionMeta *auth_meta,
  const ceph::buffer::list& bl,
  ceph::buffer::list *reply) override;
int handle_auth_done(
  Connection *con,
  AuthConnectionMeta *auth_meta,
  uint64_t global_id,
  uint32_t con_mode,
  const ceph::buffer::list& bl,
  CryptoKey *session_key,
  std::string *connection_secret) override;
int handle_auth_bad_method(
  Connection *con,
  AuthConnectionMeta *auth_meta,
  uint32_t old_auth_method,
  int result,
  const std::vector<uint32_t>& allowed_methods,
  const std::vector<uint32_t>& allowed_modes) override;
// /AuthClient
// AuthServer
// Override grouped under the AuthServer marker.
// NOTE(review): `more` presumably distinguishes a continuation of an
// exchange from its first request -- confirm in auth/AuthServer.h.
int handle_auth_request(
  Connection *con,
  AuthConnectionMeta *auth_meta,
  bool more,
  uint32_t auth_method,
  const ceph::buffer::list& bl,
  ceph::buffer::list *reply) override;
// /AuthServer
1004
// Keyring helpers.
// NOTE(review): judging by the signatures, write_default_keyring() consumes a
// buffer and extract_save_mon_key() operates on a KeyRing -- confirm details
// in the implementation.
int write_default_keyring(ceph::buffer::list& bl);
void extract_save_mon_key(KeyRing& keyring);

// Metadata helpers.
void collect_metadata(Metadata *m);
int load_metadata();
// Two overloads of count_metadata() for the same `field`: one takes a
// Formatter, the other takes a string -> int map as an out-parameter.
void count_metadata(const std::string& field, ceph::Formatter *f);
void count_metadata(const std::string& field, std::map<std::string,int> *out);
// get_all_versions() gathers version information from daemons for health check
void get_all_versions(std::map<std::string, std::list<std::string>> &versions);
// NOTE(review): same parameter type as get_all_versions(); how the two differ
// in scope is not visible here -- confirm in the implementation.
void get_versions(std::map<std::string, std::list<std::string>> &versions);
1015
// features
// Static accessors returning CompatSet values.
static CompatSet get_initial_supported_features();
// New CEPH_MON_FEATURE_INCOMPAT_* features must be added to this function
// (see the reminder next to those macros later in this header).
static CompatSet get_supported_features();
static CompatSet get_legacy_features();
/// read the ondisk features into the CompatSet pointed to by read_features
static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
// NOTE(review): presumably the per-instance counterpart of
// read_features_off_disk() -- confirm in the implementation.
void read_features();
// Takes a store transaction; NOTE(review): presumably queues the feature set
// on it rather than committing -- confirm in the implementation.
void write_features(MonitorDBStore::TransactionRef t);

OpTracker op_tracker;
1026
public:
// NOTE(review): two Messenger pointers are taken (m and mgr_m); ownership of
// the pointer arguments is not visible here -- confirm in the implementation.
Monitor(CephContext *cct_, std::string nm, MonitorDBStore *s,
        Messenger *m, Messenger *mgr_m, MonMap *map);
~Monitor() override;

// Static: callable on a store without constructing a Monitor.
static int check_features(MonitorDBStore *store);

// config observer
const char** get_tracked_conf_keys() const override;
void handle_conf_change(const ConfigProxy& conf,
                        const std::set<std::string> &changed) override;

// Lifecycle and periodic entry points.
// NOTE(review): the required call order (e.g. preinit() before init()) is not
// visible here -- confirm against the callers.
void update_log_clients();
int sanitize_options();
int preinit();
int init();
void init_paxos();
// *need_bootstrap is an out-parameter.
void refresh_from_paxos(bool *need_bootstrap);
void shutdown();
void tick();

void handle_signal(int sig);

int mkfs(ceph::buffer::list& osdmapbl);

/**
 * check cluster_fsid file
 *
 * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
 *
 * NOTE(review): confirm whether the mismatch case yields EEXIST or -EEXIST.
 */
int check_fsid();

/**
 * write cluster_fsid file
 *
 * @return 0 on success, or negative error code
 */
int write_fsid();
// Overload taking a store transaction.
int write_fsid(MonitorDBStore::TransactionRef t);

// Runs an admin command; takes separate streams for error text (err) and
// regular output (out).
int do_admin_command(std::string_view command, const cmdmap_t& cmdmap,
                     ceph::Formatter *f,
                     std::ostream& err,
                     std::ostream& out);
1071
1072 private:
1073 // don't allow copying
1074 Monitor(const Monitor& rhs);
1075 Monitor& operator=(const Monitor &rhs);
1076
public:
// Static helper taking a command list, a Formatter, a feature mask and an
// output buffer.
// NOTE(review): presumably renders `commands` through `f` into `*rdata`, with
// `features` controlling the encoding -- confirm in the implementation.
static void format_command_descriptions(const std::vector<MonCommand> &commands,
                                        ceph::Formatter *f,
                                        uint64_t features,
                                        ceph::buffer::list *rdata);
1082
1083 const std::vector<MonCommand> &get_local_commands(mon_feature_t f) {
1084 if (f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
1085 return local_mon_commands;
1086 } else {
1087 return prenautilus_local_mon_commands;
1088 }
1089 }
1090 const ceph::buffer::list& get_local_commands_bl(mon_feature_t f) {
1091 if (f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
1092 return local_mon_commands_bl;
1093 } else {
1094 return prenautilus_local_mon_commands_bl;
1095 }
1096 }
1097 void set_leader_commands(const std::vector<MonCommand>& cmds) {
1098 leader_mon_commands = cmds;
1099 }
1100
// NOTE(review): presumably reports whether the configured auth settings need a
// keyring -- the criteria live in the implementation file; confirm there.
bool is_keyring_required();
1102 };
1103
// Monitor incompat features.  Each macro expands to a CompatSet::Feature
// built from a numeric id and a human-readable description.
#define CEPH_MON_FEATURE_INCOMPAT_BASE \
  CompatSet::Feature(1, "initial feature set (~v.18)")
#define CEPH_MON_FEATURE_INCOMPAT_GV \
  CompatSet::Feature(2, "global version sequencing (v0.52)")
#define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS \
  CompatSet::Feature(3, "single paxos with k/v store (v0.\?)")
#define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES \
  CompatSet::Feature(4, "support erasure code pools")
#define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC \
  CompatSet::Feature(5, "new-style osdmap encoding")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 \
  CompatSet::Feature(6, "support isa/lrc erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 \
  CompatSet::Feature(7, "support shec erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN \
  CompatSet::Feature(8, "support monmap features")
#define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS \
  CompatSet::Feature(9, "luminous ondisk layout")
#define CEPH_MON_FEATURE_INCOMPAT_MIMIC \
  CompatSet::Feature(10, "mimic ondisk layout")
#define CEPH_MON_FEATURE_INCOMPAT_NAUTILUS \
  CompatSet::Feature(11, "nautilus ondisk layout")
#define CEPH_MON_FEATURE_INCOMPAT_OCTOPUS \
  CompatSet::Feature(12, "octopus ondisk layout")
#define CEPH_MON_FEATURE_INCOMPAT_PACIFIC \
  CompatSet::Feature(13, "pacific ondisk layout")
#define CEPH_MON_FEATURE_INCOMPAT_QUINCY \
  CompatSet::Feature(14, "quincy ondisk layout")
// make sure you add your feature to Monitor::get_supported_features
1119
1120
1121 /* Callers use:
1122 *
1123 * new C_MonContext{...}
1124 *
1125 * instead of
1126 *
1127 * new C_MonContext(...)
1128 *
1129 * because of gcc bug [1].
1130 *
1131 * [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85883
1132 */
1133 template<typename T>
1134 class C_MonContext : public LambdaContext<T> {
1135 public:
1136 C_MonContext(const Monitor* m, T&& f) :
1137 LambdaContext<T>(std::forward<T>(f)),
1138 mon(m)
1139 {}
1140 void finish(int r) override {
1141 if (mon->is_shutdown())
1142 return;
1143 LambdaContext<T>::finish(r);
1144 }
1145 private:
1146 const Monitor* mon;
1147 };
1148
1149 #endif