]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/Monitor.h
update dh_systemd restart patch for pacific
[ceph.git] / ceph / src / mon / Monitor.h
CommitLineData
f67539c2 1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
7c673cae
FG
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
f67539c2 10 * License version 2.1, as published by the Free Software
7c673cae 11 * Foundation. See file COPYING.
f67539c2 12 *
7c673cae
FG
13 */
14
f67539c2
TL
15/*
16 * This is the top level monitor. It runs on each machine in the Monitor
17 * Cluster. The election of a leader for the paxos algorithm only happens
18 * once per machine via the elector. There is a separate paxos instance (state)
19 * kept for each of the system components: Object Store Device (OSD) Monitor,
7c673cae
FG
20 * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
21 */
22
23#ifndef CEPH_MONITOR_H
24#define CEPH_MONITOR_H
25
26#include <errno.h>
27#include <cmath>
11fdf7f2 28#include <string>
f67539c2 29#include <array>
7c673cae
FG
30
31#include "include/types.h"
224ce89b 32#include "include/health.h"
7c673cae
FG
33#include "msg/Messenger.h"
34
35#include "common/Timer.h"
36
224ce89b 37#include "health_check.h"
7c673cae
FG
38#include "MonMap.h"
39#include "Elector.h"
40#include "Paxos.h"
41#include "Session.h"
c07f9fc5 42#include "MonCommand.h"
7c673cae 43
11fdf7f2
TL
44
45#include "common/config_obs.h"
7c673cae 46#include "common/LogClient.h"
11fdf7f2
TL
47#include "auth/AuthClient.h"
48#include "auth/AuthServer.h"
7c673cae
FG
49#include "auth/cephx/CephxKeyServer.h"
50#include "auth/AuthMethodList.h"
51#include "auth/KeyRing.h"
9f95a23c 52#include "include/common_fwd.h"
7c673cae
FG
53#include "messages/MMonCommand.h"
54#include "mon/MonitorDBStore.h"
7c673cae
FG
55#include "mgr/MgrClient.h"
56
57#include "mon/MonOpRequest.h"
58#include "common/WorkQueue.h"
59
adb31ebb 60using namespace TOPNSPC::common;
7c673cae
FG
61
62#define CEPH_MON_PROTOCOL 13 /* cluster internal */
63
64
// Cluster-wide counter identifiers. The range is delimited by
// l_cluster_first / l_cluster_last; entries in between are numbered
// consecutively starting at l_cluster_first + 1.
// NOTE(review): presumably PerfCounters indices for cluster_logger -- confirm.
enum {
  l_cluster_first = 555000,
  // monitors
  l_cluster_num_mon,
  l_cluster_num_mon_quorum,
  // OSDs
  l_cluster_num_osd,
  l_cluster_num_osd_up,
  l_cluster_num_osd_in,
  l_cluster_osd_epoch,
  l_cluster_osd_bytes,
  l_cluster_osd_bytes_used,
  l_cluster_osd_bytes_avail,
  // pools and placement groups
  l_cluster_num_pool,
  l_cluster_num_pg,
  l_cluster_num_pg_active_clean,
  l_cluster_num_pg_active,
  l_cluster_num_pg_peering,
  // objects
  l_cluster_num_object,
  l_cluster_num_object_degraded,
  l_cluster_num_object_misplaced,
  l_cluster_num_object_unfound,
  l_cluster_num_bytes,
  l_cluster_last,
};
88
// Per-monitor counter identifiers, delimited by l_mon_first / l_mon_last and
// numbered consecutively in between.
// NOTE(review): presumably PerfCounters indices for `logger` -- confirm.
enum {
  l_mon_first = 456000,
  // sessions
  l_mon_num_sessions,
  l_mon_session_add,
  l_mon_session_rm,
  l_mon_session_trim,
  // elections
  l_mon_num_elections,
  l_mon_election_call,
  l_mon_election_win,
  l_mon_election_lose,
  l_mon_last,
};
101
7c673cae
FG
102class PaxosService;
103
7c673cae
FG
104class AdminSocketHook;
105
7c673cae
FG
106#define COMPAT_SET_LOC "feature_set"
107
7c673cae 108class Monitor : public Dispatcher,
11fdf7f2
TL
109 public AuthClient,
110 public AuthServer,
7c673cae
FG
111 public md_config_obs_t {
112public:
11fdf7f2
TL
113 int orig_argc = 0;
114 const char **orig_argv = nullptr;
115
7c673cae 116 // me
f67539c2 117 std::string name;
7c673cae
FG
118 int rank;
119 Messenger *messenger;
120 ConnectionRef con_self;
9f95a23c 121 ceph::mutex lock = ceph::make_mutex("Monitor::lock");
7c673cae
FG
122 SafeTimer timer;
123 Finisher finisher;
124 ThreadPool cpu_tp; ///< threadpool for CPU intensive work
11fdf7f2
TL
125
126 ceph::mutex auth_lock = ceph::make_mutex("Monitor::auth_lock");
127
7c673cae
FG
128 /// true if we have ever joined a quorum. if false, we are either a
129 /// new cluster, a newly joining monitor, or a just-upgraded
130 /// monitor.
131 bool has_ever_joined;
132
133 PerfCounters *logger, *cluster_logger;
134 bool cluster_logger_registered;
135
136 void register_cluster_logger();
137 void unregister_cluster_logger();
138
139 MonMap *monmap;
140 uuid_d fingerprint;
141
f67539c2 142 std::set<entity_addrvec_t> extra_probe_peers;
7c673cae
FG
143
144 LogClient log_client;
145 LogChannelRef clog;
146 LogChannelRef audit_clog;
147 KeyRing keyring;
148 KeyServer key_server;
149
150 AuthMethodList auth_cluster_required;
151 AuthMethodList auth_service_required;
152
153 CompatSet features;
154
f67539c2
TL
155 std::vector<MonCommand> leader_mon_commands; // quorum leader's commands
156 std::vector<MonCommand> local_mon_commands; // commands i support
157 ceph::buffer::list local_mon_commands_bl; // encoded version of above
d2e6a577 158
f67539c2
TL
159 std::vector<MonCommand> prenautilus_local_mon_commands;
160 ceph::buffer::list prenautilus_local_mon_commands_bl;
7c673cae
FG
161
162 Messenger *mgr_messenger;
163 MgrClient mgr_client;
164 uint64_t mgr_proxy_bytes = 0; // in-flight proxied mgr command message bytes
11fdf7f2 165 std::string gss_ktfile_client{};
31f18b77 166
7c673cae
FG
167private:
168 void new_tick();
169
170 // -- local storage --
171public:
172 MonitorDBStore *store;
f67539c2
TL
173 static const std::string MONITOR_NAME;
174 static const std::string MONITOR_STORE_PREFIX;
7c673cae
FG
175
176 // -- monitor state --
177private:
// Monitor state values. Numbering starts at 1 (STATE_INIT) and increases by
// one per entry; `state` holds the current value and starts as STATE_INIT.
enum {
  STATE_INIT = 1,
  STATE_PROBING,
  STATE_SYNCHRONIZING,
  STATE_ELECTING,
  STATE_LEADER,
  STATE_PEON,
  STATE_SHUTDOWN
};
int state = STATE_INIT;
7c673cae
FG
188
189public:
190 static const char *get_state_name(int s) {
191 switch (s) {
192 case STATE_PROBING: return "probing";
193 case STATE_SYNCHRONIZING: return "synchronizing";
194 case STATE_ELECTING: return "electing";
195 case STATE_LEADER: return "leader";
196 case STATE_PEON: return "peon";
197 case STATE_SHUTDOWN: return "shutdown";
198 default: return "???";
199 }
200 }
201 const char *get_state_name() const {
202 return get_state_name(state);
203 }
204
11fdf7f2 205 bool is_init() const { return state == STATE_INIT; }
7c673cae
FG
206 bool is_shutdown() const { return state == STATE_SHUTDOWN; }
207 bool is_probing() const { return state == STATE_PROBING; }
208 bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
209 bool is_electing() const { return state == STATE_ELECTING; }
210 bool is_leader() const { return state == STATE_LEADER; }
211 bool is_peon() const { return state == STATE_PEON; }
212
213 const utime_t &get_leader_since() const;
214
215 void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
216
11fdf7f2
TL
217 std::vector<DaemonHealthMetric> get_health_metrics();
218
7c673cae
FG
219 // -- elector --
220private:
f67539c2 221 std::unique_ptr<Paxos> paxos;
7c673cae
FG
222 Elector elector;
223 friend class Elector;
224
225 /// features we require of peers (based on on-disk compatset)
226 uint64_t required_features;
227
228 int leader; // current leader (to best of knowledge)
f67539c2
TL
229 std::set<int> quorum; // current active set of monitors (if !starting)
230 ceph::mono_clock::time_point quorum_since; // when quorum formed
7c673cae
FG
231 utime_t leader_since; // when this monitor became the leader, if it is the leader
232 utime_t exited_quorum; // time detected as not in quorum; 0 if in
31f18b77
FG
233
234 // map of counts of connected clients, by type and features, for
235 // each quorum mon
f67539c2 236 std::map<int,FeatureMap> quorum_feature_map;
31f18b77 237
7c673cae
FG
238 /**
239 * Intersection of quorum member's connection feature bits.
240 */
241 uint64_t quorum_con_features;
242 /**
243 * Intersection of quorum members mon-specific feature bits
244 */
245 mon_feature_t quorum_mon_features;
7c673cae 246
9f95a23c 247 ceph_release_t quorum_min_mon_release{ceph_release_t::unknown};
11fdf7f2 248
f67539c2
TL
249 std::set<std::string> outside_quorum;
250
251 bool stretch_mode_engaged{false};
252 bool degraded_stretch_mode{false};
253 bool recovering_stretch_mode{false};
254 string stretch_bucket_divider;
255 map<string, set<string>> dead_mon_buckets; // bucket->mon ranks, locations with no live mons
256 set<string> up_mon_buckets; // locations with a live mon
257 void do_stretch_mode_election_work();
258
259 bool session_stretch_allowed(MonSession *s, MonOpRequestRef& op);
260 void disconnect_disallowed_stretch_sessions();
261 void set_elector_disallowed_leaders(bool allow_election);
262public:
263 bool is_stretch_mode() { return stretch_mode_engaged; }
264 bool is_degraded_stretch_mode() { return degraded_stretch_mode; }
265 bool is_recovering_stretch_mode() { return recovering_stretch_mode; }
266 void maybe_engage_stretch_mode();
267 void maybe_go_degraded_stretch_mode();
268 void trigger_degraded_stretch_mode(const set<string>& dead_mons,
269 const set<int>& dead_buckets);
270 void set_degraded_stretch_mode();
271 void go_recovery_stretch_mode();
272 void trigger_healthy_stretch_mode();
273 void set_healthy_stretch_mode();
274 void enable_stretch_mode();
275
276
277private:
7c673cae
FG
278
279 /**
280 * @defgroup Monitor_h_scrub
281 * @{
282 */
283 version_t scrub_version; ///< paxos version we are scrubbing
f67539c2 284 std::map<int,ScrubResult> scrub_result; ///< results so far
7c673cae
FG
285
286 /**
287 * trigger a cross-mon scrub
288 *
289 * Verify all mons are storing identical content
290 */
291 int scrub_start();
292 int scrub();
293 void handle_scrub(MonOpRequestRef op);
294 bool _scrub(ScrubResult *r,
f67539c2 295 std::pair<std::string,std::string> *start,
7c673cae
FG
296 int *num_keys);
297 void scrub_check_results();
298 void scrub_timeout();
299 void scrub_finish();
300 void scrub_reset();
f67539c2 301 void scrub_update_interval(ceph::timespan interval);
7c673cae
FG
302
303 Context *scrub_event; ///< periodic event to trigger scrub (leader)
304 Context *scrub_timeout_event; ///< scrub round timeout (leader)
305 void scrub_event_start();
306 void scrub_event_cancel();
307 void scrub_reset_timeout();
308 void scrub_cancel_timeout();
309
/// Progress marker for a scrub: the last key pair scrubbed and whether the
/// scrub has finished. A fresh instance has an empty key and finished == false.
struct ScrubState {
  std::pair<std::string,std::string> last_key; ///< last scrubbed key
  bool finished = false;

  ScrubState() = default;
  virtual ~ScrubState() = default;
};
11fdf7f2 317 std::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
7c673cae
FG
318
319 /**
320 * @defgroup Monitor_h_sync Synchronization
321 * @{
322 */
323 /**
324 * @} // provider state
325 */
326 struct SyncProvider {
11fdf7f2 327 entity_addrvec_t addrs;
7c673cae
FG
328 uint64_t cookie; ///< unique cookie for this sync attempt
329 utime_t timeout; ///< when we give up and expire this attempt
330 version_t last_committed; ///< last paxos version on peer
f67539c2 331 std::pair<std::string,std::string> last_key; ///< last key sent to (or on) peer
7c673cae
FG
332 bool full; ///< full scan?
333 MonitorDBStore::Synchronizer synchronizer; ///< iterator
334
335 SyncProvider() : cookie(0), last_committed(0), full(false) {}
336
337 void reset_timeout(CephContext *cct, int grace) {
338 timeout = ceph_clock_now();
339 timeout += grace;
340 }
341 };
342
f67539c2 343 std::map<std::uint64_t, SyncProvider> sync_providers; ///< cookie -> SyncProvider for those syncing from us
7c673cae
FG
344 uint64_t sync_provider_count; ///< counter for issued cookies to keep them unique
345
346 /**
347 * @} // requester state
348 */
11fdf7f2 349 entity_addrvec_t sync_provider; ///< who we are syncing from
7c673cae
FG
350 uint64_t sync_cookie; ///< 0 if we are starting, non-zero otherwise
351 bool sync_full; ///< true if we are a full sync, false for recent catch-up
352 version_t sync_start_version; ///< last_committed at sync start
353 Context *sync_timeout_event; ///< timeout event
354
355 /**
356 * floor for sync source
357 *
358 * When we sync we forget about our old last_committed value which
359 * can be dangerous. For example, if we have a cluster of:
360 *
361 * mon.a: lc 100
362 * mon.b: lc 80
363 * mon.c: lc 100 (us)
364 *
365 * If something forces us to sync (say, corruption, or manual
366 * intervention, or bug), we forget last_committed, and might abort.
367 * If mon.a happens to be down when we come back, we will see:
368 *
369 * mon.b: lc 80
370 * mon.c: lc 0 (us)
371 *
372 * and sync from mon.b, at which point a+b will both have lc 80 and
373 * come online with a majority holding out of date commits.
374 *
375 * Avoid this by preserving our old last_committed value prior to
376 * sync and never going backwards.
377 */
378 version_t sync_last_committed_floor;
379
380 /**
381 * Obtain the synchronization target prefixes in set form.
382 *
383 * We consider a target prefix all those that are relevant when
384 * synchronizing two stores. That is, all those that hold paxos service's
385 * versions, as well as paxos versions, or any control keys such as the
386 * first or last committed version.
387 *
388 * Given the current design, this function should return the name of all and
389 * any available paxos service, plus the paxos name.
390 *
391 * @returns a set of strings referring to the prefixes being synchronized
392 */
f67539c2 393 std::set<std::string> get_sync_targets_names();
7c673cae
FG
394
395 /**
396 * Reset the monitor's sync-related data structures for syncing *from* a peer
397 */
398 void sync_reset_requester();
399
400 /**
401 * Reset sync state related to allowing others to sync from us
402 */
403 void sync_reset_provider();
404
405 /**
406 * Called when a sync attempt times out (requester-side)
407 */
408 void sync_timeout();
409
410 /**
411 * Get the latest monmap for backup purposes during sync
412 */
f67539c2 413 void sync_obtain_latest_monmap(ceph::buffer::list &bl);
7c673cae
FG
414
415 /**
416 * Start sync process
417 *
418 * Start pulling committed state from another monitor.
419 *
420 * @param entity where to pull committed state from
421 * @param full whether to do a full sync or just catch up on recent paxos
422 */
11fdf7f2 423 void sync_start(entity_addrvec_t &addrs, bool full);
7c673cae
FG
424
425public:
426 /**
427 * force a sync on next mon restart
428 */
f67539c2 429 void sync_force(ceph::Formatter *f);
7c673cae
FG
430
431private:
432 /**
433 * store critical state for safekeeping during sync
434 *
435 * We store a few things on the side that we don't want to get clobbered by sync. This
436 * includes the latest monmap and a lower bound on last_committed.
437 */
438 void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
439
440 /**
441 * reset the sync timeout
442 *
443 * This is used on the client to restart if things aren't progressing
444 */
445 void sync_reset_timeout();
446
447 /**
448 * trim stale sync provider state
449 *
450 * If someone is syncing from us and hasn't talked to us recently, expire their state.
451 */
452 void sync_trim_providers();
453
454 /**
455 * Complete a sync
456 *
457 * Finish up a sync after we've gotten all of the chunks.
458 *
459 * @param last_committed final last_committed value from provider
460 */
461 void sync_finish(version_t last_committed);
462
463 /**
464 * request the next chunk from the provider
465 */
466 void sync_get_next_chunk();
467
468 /**
469 * handle sync message
470 *
471 * @param m Sync message with operation type MMonSync::OP_START_CHUNKS
472 */
473 void handle_sync(MonOpRequestRef op);
474
475 void _sync_reply_no_cookie(MonOpRequestRef op);
476
477 void handle_sync_get_cookie(MonOpRequestRef op);
478 void handle_sync_get_chunk(MonOpRequestRef op);
479 void handle_sync_finish(MonOpRequestRef op);
480
481 void handle_sync_cookie(MonOpRequestRef op);
482 void handle_sync_forward(MonOpRequestRef op);
483 void handle_sync_chunk(MonOpRequestRef op);
484 void handle_sync_no_cookie(MonOpRequestRef op);
485
486 /**
487 * @} // Synchronization
488 */
489
f67539c2
TL
490 std::list<Context*> waitfor_quorum;
491 std::list<Context*> maybe_wait_for_quorum;
7c673cae
FG
492
493 /**
494 * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
495 * @{
496 *
497 * We use time checks to keep track of any clock drifting going on in the
498 * cluster. This is accomplished by periodically pinging each monitor in the
499 * quorum and registering its response time on a map, assessing how much its
500 * clock has drifted. We also take this opportunity to assess the latency
501 * on response.
502 *
503 * This mechanism works as follows:
504 *
505 * - Leader sends out a 'PING' message to each other monitor in the quorum.
506 * The message is timestamped with the leader's current time. The leader's
507 * current time is recorded in a map, associated with each peon's
508 * instance.
509 * - The peon replies to the leader with a timestamped 'PONG' message.
510 * - The leader calculates a delta between the peon's timestamp and its
511 * current time and stashes it.
512 * - The leader also calculates the time it took to receive the 'PONG'
513 * since the 'PING' was sent, and stashes an approximate latency estimate.
514 * - Once all the quorum members have pong'ed, the leader will share the
515 * clock skew and latency maps with all the monitors in the quorum.
516 */
f67539c2
TL
517 std::map<int, utime_t> timecheck_waiting;
518 std::map<int, double> timecheck_skews;
519 std::map<int, double> timecheck_latencies;
7c673cae
FG
520 // odd value means we are mid-round; even value means the round has
521 // finished.
522 version_t timecheck_round;
523 unsigned int timecheck_acks;
524 utime_t timecheck_round_start;
224ce89b 525 friend class HealthMonitor;
7c673cae
FG
526 /* When we hit a skew we will start a new round based off of
527 * 'mon_timecheck_skew_interval'. Each new round will be backed off
528 * until we hit 'mon_timecheck_interval' -- which is the typical
529 * interval when not in the presence of a skew.
530 *
531 * This variable tracks the number of rounds with skews since last clean
532 * so that we can report to the user and properly adjust the backoff.
533 */
534 uint64_t timecheck_rounds_since_clean;
535 /**
536 * Time Check event.
537 */
538 Context *timecheck_event;
539
540 void timecheck_start();
541 void timecheck_finish();
542 void timecheck_start_round();
543 void timecheck_finish_round(bool success = true);
544 void timecheck_cancel_round();
545 void timecheck_cleanup();
546 void timecheck_reset_event();
547 void timecheck_check_skews();
548 void timecheck_report();
549 void timecheck();
f67539c2 550 health_status_t timecheck_status(std::ostringstream &ss,
7c673cae
FG
551 const double skew_bound,
552 const double latency);
553 void handle_timecheck_leader(MonOpRequestRef op);
554 void handle_timecheck_peon(MonOpRequestRef op);
555 void handle_timecheck(MonOpRequestRef op);
556
557 /**
558 * Returns 'true' if this is considered to be a skew; 'false' otherwise.
559 */
560 bool timecheck_has_skew(const double skew_bound, double *abs) const {
561 double abs_skew = std::fabs(skew_bound);
562 if (abs)
563 *abs = abs_skew;
11fdf7f2 564 return (abs_skew > g_conf()->mon_clock_drift_allowed);
7c673cae
FG
565 }
566
567 /**
568 * @}
569 */
570 /**
571 * Handle ping messages from others.
572 */
573 void handle_ping(MonOpRequestRef op);
574
575 Context *probe_timeout_event = nullptr; // for probing
576
577 void reset_probe_timeout();
578 void cancel_probe_timeout();
579 void probe_timeout(int r);
580
581 void _apply_compatset_features(CompatSet &new_features);
582
583public:
584 epoch_t get_epoch();
585 int get_leader() const { return leader; }
f67539c2
TL
586 std::string get_leader_name() {
587 return quorum.empty() ? std::string() : monmap->get_name(leader);
224ce89b 588 }
f67539c2
TL
589 const std::set<int>& get_quorum() const { return quorum; }
590 std::list<std::string> get_quorum_names() {
591 std::list<std::string> q;
592 for (auto p = quorum.begin(); p != quorum.end(); ++p)
7c673cae
FG
593 q.push_back(monmap->get_name(*p));
594 return q;
595 }
596 uint64_t get_quorum_con_features() const {
597 return quorum_con_features;
598 }
599 mon_feature_t get_quorum_mon_features() const {
600 return quorum_mon_features;
601 }
602 uint64_t get_required_features() const {
603 return required_features;
604 }
605 mon_feature_t get_required_mon_features() const {
606 return monmap->get_required_features();
607 }
608 void apply_quorum_to_compatset_features();
609 void apply_monmap_to_compatset_features();
610 void calc_quorum_requirements();
611
31f18b77
FG
612 void get_combined_feature_map(FeatureMap *fm);
613
7c673cae
FG
614private:
615 void _reset(); ///< called from bootstrap, start_, or join_election
616 void wait_for_paxos_write();
617 void _finish_svc_election(); ///< called by {win,lose}_election
11fdf7f2 618 void respawn();
7c673cae
FG
619public:
620 void bootstrap();
621 void join_election();
622 void start_election();
623 void win_standalone_election();
624 // end election (called by Elector)
f67539c2 625 void win_election(epoch_t epoch, const std::set<int>& q,
7c673cae
FG
626 uint64_t features,
627 const mon_feature_t& mon_features,
9f95a23c 628 ceph_release_t min_mon_release,
f67539c2
TL
629 const std::map<int,Metadata>& metadata);
630 void lose_election(epoch_t epoch, std::set<int>& q, int l,
7c673cae 631 uint64_t features,
11fdf7f2 632 const mon_feature_t& mon_features,
9f95a23c 633 ceph_release_t min_mon_release);
7c673cae
FG
634 // end election (called by Elector)
635 void finish_election();
636
7c673cae
FG
637 void update_logger();
638
639 /**
640 * Vector holding the Services serviced by this Monitor.
641 */
f67539c2 642 std::array<std::unique_ptr<PaxosService>, PAXOS_NUM> paxos_service;
7c673cae
FG
643
644 class MDSMonitor *mdsmon() {
11fdf7f2 645 return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP].get();
7c673cae
FG
646 }
647
648 class MonmapMonitor *monmon() {
11fdf7f2 649 return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP].get();
7c673cae
FG
650 }
651
652 class OSDMonitor *osdmon() {
11fdf7f2 653 return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP].get();
7c673cae
FG
654 }
655
656 class AuthMonitor *authmon() {
11fdf7f2 657 return (class AuthMonitor *)paxos_service[PAXOS_AUTH].get();
7c673cae
FG
658 }
659
660 class LogMonitor *logmon() {
11fdf7f2 661 return (class LogMonitor*) paxos_service[PAXOS_LOG].get();
7c673cae
FG
662 }
663
664 class MgrMonitor *mgrmon() {
11fdf7f2 665 return (class MgrMonitor*) paxos_service[PAXOS_MGR].get();
7c673cae
FG
666 }
667
31f18b77 668 class MgrStatMonitor *mgrstatmon() {
11fdf7f2 669 return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT].get();
31f18b77
FG
670 }
671
b32b8144 672 class HealthMonitor *healthmon() {
11fdf7f2
TL
673 return (class HealthMonitor*) paxos_service[PAXOS_HEALTH].get();
674 }
675
676 class ConfigMonitor *configmon() {
677 return (class ConfigMonitor*) paxos_service[PAXOS_CONFIG].get();
224ce89b
WB
678 }
679
f67539c2
TL
680 class KVMonitor *kvmon() {
681 return (class KVMonitor*) paxos_service[PAXOS_KV].get();
682 }
683
7c673cae
FG
684 friend class Paxos;
685 friend class OSDMonitor;
686 friend class MDSMonitor;
687 friend class MonmapMonitor;
7c673cae 688 friend class LogMonitor;
f67539c2 689 friend class KVMonitor;
7c673cae
FG
690
691 // -- sessions --
692 MonSessionMap session_map;
9f95a23c 693 ceph::mutex session_map_lock = ceph::make_mutex("Monitor::session_map_lock");
7c673cae
FG
694 AdminSocketHook *admin_hook;
695
696 template<typename Func, typename...Args>
697 void with_session_map(Func&& func) {
11fdf7f2 698 std::lock_guard l(session_map_lock);
7c673cae
FG
699 std::forward<Func>(func)(session_map);
700 }
701 void send_latest_monmap(Connection *con);
702
703 // messages
704 void handle_get_version(MonOpRequestRef op);
705 void handle_subscribe(MonOpRequestRef op);
706 void handle_mon_get_map(MonOpRequestRef op);
707
11fdf7f2 708 static void _generate_command_map(cmdmap_t& cmdmap,
f67539c2 709 std::map<std::string,std::string> &param_str_map);
c07f9fc5 710 static const MonCommand *_get_moncommand(
f67539c2
TL
711 const std::string &cmd_prefix,
712 const std::vector<MonCommand>& cmds);
713 bool _allowed_command(MonSession *s, const std::string& module,
714 const std::string& prefix,
11fdf7f2 715 const cmdmap_t& cmdmap,
f67539c2 716 const std::map<std::string,std::string>& param_str_map,
7c673cae 717 const MonCommand *this_cmd);
f67539c2
TL
718 void get_mon_status(ceph::Formatter *f);
719 void _quorum_status(ceph::Formatter *f, std::ostream& ss);
11fdf7f2
TL
720 bool _add_bootstrap_peer_hint(std::string_view cmd, const cmdmap_t& cmdmap,
721 std::ostream& ss);
9f95a23c 722 void handle_tell_command(MonOpRequestRef op);
7c673cae
FG
723 void handle_command(MonOpRequestRef op);
724 void handle_route(MonOpRequestRef op);
725
f67539c2
TL
726 int get_mon_metadata(int mon, ceph::Formatter *f, std::ostream& err);
727 int print_nodes(ceph::Formatter *f, std::ostream& err);
7c673cae 728
f67539c2
TL
729 // track metadata reported by win_election()
730 std::map<int, Metadata> mon_metadata;
731 std::map<int, Metadata> pending_metadata;
7c673cae
FG
732
733 /**
734 *
735 */
736 struct health_cache_t {
737 health_status_t overall;
f67539c2 738 std::string summary;
7c673cae
FG
739
740 void reset() {
741 // health_status_t doesn't really have a NONE value and we're not
742 // okay with setting something else (say, HEALTH_ERR). so just
743 // leave it be.
744 summary.clear();
745 }
746 } health_status_cache;
747
748 Context *health_tick_event = nullptr;
749 Context *health_interval_event = nullptr;
750
751 void health_tick_start();
752 void health_tick_stop();
9f95a23c 753 ceph::real_clock::time_point health_interval_calc_next_update();
7c673cae
FG
754 void health_interval_start();
755 void health_interval_stop();
756 void health_events_cleanup();
757
758 void health_to_clog_update_conf(const std::set<std::string> &changed);
759
760 void do_health_to_clog_interval();
761 void do_health_to_clog(bool force = false);
762
224ce89b
WB
763 void log_health(
764 const health_check_map_t& updated,
765 const health_check_map_t& previous,
766 MonitorDBStore::TransactionRef t);
767
181888fb
FG
768protected:
769
770 class HealthCheckLogStatus {
771 public:
772 health_status_t severity;
773 std::string last_message;
774 utime_t updated_at = 0;
775 HealthCheckLogStatus(health_status_t severity_,
776 const std::string &last_message_,
777 utime_t updated_at_)
778 : severity(severity_),
779 last_message(last_message_),
780 updated_at(updated_at_)
781 {}
782 };
783 std::map<std::string, HealthCheckLogStatus> health_check_log_times;
784
785public:
786
f67539c2
TL
787 void get_cluster_status(std::stringstream &ss, ceph::Formatter *f,
788 MonSession *session);
7c673cae 789
f67539c2
TL
790 void reply_command(MonOpRequestRef op, int rc, const std::string &rs, version_t version);
791 void reply_command(MonOpRequestRef op, int rc, const std::string &rs, ceph::buffer::list& rdata, version_t version);
7c673cae 792
f67539c2 793 void reply_tell_command(MonOpRequestRef op, int rc, const std::string &rs);
9f95a23c
TL
794
795
7c673cae
FG
796
797 void handle_probe(MonOpRequestRef op);
798 /**
799 * Handle a Probe Operation, replying with our name, quorum and known versions.
800 *
801 * We use the MMonProbe message class for anything and everything related with
802 * Monitor probing. One of the operations relates directly with the probing
803 * itself, in which we receive a probe request and to which we reply with
804 * our name, our quorum and the known versions for each Paxos service. Thus the
805 * redundant function name. This reply will obviously be sent to the one
806 * probing/requesting these infos.
807 *
808 * @todo Add @pre and @post
809 *
810 * @param m A Probe message, with an operation of type Probe.
811 */
812 void handle_probe_probe(MonOpRequestRef op);
813 void handle_probe_reply(MonOpRequestRef op);
814
815 // request routing
816 struct RoutedRequest {
817 uint64_t tid;
f67539c2 818 ceph::buffer::list request_bl;
7c673cae
FG
819 MonSession *session;
820 ConnectionRef con;
821 uint64_t con_features;
7c673cae
FG
822 MonOpRequestRef op;
823
824 RoutedRequest() : tid(0), session(NULL), con_features(0) {}
825 ~RoutedRequest() {
826 if (session)
827 session->put();
828 }
829 };
830 uint64_t routed_request_tid;
f67539c2
TL
831 std::map<uint64_t, RoutedRequest*> routed_requests;
832
7c673cae
FG
833 void forward_request_leader(MonOpRequestRef op);
834 void handle_forward(MonOpRequestRef op);
7c673cae
FG
835 void send_reply(MonOpRequestRef op, Message *reply);
836 void no_reply(MonOpRequestRef op);
837 void resend_routed_requests();
838 void remove_session(MonSession *s);
839 void remove_all_sessions();
840 void waitlist_or_zap_client(MonOpRequestRef op);
841
11fdf7f2 842 void send_mon_message(Message *m, int rank);
f67539c2 843 void notify_new_monmap();
7c673cae
FG
844
845public:
846 struct C_Command : public C_MonOp {
f67539c2 847 Monitor &mon;
7c673cae 848 int rc;
f67539c2
TL
849 std::string rs;
850 ceph::buffer::list rdata;
7c673cae 851 version_t version;
f67539c2 852 C_Command(Monitor &_mm, MonOpRequestRef _op, int r, std::string s, version_t v) :
7c673cae 853 C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){}
f67539c2 854 C_Command(Monitor &_mm, MonOpRequestRef _op, int r, std::string s, ceph::buffer::list rd, version_t v) :
7c673cae
FG
855 C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){}
856
857 void _finish(int r) override {
9f95a23c 858 auto m = op->get_req<MMonCommand>();
7c673cae 859 if (r >= 0) {
f67539c2 860 std::ostringstream ss;
7c673cae
FG
861 if (!op->get_req()->get_connection()) {
862 ss << "connection dropped for command ";
863 } else {
864 MonSession *s = op->get_session();
865
866 // if client drops we may not have a session to draw information from.
867 if (s) {
11fdf7f2 868 ss << "from='" << s->name << " " << s->addrs << "' "
7c673cae
FG
869 << "entity='" << s->entity_name << "' ";
870 } else {
871 ss << "session dropped for command ";
872 }
873 }
adb31ebb 874 cmdmap_t cmdmap;
f67539c2 875 std::ostringstream ds;
adb31ebb
TL
876 string prefix;
877 cmdmap_from_json(m->cmd, &cmdmap, ds);
878 cmd_getval(cmdmap, "prefix", prefix);
879 if (prefix != "config set" && prefix != "config-key set")
880 ss << "cmd='" << m->cmd << "': finished";
7c673cae 881
f67539c2
TL
882 mon.audit_clog->info() << ss.str();
883 mon.reply_command(op, rc, rs, rdata, version);
7c673cae
FG
884 }
885 else if (r == -ECANCELED)
886 return;
887 else if (r == -EAGAIN)
f67539c2 888 mon.dispatch_op(op);
7c673cae 889 else
11fdf7f2 890 ceph_abort_msg("bad C_Command return value");
7c673cae
FG
891 }
892 };
893
894 private:
895 class C_RetryMessage : public C_MonOp {
896 Monitor *mon;
897 public:
898 C_RetryMessage(Monitor *m, MonOpRequestRef op) :
899 C_MonOp(op), mon(m) { }
900
901 void _finish(int r) override {
902 if (r == -EAGAIN || r >= 0)
903 mon->dispatch_op(op);
904 else if (r == -ECANCELED)
905 return;
906 else
11fdf7f2 907 ceph_abort_msg("bad C_RetryMessage return value");
7c673cae
FG
908 }
909 };
910
911 //ms_dispatch handles a lot of logic and we want to reuse it
912 //on forwarded messages, so we create a non-locking version for this class
913 void _ms_dispatch(Message *m);
914 bool ms_dispatch(Message *m) override {
9f95a23c 915 std::lock_guard l{lock};
7c673cae 916 _ms_dispatch(m);
7c673cae
FG
917 return true;
918 }
919 void dispatch_op(MonOpRequestRef op);
920 //mon_caps is used for un-connected messages from monitors
11fdf7f2 921 MonCap mon_caps;
9f95a23c 922 bool get_authorizer(int dest_type, AuthAuthorizer **authorizer);
11fdf7f2
TL
923public: // for AuthMonitor msgr1:
924 int ms_handle_authentication(Connection *con) override;
925private:
926 void ms_handle_accept(Connection *con) override;
7c673cae
FG
927 bool ms_handle_reset(Connection *con) override;
928 void ms_handle_remote_reset(Connection *con) override {}
929 bool ms_handle_refused(Connection *con) override;
930
11fdf7f2
TL
931 // AuthClient
932 int get_auth_request(
933 Connection *con,
934 AuthConnectionMeta *auth_meta,
935 uint32_t *method,
f67539c2
TL
936 std::vector<uint32_t> *preferred_modes,
937 ceph::buffer::list *out) override;
11fdf7f2
TL
938 int handle_auth_reply_more(
939 Connection *con,
940 AuthConnectionMeta *auth_meta,
f67539c2
TL
941 const ceph::buffer::list& bl,
942 ceph::buffer::list *reply) override;
11fdf7f2
TL
943 int handle_auth_done(
944 Connection *con,
945 AuthConnectionMeta *auth_meta,
946 uint64_t global_id,
947 uint32_t con_mode,
f67539c2 948 const ceph::buffer::list& bl,
11fdf7f2
TL
949 CryptoKey *session_key,
950 std::string *connection_secret) override;
951 int handle_auth_bad_method(
952 Connection *con,
953 AuthConnectionMeta *auth_meta,
954 uint32_t old_auth_method,
955 int result,
956 const std::vector<uint32_t>& allowed_methods,
957 const std::vector<uint32_t>& allowed_modes) override;
958 // /AuthClient
959 // AuthServer
960 int handle_auth_request(
961 Connection *con,
962 AuthConnectionMeta *auth_meta,
963 bool more,
964 uint32_t auth_method,
f67539c2
TL
965 const ceph::buffer::list& bl,
966 ceph::buffer::list *reply) override;
11fdf7f2
TL
967 // /AuthServer
968
f67539c2 969 int write_default_keyring(ceph::buffer::list& bl);
7c673cae
FG
970 void extract_save_mon_key(KeyRing& keyring);
971
224ce89b 972 void collect_metadata(Metadata *m);
224ce89b 973 int load_metadata();
f67539c2
TL
974 void count_metadata(const std::string& field, ceph::Formatter *f);
975 void count_metadata(const std::string& field, std::map<std::string,int> *out);
// get_all_versions() gathers version information from daemons for health check.
// Both functions take `versions` by non-const reference; presumably it is
// filled in by the callee (confirm against the definitions in Monitor.cc).
// The key type is spelled std::string explicitly so this header does not
// depend on a `using` declaration for `string` being in scope.
void get_all_versions(std::map<std::string, std::list<std::string>> &versions);
void get_versions(std::map<std::string, std::list<std::string>> &versions);
7c673cae
FG
979
980 // features
981 static CompatSet get_initial_supported_features();
982 static CompatSet get_supported_features();
983 static CompatSet get_legacy_features();
984 /// read the ondisk features into the CompatSet pointed to by read_features
985 static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
986 void read_features();
987 void write_features(MonitorDBStore::TransactionRef t);
988
989 OpTracker op_tracker;
990
991 public:
f67539c2 992 Monitor(CephContext *cct_, std::string nm, MonitorDBStore *s,
7c673cae
FG
993 Messenger *m, Messenger *mgr_m, MonMap *map);
994 ~Monitor() override;
995
996 static int check_features(MonitorDBStore *store);
997
998 // config observer
999 const char** get_tracked_conf_keys() const override;
11fdf7f2 1000 void handle_conf_change(const ConfigProxy& conf,
7c673cae
FG
1001 const std::set<std::string> &changed) override;
1002
1003 void update_log_clients();
1004 int sanitize_options();
1005 int preinit();
1006 int init();
1007 void init_paxos();
1008 void refresh_from_paxos(bool *need_bootstrap);
1009 void shutdown();
1010 void tick();
1011
1012 void handle_signal(int sig);
1013
f67539c2 1014 int mkfs(ceph::buffer::list& osdmapbl);
7c673cae
FG
1015
1016 /**
1017 * check cluster_fsid file
1018 *
1019 * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
1020 */
1021 int check_fsid();
1022
1023 /**
1024 * write cluster_fsid file
1025 *
1026 * @return 0 on success, or negative error code
1027 */
1028 int write_fsid();
1029 int write_fsid(MonitorDBStore::TransactionRef t);
1030
9f95a23c 1031 int do_admin_command(std::string_view command, const cmdmap_t& cmdmap,
f67539c2 1032 ceph::Formatter *f,
9f95a23c
TL
1033 std::ostream& err,
1034 std::ostream& out);
7c673cae
FG
1035
1036private:
1037 // don't allow copying
1038 Monitor(const Monitor& rhs);
1039 Monitor& operator=(const Monitor &rhs);
1040
1041public:
c07f9fc5 1042 static void format_command_descriptions(const std::vector<MonCommand> &commands,
f67539c2 1043 ceph::Formatter *f,
11fdf7f2 1044 uint64_t features,
f67539c2 1045 ceph::buffer::list *rdata);
d2e6a577
FG
1046
1047 const std::vector<MonCommand> &get_local_commands(mon_feature_t f) {
11fdf7f2 1048 if (f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
d2e6a577 1049 return local_mon_commands;
11fdf7f2
TL
1050 } else {
1051 return prenautilus_local_mon_commands;
1052 }
d2e6a577 1053 }
f67539c2 1054 const ceph::buffer::list& get_local_commands_bl(mon_feature_t f) {
11fdf7f2 1055 if (f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
d2e6a577 1056 return local_mon_commands_bl;
11fdf7f2
TL
1057 } else {
1058 return prenautilus_local_mon_commands_bl;
1059 }
d2e6a577
FG
1060 }
1061 void set_leader_commands(const std::vector<MonCommand>& cmds) {
1062 leader_mon_commands = cmds;
1063 }
1064
11fdf7f2 1065 bool is_keyring_required();
7c673cae
FG
1066};
1067
// Monitor "incompat" features. Each macro expands to a CompatSet::Feature
// built from a numeric id (1..13 below) and a human-readable description.
#define CEPH_MON_FEATURE_INCOMPAT_BASE \
  CompatSet::Feature(1, "initial feature set (~v.18)")
#define CEPH_MON_FEATURE_INCOMPAT_GV \
  CompatSet::Feature(2, "global version sequencing (v0.52)")
#define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS \
  CompatSet::Feature(3, "single paxos with k/v store (v0.\?)")
#define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES \
  CompatSet::Feature(4, "support erasure code pools")
#define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC \
  CompatSet::Feature(5, "new-style osdmap encoding")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 \
  CompatSet::Feature(6, "support isa/lrc erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 \
  CompatSet::Feature(7, "support shec erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN \
  CompatSet::Feature(8, "support monmap features")
#define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS \
  CompatSet::Feature(9, "luminous ondisk layout")
#define CEPH_MON_FEATURE_INCOMPAT_MIMIC \
  CompatSet::Feature(10, "mimic ondisk layout")
#define CEPH_MON_FEATURE_INCOMPAT_NAUTILUS \
  CompatSet::Feature(11, "nautilus ondisk layout")
#define CEPH_MON_FEATURE_INCOMPAT_OCTOPUS \
  CompatSet::Feature(12, "octopus ondisk layout")
#define CEPH_MON_FEATURE_INCOMPAT_PACIFIC \
  CompatSet::Feature(13, "pacific ondisk layout")
// make sure you add your feature to Monitor::get_supported_features
1082
7c673cae 1083
9f95a23c
TL
1084/* Callers use:
1085 *
1086 * new C_MonContext{...}
1087 *
1088 * instead of
1089 *
1090 * new C_MonContext(...)
1091 *
1092 * because of gcc bug [1].
1093 *
1094 * [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85883
1095 */
1096template<typename T>
1097class C_MonContext : public LambdaContext<T> {
1098public:
1099 C_MonContext(const Monitor* m, T&& f) :
1100 LambdaContext<T>(std::forward<T>(f)),
1101 mon(m)
1102 {}
1103 void finish(int r) override {
1104 if (mon->is_shutdown())
1105 return;
1106 LambdaContext<T>::finish(r);
1107 }
1108private:
1109 const Monitor* mon;
1110};
7c673cae
FG
1111
1112#endif