]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/Monitor.h
import 15.2.9
[ceph.git] / ceph / src / mon / Monitor.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 /*
16 * This is the top level monitor. It runs on each machine in the Monitor
17 * Cluster. The election of a leader for the paxos algorithm only happens
18 * once per machine via the elector. There is a separate paxos instance (state)
19 * kept for each of the system components: Object Store Device (OSD) Monitor,
20 * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
21 */
22
23 #ifndef CEPH_MONITOR_H
24 #define CEPH_MONITOR_H
25
26 #include <errno.h>
27 #include <cmath>
28 #include <string>
29
30 #include "include/types.h"
31 #include "include/health.h"
32 #include "msg/Messenger.h"
33
34 #include "common/Timer.h"
35
36 #include "health_check.h"
37 #include "MonMap.h"
38 #include "Elector.h"
39 #include "Paxos.h"
40 #include "Session.h"
41 #include "MonCommand.h"
42
43
44 #include "common/config_obs.h"
45 #include "common/LogClient.h"
46 #include "auth/AuthClient.h"
47 #include "auth/AuthServer.h"
48 #include "auth/cephx/CephxKeyServer.h"
49 #include "auth/AuthMethodList.h"
50 #include "auth/KeyRing.h"
51 #include "include/common_fwd.h"
52 #include "messages/MMonCommand.h"
53 #include "mon/MonitorDBStore.h"
54 #include "mgr/MgrClient.h"
55
56 #include "mon/MonOpRequest.h"
57 #include "common/WorkQueue.h"
58
59 using namespace TOPNSPC::common;
60
61 #define CEPH_MON_PROTOCOL 13 /* cluster internal */
62
63
64 enum {
65 l_cluster_first = 555000,
66 l_cluster_num_mon,
67 l_cluster_num_mon_quorum,
68 l_cluster_num_osd,
69 l_cluster_num_osd_up,
70 l_cluster_num_osd_in,
71 l_cluster_osd_epoch,
72 l_cluster_osd_bytes,
73 l_cluster_osd_bytes_used,
74 l_cluster_osd_bytes_avail,
75 l_cluster_num_pool,
76 l_cluster_num_pg,
77 l_cluster_num_pg_active_clean,
78 l_cluster_num_pg_active,
79 l_cluster_num_pg_peering,
80 l_cluster_num_object,
81 l_cluster_num_object_degraded,
82 l_cluster_num_object_misplaced,
83 l_cluster_num_object_unfound,
84 l_cluster_num_bytes,
85 l_cluster_last,
86 };
87
88 enum {
89 l_mon_first = 456000,
90 l_mon_num_sessions,
91 l_mon_session_add,
92 l_mon_session_rm,
93 l_mon_session_trim,
94 l_mon_num_elections,
95 l_mon_election_call,
96 l_mon_election_win,
97 l_mon_election_lose,
98 l_mon_last,
99 };
100
101 class QuorumService;
102 class PaxosService;
103
104 class AdminSocketHook;
105
106 #define COMPAT_SET_LOC "feature_set"
107
108 class Monitor : public Dispatcher,
109 public AuthClient,
110 public AuthServer,
111 public md_config_obs_t {
112 public:
113 int orig_argc = 0;
114 const char **orig_argv = nullptr;
115
116 // me
117 string name;
118 int rank;
119 Messenger *messenger;
120 ConnectionRef con_self;
121 ceph::mutex lock = ceph::make_mutex("Monitor::lock");
122 SafeTimer timer;
123 Finisher finisher;
124 ThreadPool cpu_tp; ///< threadpool for CPU intensive work
125
126 ceph::mutex auth_lock = ceph::make_mutex("Monitor::auth_lock");
127
128 /// true if we have ever joined a quorum. if false, we are either a
129 /// new cluster, a newly joining monitor, or a just-upgraded
130 /// monitor.
131 bool has_ever_joined;
132
133 PerfCounters *logger, *cluster_logger;
134 bool cluster_logger_registered;
135
136 void register_cluster_logger();
137 void unregister_cluster_logger();
138
139 MonMap *monmap;
140 uuid_d fingerprint;
141
142 set<entity_addrvec_t> extra_probe_peers;
143
144 LogClient log_client;
145 LogChannelRef clog;
146 LogChannelRef audit_clog;
147 KeyRing keyring;
148 KeyServer key_server;
149
150 AuthMethodList auth_cluster_required;
151 AuthMethodList auth_service_required;
152
153 CompatSet features;
154
155 vector<MonCommand> leader_mon_commands; // quorum leader's commands
156 vector<MonCommand> local_mon_commands; // commands i support
157 bufferlist local_mon_commands_bl; // encoded version of above
158
159 vector<MonCommand> prenautilus_local_mon_commands;
160 bufferlist prenautilus_local_mon_commands_bl;
161
162 Messenger *mgr_messenger;
163 MgrClient mgr_client;
164 uint64_t mgr_proxy_bytes = 0; // in-flight proxied mgr command message bytes
165 std::string gss_ktfile_client{};
166
167 private:
168 void new_tick();
169
170 // -- local storage --
171 public:
172 MonitorDBStore *store;
173 static const string MONITOR_NAME;
174 static const string MONITOR_STORE_PREFIX;
175
176 // -- monitor state --
177 private:
178 enum {
179 STATE_INIT = 1,
180 STATE_PROBING,
181 STATE_SYNCHRONIZING,
182 STATE_ELECTING,
183 STATE_LEADER,
184 STATE_PEON,
185 STATE_SHUTDOWN
186 };
187 int state = STATE_INIT;
188
189 public:
190 static const char *get_state_name(int s) {
191 switch (s) {
192 case STATE_PROBING: return "probing";
193 case STATE_SYNCHRONIZING: return "synchronizing";
194 case STATE_ELECTING: return "electing";
195 case STATE_LEADER: return "leader";
196 case STATE_PEON: return "peon";
197 case STATE_SHUTDOWN: return "shutdown";
198 default: return "???";
199 }
200 }
201 const char *get_state_name() const {
202 return get_state_name(state);
203 }
204
205 bool is_init() const { return state == STATE_INIT; }
206 bool is_shutdown() const { return state == STATE_SHUTDOWN; }
207 bool is_probing() const { return state == STATE_PROBING; }
208 bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
209 bool is_electing() const { return state == STATE_ELECTING; }
210 bool is_leader() const { return state == STATE_LEADER; }
211 bool is_peon() const { return state == STATE_PEON; }
212
213 const utime_t &get_leader_since() const;
214
215 void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
216
217 std::vector<DaemonHealthMetric> get_health_metrics();
218
219 // -- elector --
220 private:
221 Paxos *paxos;
222 Elector elector;
223 friend class Elector;
224
225 /// features we require of peers (based on on-disk compatset)
226 uint64_t required_features;
227
228 int leader; // current leader (to best of knowledge)
229 set<int> quorum; // current active set of monitors (if !starting)
230 mono_clock::time_point quorum_since; // when quorum formed
231 utime_t leader_since; // when this monitor became the leader, if it is the leader
232 utime_t exited_quorum; // time detected as not in quorum; 0 if in
233
234 // map of counts of connected clients, by type and features, for
235 // each quorum mon
236 map<int,FeatureMap> quorum_feature_map;
237
238 /**
239 * Intersection of quorum member's connection feature bits.
240 */
241 uint64_t quorum_con_features;
242 /**
243 * Intersection of quorum members mon-specific feature bits
244 */
245 mon_feature_t quorum_mon_features;
246
247 ceph_release_t quorum_min_mon_release{ceph_release_t::unknown};
248
249 set<string> outside_quorum;
250
251 /**
252 * @defgroup Monitor_h_scrub
253 * @{
254 */
255 version_t scrub_version; ///< paxos version we are scrubbing
256 map<int,ScrubResult> scrub_result; ///< results so far
257
258 /**
259 * trigger a cross-mon scrub
260 *
261 * Verify all mons are storing identical content
262 */
263 int scrub_start();
264 int scrub();
265 void handle_scrub(MonOpRequestRef op);
266 bool _scrub(ScrubResult *r,
267 pair<string,string> *start,
268 int *num_keys);
269 void scrub_check_results();
270 void scrub_timeout();
271 void scrub_finish();
272 void scrub_reset();
273 void scrub_update_interval(int secs);
274
275 Context *scrub_event; ///< periodic event to trigger scrub (leader)
276 Context *scrub_timeout_event; ///< scrub round timeout (leader)
277 void scrub_event_start();
278 void scrub_event_cancel();
279 void scrub_reset_timeout();
280 void scrub_cancel_timeout();
281
/// Progress marker for a scrub: the last key handled and a completion flag.
struct ScrubState {
  std::pair<std::string, std::string> last_key; ///< last scrubbed key
  bool finished = false;                        ///< starts out false

  ScrubState() = default;
  virtual ~ScrubState() = default;
};
289 std::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
290
291 /**
292 * @defgroup Monitor_h_sync Synchronization
293 * @{
294 */
295 /**
296 * @} // provider state
297 */
298 struct SyncProvider {
299 entity_addrvec_t addrs;
300 uint64_t cookie; ///< unique cookie for this sync attempt
301 utime_t timeout; ///< when we give up and expire this attempt
302 version_t last_committed; ///< last paxos version on peer
303 pair<string,string> last_key; ///< last key sent to (or on) peer
304 bool full; ///< full scan?
305 MonitorDBStore::Synchronizer synchronizer; ///< iterator
306
307 SyncProvider() : cookie(0), last_committed(0), full(false) {}
308
309 void reset_timeout(CephContext *cct, int grace) {
310 timeout = ceph_clock_now();
311 timeout += grace;
312 }
313 };
314
315 map<uint64_t, SyncProvider> sync_providers; ///< cookie -> SyncProvider for those syncing from us
316 uint64_t sync_provider_count; ///< counter for issued cookies to keep them unique
317
318 /**
319 * @} // requester state
320 */
321 entity_addrvec_t sync_provider; ///< who we are syncing from
322 uint64_t sync_cookie; ///< 0 if we are starting, non-zero otherwise
323 bool sync_full; ///< true if we are a full sync, false for recent catch-up
324 version_t sync_start_version; ///< last_committed at sync start
325 Context *sync_timeout_event; ///< timeout event
326
327 /**
328 * floor for sync source
329 *
330 * When we sync we forget about our old last_committed value which
331 * can be dangerous. For example, if we have a cluster of:
332 *
333 * mon.a: lc 100
334 * mon.b: lc 80
335 * mon.c: lc 100 (us)
336 *
337 * If something forces us to sync (say, corruption, or manual
338 * intervention, or bug), we forget last_committed, and might abort.
339 * If mon.a happens to be down when we come back, we will see:
340 *
341 * mon.b: lc 80
342 * mon.c: lc 0 (us)
343 *
344 * and sync from mon.b, at which point a+b will both have lc 80 and
345 * come online with a majority holding out of date commits.
346 *
347 * Avoid this by preserving our old last_committed value prior to
348 * sync and never going backwards.
349 */
350 version_t sync_last_committed_floor;
351
352 /**
353 * Obtain the synchronization target prefixes in set form.
354 *
355 * We consider a target prefix all those that are relevant when
356 * synchronizing two stores. That is, all those that hold paxos service's
357 * versions, as well as paxos versions, or any control keys such as the
358 * first or last committed version.
359 *
360 * Given the current design, this function should return the name of all and
361 * any available paxos service, plus the paxos name.
362 *
363 * @returns a set of strings referring to the prefixes being synchronized
364 */
365 set<string> get_sync_targets_names();
366
367 /**
368 * Reset the monitor's sync-related data structures for syncing *from* a peer
369 */
370 void sync_reset_requester();
371
372 /**
373 * Reset sync state related to allowing others to sync from us
374 */
375 void sync_reset_provider();
376
377 /**
378 * Called when a sync attempt times out (requester-side)
379 */
380 void sync_timeout();
381
382 /**
383 * Get the latest monmap for backup purposes during sync
384 */
385 void sync_obtain_latest_monmap(bufferlist &bl);
386
387 /**
388 * Start sync process
389 *
390 * Start pulling committed state from another monitor.
391 *
392 * @param entity where to pull committed state from
393 * @param full whether to do a full sync or just catch up on recent paxos
394 */
395 void sync_start(entity_addrvec_t &addrs, bool full);
396
397 public:
398 /**
399 * force a sync on next mon restart
400 */
401 void sync_force(Formatter *f);
402
403 private:
404 /**
405 * store critical state for safekeeping during sync
406 *
407 * We store a few things on the side that we don't want to get clobbered by sync. This
408 * includes the latest monmap and a lower bound on last_committed.
409 */
410 void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
411
412 /**
413 * reset the sync timeout
414 *
415 * This is used on the client to restart if things aren't progressing
416 */
417 void sync_reset_timeout();
418
419 /**
420 * trim stale sync provider state
421 *
422 * If someone is syncing from us and hasn't talked to us recently, expire their state.
423 */
424 void sync_trim_providers();
425
426 /**
427 * Complete a sync
428 *
429 * Finish up a sync after we've gotten all of the chunks.
430 *
431 * @param last_committed final last_committed value from provider
432 */
433 void sync_finish(version_t last_committed);
434
435 /**
436 * request the next chunk from the provider
437 */
438 void sync_get_next_chunk();
439
440 /**
441 * handle sync message
442 *
443 * @param m Sync message with operation type MMonSync::OP_START_CHUNKS
444 */
445 void handle_sync(MonOpRequestRef op);
446
447 void _sync_reply_no_cookie(MonOpRequestRef op);
448
449 void handle_sync_get_cookie(MonOpRequestRef op);
450 void handle_sync_get_chunk(MonOpRequestRef op);
451 void handle_sync_finish(MonOpRequestRef op);
452
453 void handle_sync_cookie(MonOpRequestRef op);
454 void handle_sync_forward(MonOpRequestRef op);
455 void handle_sync_chunk(MonOpRequestRef op);
456 void handle_sync_no_cookie(MonOpRequestRef op);
457
458 /**
459 * @} // Synchronization
460 */
461
462 list<Context*> waitfor_quorum;
463 list<Context*> maybe_wait_for_quorum;
464
465 /**
466 * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
467 * @{
468 *
469 * We use time checks to keep track of any clock drifting going on in the
470 * cluster. This is accomplished by periodically pinging each monitor in the
471 * quorum and registering its response time in a map, assessing how much its
472 * clock has drifted. We also take this opportunity to assess the latency
473 * on response.
474 *
475 * This mechanism works as follows:
476 *
477 * - Leader sends out a 'PING' message to each other monitor in the quorum.
478 * The message is timestamped with the leader's current time. The leader's
479 * current time is recorded in a map, associated with each peon's
480 * instance.
481 * - The peon replies to the leader with a timestamped 'PONG' message.
482 * - The leader calculates a delta between the peon's timestamp and its
483 * current time and stashes it.
484 * - The leader also calculates the time it took to receive the 'PONG'
485 * since the 'PING' was sent, and stashes an approximate latency estimate.
486 * - Once all the quorum members have pong'ed, the leader will share the
487 * clock skew and latency maps with all the monitors in the quorum.
488 */
489 map<int, utime_t> timecheck_waiting;
490 map<int, double> timecheck_skews;
491 map<int, double> timecheck_latencies;
492 // odd value means we are mid-round; even value means the round has
493 // finished.
494 version_t timecheck_round;
495 unsigned int timecheck_acks;
496 utime_t timecheck_round_start;
497 friend class HealthMonitor;
498 /* When we hit a skew we will start a new round based off of
499 * 'mon_timecheck_skew_interval'. Each new round will be backed off
500 * until we hit 'mon_timecheck_interval' -- which is the typical
501 * interval when not in the presence of a skew.
502 *
503 * This variable tracks the number of rounds with skews since last clean
504 * so that we can report to the user and properly adjust the backoff.
505 */
506 uint64_t timecheck_rounds_since_clean;
507 /**
508 * Time Check event.
509 */
510 Context *timecheck_event;
511
512 void timecheck_start();
513 void timecheck_finish();
514 void timecheck_start_round();
515 void timecheck_finish_round(bool success = true);
516 void timecheck_cancel_round();
517 void timecheck_cleanup();
518 void timecheck_reset_event();
519 void timecheck_check_skews();
520 void timecheck_report();
521 void timecheck();
522 health_status_t timecheck_status(ostringstream &ss,
523 const double skew_bound,
524 const double latency);
525 void handle_timecheck_leader(MonOpRequestRef op);
526 void handle_timecheck_peon(MonOpRequestRef op);
527 void handle_timecheck(MonOpRequestRef op);
528
529 /**
530 * Returns 'true' if this is considered to be a skew; 'false' otherwise.
531 */
532 bool timecheck_has_skew(const double skew_bound, double *abs) const {
533 double abs_skew = std::fabs(skew_bound);
534 if (abs)
535 *abs = abs_skew;
536 return (abs_skew > g_conf()->mon_clock_drift_allowed);
537 }
538
539 /**
540 * @}
541 */
542 /**
543 * Handle ping messages from others.
544 */
545 void handle_ping(MonOpRequestRef op);
546
547 Context *probe_timeout_event = nullptr; // for probing
548
549 void reset_probe_timeout();
550 void cancel_probe_timeout();
551 void probe_timeout(int r);
552
553 void _apply_compatset_features(CompatSet &new_features);
554
555 public:
556 epoch_t get_epoch();
557 int get_leader() const { return leader; }
558 string get_leader_name() {
559 return quorum.empty() ? string() : monmap->get_name(*quorum.begin());
560 }
561 const set<int>& get_quorum() const { return quorum; }
562 list<string> get_quorum_names() {
563 list<string> q;
564 for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
565 q.push_back(monmap->get_name(*p));
566 return q;
567 }
568 uint64_t get_quorum_con_features() const {
569 return quorum_con_features;
570 }
571 mon_feature_t get_quorum_mon_features() const {
572 return quorum_mon_features;
573 }
574 uint64_t get_required_features() const {
575 return required_features;
576 }
577 mon_feature_t get_required_mon_features() const {
578 return monmap->get_required_features();
579 }
580 void apply_quorum_to_compatset_features();
581 void apply_monmap_to_compatset_features();
582 void calc_quorum_requirements();
583
584 void get_combined_feature_map(FeatureMap *fm);
585
586 private:
587 void _reset(); ///< called from bootstrap, start_, or join_election
588 void wait_for_paxos_write();
589 void _finish_svc_election(); ///< called by {win,lose}_election
590 void respawn();
591 public:
592 void bootstrap();
593 void join_election();
594 void start_election();
595 void win_standalone_election();
596 // end election (called by Elector)
597 void win_election(epoch_t epoch, const set<int>& q,
598 uint64_t features,
599 const mon_feature_t& mon_features,
600 ceph_release_t min_mon_release,
601 const map<int,Metadata>& metadata);
602 void lose_election(epoch_t epoch, set<int>& q, int l,
603 uint64_t features,
604 const mon_feature_t& mon_features,
605 ceph_release_t min_mon_release);
606 // end election (called by Elector)
607 void finish_election();
608
609 void update_logger();
610
611 /**
612 * Vector holding the Services serviced by this Monitor.
613 */
614 vector<std::unique_ptr<PaxosService>> paxos_service;
615
616 class MDSMonitor *mdsmon() {
617 return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP].get();
618 }
619
620 class MonmapMonitor *monmon() {
621 return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP].get();
622 }
623
624 class OSDMonitor *osdmon() {
625 return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP].get();
626 }
627
628 class AuthMonitor *authmon() {
629 return (class AuthMonitor *)paxos_service[PAXOS_AUTH].get();
630 }
631
632 class LogMonitor *logmon() {
633 return (class LogMonitor*) paxos_service[PAXOS_LOG].get();
634 }
635
636 class MgrMonitor *mgrmon() {
637 return (class MgrMonitor*) paxos_service[PAXOS_MGR].get();
638 }
639
640 class MgrStatMonitor *mgrstatmon() {
641 return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT].get();
642 }
643
644 class HealthMonitor *healthmon() {
645 return (class HealthMonitor*) paxos_service[PAXOS_HEALTH].get();
646 }
647
648 class ConfigMonitor *configmon() {
649 return (class ConfigMonitor*) paxos_service[PAXOS_CONFIG].get();
650 }
651
652 friend class Paxos;
653 friend class OSDMonitor;
654 friend class MDSMonitor;
655 friend class MonmapMonitor;
656 friend class LogMonitor;
657 friend class ConfigKeyService;
658
659 QuorumService *config_key_service;
660
661 // -- sessions --
662 MonSessionMap session_map;
663 ceph::mutex session_map_lock = ceph::make_mutex("Monitor::session_map_lock");
664 AdminSocketHook *admin_hook;
665
666 template<typename Func, typename...Args>
667 void with_session_map(Func&& func) {
668 std::lock_guard l(session_map_lock);
669 std::forward<Func>(func)(session_map);
670 }
671 void send_latest_monmap(Connection *con);
672
673 // messages
674 void handle_get_version(MonOpRequestRef op);
675 void handle_subscribe(MonOpRequestRef op);
676 void handle_mon_get_map(MonOpRequestRef op);
677
678 static void _generate_command_map(cmdmap_t& cmdmap,
679 map<string,string> &param_str_map);
680 static const MonCommand *_get_moncommand(
681 const string &cmd_prefix,
682 const vector<MonCommand>& cmds);
683 bool _allowed_command(MonSession *s, const string& module,
684 const string& prefix,
685 const cmdmap_t& cmdmap,
686 const map<string,string>& param_str_map,
687 const MonCommand *this_cmd);
688 void get_mon_status(Formatter *f);
689 void _quorum_status(Formatter *f, ostream& ss);
690 bool _add_bootstrap_peer_hint(std::string_view cmd, const cmdmap_t& cmdmap,
691 std::ostream& ss);
692 void handle_tell_command(MonOpRequestRef op);
693 void handle_command(MonOpRequestRef op);
694 void handle_route(MonOpRequestRef op);
695
696 void handle_mon_metadata(MonOpRequestRef op);
697 int get_mon_metadata(int mon, Formatter *f, ostream& err);
698 int print_nodes(Formatter *f, ostream& err);
699
700 // Accumulate metadata across calls to update_mon_metadata
701 map<int, Metadata> mon_metadata;
702 map<int, Metadata> pending_metadata;
703
704 /**
705 * Cached health state: an overall status plus a summary string.
706 */
707 struct health_cache_t {
708 health_status_t overall;
709 string summary;
710
711 void reset() {
712 // health_status_t doesn't really have a NONE value and we're not
713 // okay with setting something else (say, HEALTH_ERR). so just
714 // leave it be.
715 summary.clear();
716 }
717 } health_status_cache;
718
719 Context *health_tick_event = nullptr;
720 Context *health_interval_event = nullptr;
721
722 void health_tick_start();
723 void health_tick_stop();
724 ceph::real_clock::time_point health_interval_calc_next_update();
725 void health_interval_start();
726 void health_interval_stop();
727 void health_events_cleanup();
728
729 void health_to_clog_update_conf(const std::set<std::string> &changed);
730
731 void do_health_to_clog_interval();
732 void do_health_to_clog(bool force = false);
733
734 void log_health(
735 const health_check_map_t& updated,
736 const health_check_map_t& previous,
737 MonitorDBStore::TransactionRef t);
738
739 protected:
740
741 class HealthCheckLogStatus {
742 public:
743 health_status_t severity;
744 std::string last_message;
745 utime_t updated_at = 0;
746 HealthCheckLogStatus(health_status_t severity_,
747 const std::string &last_message_,
748 utime_t updated_at_)
749 : severity(severity_),
750 last_message(last_message_),
751 updated_at(updated_at_)
752 {}
753 };
754 std::map<std::string, HealthCheckLogStatus> health_check_log_times;
755
756 public:
757
758 void get_cluster_status(stringstream &ss, Formatter *f);
759
760 void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
761 void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version);
762
763 void reply_tell_command(MonOpRequestRef op, int rc, const string &rs);
764
765
766
767 void handle_probe(MonOpRequestRef op);
768 /**
769 * Handle a Probe Operation, replying with our name, quorum and known versions.
770 *
771 * We use the MMonProbe message class for anything and everything related with
772 * Monitor probing. One of the operations relates directly with the probing
773 * itself, in which we receive a probe request and to which we reply with
774 * our name, our quorum and the known versions for each Paxos service. Thus the
775 * redundant function name. This reply will obviously be sent to the one
776 * probing/requesting these infos.
777 *
778 * @todo Add @pre and @post
779 *
780 * @param m A Probe message, with an operation of type Probe.
781 */
782 void handle_probe_probe(MonOpRequestRef op);
783 void handle_probe_reply(MonOpRequestRef op);
784
785 // request routing
786 struct RoutedRequest {
787 uint64_t tid;
788 bufferlist request_bl;
789 MonSession *session;
790 ConnectionRef con;
791 uint64_t con_features;
792 MonOpRequestRef op;
793
794 RoutedRequest() : tid(0), session(NULL), con_features(0) {}
795 ~RoutedRequest() {
796 if (session)
797 session->put();
798 }
799 };
800 uint64_t routed_request_tid;
801 map<uint64_t, RoutedRequest*> routed_requests;
802
803 void forward_request_leader(MonOpRequestRef op);
804 void handle_forward(MonOpRequestRef op);
805 void send_reply(MonOpRequestRef op, Message *reply);
806 void no_reply(MonOpRequestRef op);
807 void resend_routed_requests();
808 void remove_session(MonSession *s);
809 void remove_all_sessions();
810 void waitlist_or_zap_client(MonOpRequestRef op);
811
812 void send_mon_message(Message *m, int rank);
813
814 public:
815 struct C_Command : public C_MonOp {
816 Monitor *mon;
817 int rc;
818 string rs;
819 bufferlist rdata;
820 version_t version;
821 C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) :
822 C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){}
823 C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) :
824 C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){}
825
826 void _finish(int r) override {
827 auto m = op->get_req<MMonCommand>();
828 if (r >= 0) {
829 ostringstream ss;
830 if (!op->get_req()->get_connection()) {
831 ss << "connection dropped for command ";
832 } else {
833 MonSession *s = op->get_session();
834
835 // if client drops we may not have a session to draw information from.
836 if (s) {
837 ss << "from='" << s->name << " " << s->addrs << "' "
838 << "entity='" << s->entity_name << "' ";
839 } else {
840 ss << "session dropped for command ";
841 }
842 }
843 cmdmap_t cmdmap;
844 stringstream ds;
845 string prefix;
846 cmdmap_from_json(m->cmd, &cmdmap, ds);
847 cmd_getval(cmdmap, "prefix", prefix);
848 if (prefix != "config set" && prefix != "config-key set")
849 ss << "cmd='" << m->cmd << "': finished";
850
851 mon->audit_clog->info() << ss.str();
852 mon->reply_command(op, rc, rs, rdata, version);
853 }
854 else if (r == -ECANCELED)
855 return;
856 else if (r == -EAGAIN)
857 mon->dispatch_op(op);
858 else
859 ceph_abort_msg("bad C_Command return value");
860 }
861 };
862
863 private:
864 class C_RetryMessage : public C_MonOp {
865 Monitor *mon;
866 public:
867 C_RetryMessage(Monitor *m, MonOpRequestRef op) :
868 C_MonOp(op), mon(m) { }
869
870 void _finish(int r) override {
871 if (r == -EAGAIN || r >= 0)
872 mon->dispatch_op(op);
873 else if (r == -ECANCELED)
874 return;
875 else
876 ceph_abort_msg("bad C_RetryMessage return value");
877 }
878 };
879
880 //ms_dispatch handles a lot of logic and we want to reuse it
881 //on forwarded messages, so we create a non-locking version for this class
882 void _ms_dispatch(Message *m);
883 bool ms_dispatch(Message *m) override {
884 std::lock_guard l{lock};
885 _ms_dispatch(m);
886 return true;
887 }
888 void dispatch_op(MonOpRequestRef op);
889 //mon_caps is used for un-connected messages from monitors
890 MonCap mon_caps;
891 bool get_authorizer(int dest_type, AuthAuthorizer **authorizer);
892 public: // for AuthMonitor msgr1:
893 int ms_handle_authentication(Connection *con) override;
894 private:
895 void ms_handle_accept(Connection *con) override;
896 bool ms_handle_reset(Connection *con) override;
897 void ms_handle_remote_reset(Connection *con) override {}
898 bool ms_handle_refused(Connection *con) override;
899
900 // AuthClient
901 int get_auth_request(
902 Connection *con,
903 AuthConnectionMeta *auth_meta,
904 uint32_t *method,
905 vector<uint32_t> *preferred_modes,
906 bufferlist *out) override;
907 int handle_auth_reply_more(
908 Connection *con,
909 AuthConnectionMeta *auth_meta,
910 const bufferlist& bl,
911 bufferlist *reply) override;
912 int handle_auth_done(
913 Connection *con,
914 AuthConnectionMeta *auth_meta,
915 uint64_t global_id,
916 uint32_t con_mode,
917 const bufferlist& bl,
918 CryptoKey *session_key,
919 std::string *connection_secret) override;
920 int handle_auth_bad_method(
921 Connection *con,
922 AuthConnectionMeta *auth_meta,
923 uint32_t old_auth_method,
924 int result,
925 const std::vector<uint32_t>& allowed_methods,
926 const std::vector<uint32_t>& allowed_modes) override;
927 // /AuthClient
928 // AuthServer
929 int handle_auth_request(
930 Connection *con,
931 AuthConnectionMeta *auth_meta,
932 bool more,
933 uint32_t auth_method,
934 const bufferlist& bl,
935 bufferlist *reply) override;
936 // /AuthServer
937
938 int write_default_keyring(bufferlist& bl);
939 void extract_save_mon_key(KeyRing& keyring);
940
941 void collect_metadata(Metadata *m);
942 void update_mon_metadata(int from, Metadata&& m);
943 int load_metadata();
944 void count_metadata(const string& field, Formatter *f);
945 void count_metadata(const string& field, map<string,int> *out);
946
947 // features
948 static CompatSet get_initial_supported_features();
949 static CompatSet get_supported_features();
950 static CompatSet get_legacy_features();
951 /// read the ondisk features into the CompatSet pointed to by read_features
952 static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
953 void read_features();
954 void write_features(MonitorDBStore::TransactionRef t);
955
956 OpTracker op_tracker;
957
958 public:
959 Monitor(CephContext *cct_, string nm, MonitorDBStore *s,
960 Messenger *m, Messenger *mgr_m, MonMap *map);
961 ~Monitor() override;
962
963 static int check_features(MonitorDBStore *store);
964
965 // config observer
966 const char** get_tracked_conf_keys() const override;
967 void handle_conf_change(const ConfigProxy& conf,
968 const std::set<std::string> &changed) override;
969
970 void update_log_clients();
971 int sanitize_options();
972 int preinit();
973 int init();
974 void init_paxos();
975 void refresh_from_paxos(bool *need_bootstrap);
976 void shutdown();
977 void tick();
978
979 void handle_signal(int sig);
980
981 int mkfs(bufferlist& osdmapbl);
982
983 /**
984 * check cluster_fsid file
985 *
986 * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
987 */
988 int check_fsid();
989
990 /**
991 * write cluster_fsid file
992 *
993 * @return 0 on success, or negative error code
994 */
995 int write_fsid();
996 int write_fsid(MonitorDBStore::TransactionRef t);
997
998 int do_admin_command(std::string_view command, const cmdmap_t& cmdmap,
999 Formatter *f,
1000 std::ostream& err,
1001 std::ostream& out);
1002
1003 private:
1004 // don't allow copying
1005 Monitor(const Monitor& rhs);
1006 Monitor& operator=(const Monitor &rhs);
1007
1008 public:
1009 static void format_command_descriptions(const std::vector<MonCommand> &commands,
1010 Formatter *f,
1011 uint64_t features,
1012 bufferlist *rdata);
1013
1014 const std::vector<MonCommand> &get_local_commands(mon_feature_t f) {
1015 if (f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
1016 return local_mon_commands;
1017 } else {
1018 return prenautilus_local_mon_commands;
1019 }
1020 }
1021 const bufferlist& get_local_commands_bl(mon_feature_t f) {
1022 if (f.contains_all(ceph::features::mon::FEATURE_NAUTILUS)) {
1023 return local_mon_commands_bl;
1024 } else {
1025 return prenautilus_local_mon_commands_bl;
1026 }
1027 }
1028 void set_leader_commands(const std::vector<MonCommand>& cmds) {
1029 leader_mon_commands = cmds;
1030 }
1031
1032 bool is_keyring_required();
1033 };
1034
1035 #define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)")
1036 #define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)")
1037 #define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)")
1038 #define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
1039 #define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
1040 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
1041 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
1042 #define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
1043 #define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS CompatSet::Feature(9, "luminous ondisk layout")
1044 #define CEPH_MON_FEATURE_INCOMPAT_MIMIC CompatSet::Feature(10, "mimic ondisk layout")
1045 #define CEPH_MON_FEATURE_INCOMPAT_NAUTILUS CompatSet::Feature(11, "nautilus ondisk layout")
1046 #define CEPH_MON_FEATURE_INCOMPAT_OCTOPUS CompatSet::Feature(12, "octopus ondisk layout")
1047 // make sure you add your feature to Monitor::get_supported_features
1048
1049
1050 /* Callers use:
1051 *
1052 * new C_MonContext{...}
1053 *
1054 * instead of
1055 *
1056 * new C_MonContext(...)
1057 *
1058 * because of gcc bug [1].
1059 *
1060 * [1] https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85883
1061 */
1062 template<typename T>
1063 class C_MonContext : public LambdaContext<T> {
1064 public:
1065 C_MonContext(const Monitor* m, T&& f) :
1066 LambdaContext<T>(std::forward<T>(f)),
1067 mon(m)
1068 {}
1069 void finish(int r) override {
1070 if (mon->is_shutdown())
1071 return;
1072 LambdaContext<T>::finish(r);
1073 }
1074 private:
1075 const Monitor* mon;
1076 };
1077
1078 #endif