]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/Monitor.h
update sources to 12.2.7
[ceph.git] / ceph / src / mon / Monitor.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 /*
16 * This is the top level monitor. It runs on each machine in the Monitor
17 * Cluster. The election of a leader for the paxos algorithm only happens
18 * once per machine via the elector. There is a separate paxos instance (state)
19 * kept for each of the system components: Object Store Device (OSD) Monitor,
20 * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
21 */
22
23 #ifndef CEPH_MONITOR_H
24 #define CEPH_MONITOR_H
25
26 #include <errno.h>
27 #include <cmath>
28
29 #include "include/types.h"
30 #include "include/health.h"
31 #include "msg/Messenger.h"
32
33 #include "common/Timer.h"
34
35 #include "health_check.h"
36 #include "MonMap.h"
37 #include "Elector.h"
38 #include "Paxos.h"
39 #include "Session.h"
40 #include "PGStatService.h"
41 #include "MonCommand.h"
42
43 #include "common/LogClient.h"
44 #include "auth/cephx/CephxKeyServer.h"
45 #include "auth/AuthMethodList.h"
46 #include "auth/KeyRing.h"
47 #include "messages/MMonCommand.h"
48 #include "mon/MonitorDBStore.h"
49 #include "include/memory.h"
50 #include "mgr/MgrClient.h"
51
52 #include "mon/MonOpRequest.h"
53 #include "common/WorkQueue.h"
54
55
56 #define CEPH_MON_PROTOCOL 13 /* cluster internal */
57
58
// Identifiers for the cluster-wide counter set (the l_cluster_* family);
// presumably these are the indices registered through cluster_logger --
// confirm in Monitor.cc.
//
// l_cluster_first and l_cluster_last are sentinels bracketing the range.
// Everything in between is numbered implicitly, so the order of the entries
// is significant and new entries must stay between the two sentinels.
enum {
  l_cluster_first = 555000,

  // monitors
  l_cluster_num_mon,
  l_cluster_num_mon_quorum,

  // OSDs
  l_cluster_num_osd,
  l_cluster_num_osd_up,
  l_cluster_num_osd_in,
  l_cluster_osd_epoch,
  l_cluster_osd_bytes,
  l_cluster_osd_bytes_used,
  l_cluster_osd_bytes_avail,

  // pools and placement groups
  l_cluster_num_pool,
  l_cluster_num_pg,
  l_cluster_num_pg_active_clean,
  l_cluster_num_pg_active,
  l_cluster_num_pg_peering,

  // objects
  l_cluster_num_object,
  l_cluster_num_object_degraded,
  l_cluster_num_object_misplaced,
  l_cluster_num_object_unfound,
  l_cluster_num_bytes,

  // MDS
  l_cluster_num_mds_up,
  l_cluster_num_mds_in,
  l_cluster_num_mds_failed,
  l_cluster_mds_epoch,

  l_cluster_last,
};
86
// Identifiers for the per-monitor counter set (the l_mon_* family);
// presumably the indices registered through `logger` -- confirm in
// Monitor.cc.
//
// l_mon_first and l_mon_last are sentinels; the entries in between are
// numbered implicitly, so their order must not change.
enum {
  l_mon_first = 456000,

  // sessions
  l_mon_num_sessions,
  l_mon_session_add,
  l_mon_session_rm,
  l_mon_session_trim,

  // elections
  l_mon_num_elections,
  l_mon_election_call,
  l_mon_election_win,
  l_mon_election_lose,

  l_mon_last,
};
99
100 class QuorumService;
101 class PaxosService;
102
103 class PerfCounters;
104 class AdminSocketHook;
105
106 class MMonGetMap;
107 class MMonGetVersion;
108 class MMonMetadata;
109 class MMonSync;
110 class MMonScrub;
111 class MMonProbe;
112 struct MMonSubscribe;
113 struct MRoute;
114 struct MForward;
115 struct MTimeCheck;
116 struct MMonHealth;
117
118 #define COMPAT_SET_LOC "feature_set"
119
120 class C_MonContext final : public FunctionContext {
121 const Monitor *mon;
122 public:
123 explicit C_MonContext(Monitor *m, boost::function<void(int)>&& callback)
124 : FunctionContext(std::move(callback)), mon(m) {}
125 void finish(int r) override;
126 };
127
128 class Monitor : public Dispatcher,
129 public md_config_obs_t {
130 public:
131 // me
132 string name;
133 int rank;
134 Messenger *messenger;
135 ConnectionRef con_self;
136 Mutex lock;
137 SafeTimer timer;
138 Finisher finisher;
139 ThreadPool cpu_tp; ///< threadpool for CPU intensive work
140
141 /// true if we have ever joined a quorum. if false, we are either a
142 /// new cluster, a newly joining monitor, or a just-upgraded
143 /// monitor.
144 bool has_ever_joined;
145
146 PerfCounters *logger, *cluster_logger;
147 bool cluster_logger_registered;
148
149 void register_cluster_logger();
150 void unregister_cluster_logger();
151
152 MonMap *monmap;
153 uuid_d fingerprint;
154
155 set<entity_addr_t> extra_probe_peers;
156
157 LogClient log_client;
158 LogChannelRef clog;
159 LogChannelRef audit_clog;
160 KeyRing keyring;
161 KeyServer key_server;
162
163 AuthMethodList auth_cluster_required;
164 AuthMethodList auth_service_required;
165
166 CompatSet features;
167
168 vector<MonCommand> leader_mon_commands; // quorum leader's commands
169 vector<MonCommand> local_mon_commands; // commands i support
170 bufferlist local_mon_commands_bl; // encoded version of above
171
172 // for upgrading mon cluster that still uses PGMonitor
173 vector<MonCommand> local_upgrading_mon_commands; // mixed mon cluster commands
174 bufferlist local_upgrading_mon_commands_bl; // encoded version of above
175
176 Messenger *mgr_messenger;
177 MgrClient mgr_client;
178 uint64_t mgr_proxy_bytes = 0; // in-flight proxied mgr command message bytes
179
180 const MonPGStatService *pgservice;
181
182 private:
183 void new_tick();
184
185 // -- local storage --
186 public:
187 MonitorDBStore *store;
188 static const string MONITOR_NAME;
189 static const string MONITOR_STORE_PREFIX;
190
191 // -- monitor state --
192 private:
// Values stored in the monitor's `state` member. The numbering starts at 1
// and is written out explicitly so the mapping is visible at a glance.
enum {
  STATE_PROBING       = 1,
  STATE_SYNCHRONIZING = 2,
  STATE_ELECTING      = 3,
  STATE_LEADER        = 4,
  STATE_PEON          = 5,
  STATE_SHUTDOWN      = 6
};
201 int state;
202
203 public:
204 static const char *get_state_name(int s) {
205 switch (s) {
206 case STATE_PROBING: return "probing";
207 case STATE_SYNCHRONIZING: return "synchronizing";
208 case STATE_ELECTING: return "electing";
209 case STATE_LEADER: return "leader";
210 case STATE_PEON: return "peon";
211 case STATE_SHUTDOWN: return "shutdown";
212 default: return "???";
213 }
214 }
215 const char *get_state_name() const {
216 return get_state_name(state);
217 }
218
219 bool is_shutdown() const { return state == STATE_SHUTDOWN; }
220 bool is_probing() const { return state == STATE_PROBING; }
221 bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
222 bool is_electing() const { return state == STATE_ELECTING; }
223 bool is_leader() const { return state == STATE_LEADER; }
224 bool is_peon() const { return state == STATE_PEON; }
225
226 const utime_t &get_leader_since() const;
227
228 void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
229
230 // -- elector --
231 private:
232 Paxos *paxos;
233 Elector elector;
234 friend class Elector;
235
236 /// features we require of peers (based on on-disk compatset)
237 uint64_t required_features;
238
239 int leader; // current leader (to best of knowledge)
240 set<int> quorum; // current active set of monitors (if !starting)
241 utime_t leader_since; // when this monitor became the leader, if it is the leader
242 utime_t exited_quorum; // time detected as not in quorum; 0 if in
243
244 // map of counts of connected clients, by type and features, for
245 // each quorum mon
246 map<int,FeatureMap> quorum_feature_map;
247
248 /**
249 * Intersection of quorum member's connection feature bits.
250 */
251 uint64_t quorum_con_features;
252 /**
253 * Intersection of quorum members mon-specific feature bits
254 */
255 mon_feature_t quorum_mon_features;
256
257 set<string> outside_quorum;
258
259 /**
260 * @defgroup Monitor_h_scrub
261 * @{
262 */
263 version_t scrub_version; ///< paxos version we are scrubbing
264 map<int,ScrubResult> scrub_result; ///< results so far
265
266 /**
267 * trigger a cross-mon scrub
268 *
269 * Verify all mons are storing identical content
270 */
271 int scrub_start();
272 int scrub();
273 void handle_scrub(MonOpRequestRef op);
274 bool _scrub(ScrubResult *r,
275 pair<string,string> *start,
276 int *num_keys);
277 void scrub_check_results();
278 void scrub_timeout();
279 void scrub_finish();
280 void scrub_reset();
281 void scrub_update_interval(int secs);
282
283 Context *scrub_event; ///< periodic event to trigger scrub (leader)
284 Context *scrub_timeout_event; ///< scrub round timeout (leader)
285 void scrub_event_start();
286 void scrub_event_cancel();
287 void scrub_reset_timeout();
288 void scrub_cancel_timeout();
289
/// Progress marker for a scrub: the last key handled and whether the scrub
/// is done. A freshly constructed state is not finished and has an empty key.
struct ScrubState {
  pair<string,string> last_key;  ///< last scrubbed key
  bool finished = false;         ///< false until the scrub is marked done

  ScrubState() = default;
  virtual ~ScrubState() = default;
};
297 ceph::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
298
  /**
   * @} // Monitor_h_scrub
   */

  /**
   * @defgroup Monitor_h_sync Synchronization
   * @{
   */

  // -- provider state --
306 struct SyncProvider {
307 entity_inst_t entity; ///< who
308 uint64_t cookie; ///< unique cookie for this sync attempt
309 utime_t timeout; ///< when we give up and expire this attempt
310 version_t last_committed; ///< last paxos version on peer
311 pair<string,string> last_key; ///< last key sent to (or on) peer
312 bool full; ///< full scan?
313 MonitorDBStore::Synchronizer synchronizer; ///< iterator
314
315 SyncProvider() : cookie(0), last_committed(0), full(false) {}
316
317 void reset_timeout(CephContext *cct, int grace) {
318 timeout = ceph_clock_now();
319 timeout += grace;
320 }
321 };
322
323 map<uint64_t, SyncProvider> sync_providers; ///< cookie -> SyncProvider for those syncing from us
324 uint64_t sync_provider_count; ///< counter for issued cookies to keep them unique
325
  // -- requester state --
329 entity_inst_t sync_provider; ///< who we are syncing from
330 uint64_t sync_cookie; ///< 0 if we are starting, non-zero otherwise
331 bool sync_full; ///< true if we are a full sync, false for recent catch-up
332 version_t sync_start_version; ///< last_committed at sync start
333 Context *sync_timeout_event; ///< timeout event
334
335 /**
336 * floor for sync source
337 *
338 * When we sync we forget about our old last_committed value which
339 * can be dangerous. For example, if we have a cluster of:
340 *
341 * mon.a: lc 100
342 * mon.b: lc 80
343 * mon.c: lc 100 (us)
344 *
345 * If something forces us to sync (say, corruption, or manual
346 * intervention, or bug), we forget last_committed, and might abort.
347 * If mon.a happens to be down when we come back, we will see:
348 *
349 * mon.b: lc 80
350 * mon.c: lc 0 (us)
351 *
352 * and sync from mon.b, at which point a+b will both have lc 80 and
353 * come online with a majority holding out of date commits.
354 *
355 * Avoid this by preserving our old last_committed value prior to
356 * sync and never going backwards.
357 */
358 version_t sync_last_committed_floor;
359
360 /**
361 * Obtain the synchronization target prefixes in set form.
362 *
363 * We consider a target prefix all those that are relevant when
364 * synchronizing two stores. That is, all those that hold paxos service's
365 * versions, as well as paxos versions, or any control keys such as the
366 * first or last committed version.
367 *
368 * Given the current design, this function should return the name of all and
369 * any available paxos service, plus the paxos name.
370 *
371 * @returns a set of strings referring to the prefixes being synchronized
372 */
373 set<string> get_sync_targets_names();
374
375 /**
376 * Reset the monitor's sync-related data structures for syncing *from* a peer
377 */
378 void sync_reset_requester();
379
380 /**
381 * Reset sync state related to allowing others to sync from us
382 */
383 void sync_reset_provider();
384
385 /**
   * Called when a sync attempt times out (requester-side)
387 */
388 void sync_timeout();
389
390 /**
391 * Get the latest monmap for backup purposes during sync
392 */
393 void sync_obtain_latest_monmap(bufferlist &bl);
394
395 /**
396 * Start sync process
397 *
398 * Start pulling committed state from another monitor.
399 *
400 * @param entity where to pull committed state from
401 * @param full whether to do a full sync or just catch up on recent paxos
402 */
403 void sync_start(entity_inst_t &entity, bool full);
404
405 public:
406 /**
407 * force a sync on next mon restart
408 */
409 void sync_force(Formatter *f, ostream& ss);
410
411 private:
412 /**
413 * store critical state for safekeeping during sync
414 *
415 * We store a few things on the side that we don't want to get clobbered by sync. This
416 * includes the latest monmap and a lower bound on last_committed.
417 */
418 void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
419
420 /**
421 * reset the sync timeout
422 *
423 * This is used on the client to restart if things aren't progressing
424 */
425 void sync_reset_timeout();
426
427 /**
428 * trim stale sync provider state
429 *
430 * If someone is syncing from us and hasn't talked to us recently, expire their state.
431 */
432 void sync_trim_providers();
433
434 /**
435 * Complete a sync
436 *
437 * Finish up a sync after we've gotten all of the chunks.
438 *
439 * @param last_committed final last_committed value from provider
440 */
441 void sync_finish(version_t last_committed);
442
443 /**
444 * request the next chunk from the provider
445 */
446 void sync_get_next_chunk();
447
448 /**
449 * handle sync message
450 *
451 * @param m Sync message with operation type MMonSync::OP_START_CHUNKS
452 */
453 void handle_sync(MonOpRequestRef op);
454
455 void _sync_reply_no_cookie(MonOpRequestRef op);
456
457 void handle_sync_get_cookie(MonOpRequestRef op);
458 void handle_sync_get_chunk(MonOpRequestRef op);
459 void handle_sync_finish(MonOpRequestRef op);
460
461 void handle_sync_cookie(MonOpRequestRef op);
462 void handle_sync_forward(MonOpRequestRef op);
463 void handle_sync_chunk(MonOpRequestRef op);
464 void handle_sync_no_cookie(MonOpRequestRef op);
465
466 /**
467 * @} // Synchronization
468 */
469
470 list<Context*> waitfor_quorum;
471 list<Context*> maybe_wait_for_quorum;
472
473 /**
474 * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
475 * @{
476 *
   * We use time checks to keep track of any clock drift going on in the
   * cluster. This is accomplished by periodically pinging each monitor in
   * the quorum and registering its response time in a map, assessing how
   * much its clock has drifted. We also take this opportunity to assess the
   * latency of the response.
482 *
483 * This mechanism works as follows:
484 *
485 * - Leader sends out a 'PING' message to each other monitor in the quorum.
486 * The message is timestamped with the leader's current time. The leader's
487 * current time is recorded in a map, associated with each peon's
488 * instance.
489 * - The peon replies to the leader with a timestamped 'PONG' message.
490 * - The leader calculates a delta between the peon's timestamp and its
491 * current time and stashes it.
492 * - The leader also calculates the time it took to receive the 'PONG'
493 * since the 'PING' was sent, and stashes an approximate latency estimate.
494 * - Once all the quorum members have pong'ed, the leader will share the
495 * clock skew and latency maps with all the monitors in the quorum.
496 */
497 map<entity_inst_t, utime_t> timecheck_waiting;
498 map<entity_inst_t, double> timecheck_skews;
499 map<entity_inst_t, double> timecheck_latencies;
500 // odd value means we are mid-round; even value means the round has
501 // finished.
502 version_t timecheck_round;
503 unsigned int timecheck_acks;
504 utime_t timecheck_round_start;
505 friend class HealthMonitor;
506 /* When we hit a skew we will start a new round based off of
507 * 'mon_timecheck_skew_interval'. Each new round will be backed off
508 * until we hit 'mon_timecheck_interval' -- which is the typical
509 * interval when not in the presence of a skew.
510 *
511 * This variable tracks the number of rounds with skews since last clean
512 * so that we can report to the user and properly adjust the backoff.
513 */
514 uint64_t timecheck_rounds_since_clean;
515 /**
516 * Time Check event.
517 */
518 Context *timecheck_event;
519
520 void timecheck_start();
521 void timecheck_finish();
522 void timecheck_start_round();
523 void timecheck_finish_round(bool success = true);
524 void timecheck_cancel_round();
525 void timecheck_cleanup();
526 void timecheck_reset_event();
527 void timecheck_check_skews();
528 void timecheck_report();
529 void timecheck();
530 health_status_t timecheck_status(ostringstream &ss,
531 const double skew_bound,
532 const double latency);
533 void handle_timecheck_leader(MonOpRequestRef op);
534 void handle_timecheck_peon(MonOpRequestRef op);
535 void handle_timecheck(MonOpRequestRef op);
536
537 /**
538 * Returns 'true' if this is considered to be a skew; 'false' otherwise.
539 */
540 bool timecheck_has_skew(const double skew_bound, double *abs) const {
541 double abs_skew = std::fabs(skew_bound);
542 if (abs)
543 *abs = abs_skew;
544 return (abs_skew > g_conf->mon_clock_drift_allowed);
545 }
546
547 /**
548 * @}
549 */
550 /**
551 * Handle ping messages from others.
552 */
553 void handle_ping(MonOpRequestRef op);
554
555 Context *probe_timeout_event = nullptr; // for probing
556
557 void reset_probe_timeout();
558 void cancel_probe_timeout();
559 void probe_timeout(int r);
560
561 void _apply_compatset_features(CompatSet &new_features);
562
563 public:
564 epoch_t get_epoch();
565 int get_leader() const { return leader; }
566 string get_leader_name() {
567 return quorum.empty() ? string() : monmap->get_name(*quorum.begin());
568 }
569 const set<int>& get_quorum() const { return quorum; }
570 list<string> get_quorum_names() {
571 list<string> q;
572 for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
573 q.push_back(monmap->get_name(*p));
574 return q;
575 }
576 uint64_t get_quorum_con_features() const {
577 return quorum_con_features;
578 }
579 mon_feature_t get_quorum_mon_features() const {
580 return quorum_mon_features;
581 }
582 uint64_t get_required_features() const {
583 return required_features;
584 }
585 mon_feature_t get_required_mon_features() const {
586 return monmap->get_required_features();
587 }
588 void apply_quorum_to_compatset_features();
589 void apply_monmap_to_compatset_features();
590 void calc_quorum_requirements();
591
592 void get_combined_feature_map(FeatureMap *fm);
593
594 private:
595 void _reset(); ///< called from bootstrap, start_, or join_election
596 void wait_for_paxos_write();
597 void _finish_svc_election(); ///< called by {win,lose}_election
598 public:
599 void bootstrap();
600 void join_election();
601 void start_election();
602 void win_standalone_election();
603 // end election (called by Elector)
604 void win_election(epoch_t epoch, set<int>& q,
605 uint64_t features,
606 const mon_feature_t& mon_features,
607 const map<int,Metadata>& metadata);
608 void lose_election(epoch_t epoch, set<int>& q, int l,
609 uint64_t features,
610 const mon_feature_t& mon_features);
611 // end election (called by Elector)
612 void finish_election();
613
614 void update_logger();
615
616 /**
617 * Vector holding the Services serviced by this Monitor.
618 */
619 vector<PaxosService*> paxos_service;
620
621 class PGMonitor *pgmon() {
622 return (class PGMonitor *)paxos_service[PAXOS_PGMAP];
623 }
624
625 class MDSMonitor *mdsmon() {
626 return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP];
627 }
628
629 class MonmapMonitor *monmon() {
630 return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP];
631 }
632
633 class OSDMonitor *osdmon() {
634 return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP];
635 }
636
637 class AuthMonitor *authmon() {
638 return (class AuthMonitor *)paxos_service[PAXOS_AUTH];
639 }
640
641 class LogMonitor *logmon() {
642 return (class LogMonitor*) paxos_service[PAXOS_LOG];
643 }
644
645 class MgrMonitor *mgrmon() {
646 return (class MgrMonitor*) paxos_service[PAXOS_MGR];
647 }
648
649 class MgrStatMonitor *mgrstatmon() {
650 return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT];
651 }
652
653 class HealthMonitor *healthmon() {
654 return (class HealthMonitor*) paxos_service[PAXOS_HEALTH];
655 }
656
657 friend class Paxos;
658 friend class OSDMonitor;
659 friend class MDSMonitor;
660 friend class MonmapMonitor;
661 friend class PGMonitor;
662 friend class LogMonitor;
663 friend class ConfigKeyService;
664
665 QuorumService *health_monitor;
666 QuorumService *config_key_service;
667
668 // -- sessions --
669 MonSessionMap session_map;
670 Mutex session_map_lock{"Monitor::session_map_lock"};
671 AdminSocketHook *admin_hook;
672
673 template<typename Func, typename...Args>
674 void with_session_map(Func&& func) {
675 Mutex::Locker l(session_map_lock);
676 std::forward<Func>(func)(session_map);
677 }
678 void send_latest_monmap(Connection *con);
679
680 // messages
681 void handle_get_version(MonOpRequestRef op);
682 void handle_subscribe(MonOpRequestRef op);
683 void handle_mon_get_map(MonOpRequestRef op);
684
685 static void _generate_command_map(map<string,cmd_vartype>& cmdmap,
686 map<string,string> &param_str_map);
687 static const MonCommand *_get_moncommand(
688 const string &cmd_prefix,
689 const vector<MonCommand>& cmds);
690 bool _allowed_command(MonSession *s, string &module, string &prefix,
691 const map<string,cmd_vartype>& cmdmap,
692 const map<string,string>& param_str_map,
693 const MonCommand *this_cmd);
694 void get_mon_status(Formatter *f, ostream& ss);
695 void _quorum_status(Formatter *f, ostream& ss);
696 bool _add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss);
697 void handle_command(MonOpRequestRef op);
698 void handle_route(MonOpRequestRef op);
699
700 void handle_mon_metadata(MonOpRequestRef op);
701 int get_mon_metadata(int mon, Formatter *f, ostream& err);
702 int print_nodes(Formatter *f, ostream& err);
703
704 // Accumulate metadata across calls to update_mon_metadata
705 map<int, Metadata> mon_metadata;
706 map<int, Metadata> pending_metadata;
707
708 /**
709 *
710 */
711 struct health_cache_t {
712 health_status_t overall;
713 string summary;
714
715 void reset() {
716 // health_status_t doesn't really have a NONE value and we're not
717 // okay with setting something else (say, HEALTH_ERR). so just
718 // leave it be.
719 summary.clear();
720 }
721 } health_status_cache;
722
723 Context *health_tick_event = nullptr;
724 Context *health_interval_event = nullptr;
725
726 void health_tick_start();
727 void health_tick_stop();
728 utime_t health_interval_calc_next_update();
729 void health_interval_start();
730 void health_interval_stop();
731 void health_events_cleanup();
732
733 void health_to_clog_update_conf(const std::set<std::string> &changed);
734
735 void do_health_to_clog_interval();
736 void do_health_to_clog(bool force = false);
737
738 /**
739 * Generate health report
740 *
741 * @param status one-line status summary
742 * @param detailbl optional bufferlist* to fill with a detailed report
743 * @returns health status
744 */
745 health_status_t get_health(list<string>& status, bufferlist *detailbl,
746 Formatter *f);
747
748 health_status_t get_health_status(
749 bool want_detail,
750 Formatter *f,
751 std::string *plain,
752 const char *sep1 = " ",
753 const char *sep2 = "; ");
754 void log_health(
755 const health_check_map_t& updated,
756 const health_check_map_t& previous,
757 MonitorDBStore::TransactionRef t);
758
759 protected:
760
761 class HealthCheckLogStatus {
762 public:
763 health_status_t severity;
764 std::string last_message;
765 utime_t updated_at = 0;
766 HealthCheckLogStatus(health_status_t severity_,
767 const std::string &last_message_,
768 utime_t updated_at_)
769 : severity(severity_),
770 last_message(last_message_),
771 updated_at(updated_at_)
772 {}
773 };
774 std::map<std::string, HealthCheckLogStatus> health_check_log_times;
775
776 public:
777
778 void get_cluster_status(stringstream &ss, Formatter *f);
779
780 void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
781 void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version);
782
783
784 void handle_probe(MonOpRequestRef op);
785 /**
786 * Handle a Probe Operation, replying with our name, quorum and known versions.
787 *
788 * We use the MMonProbe message class for anything and everything related with
789 * Monitor probing. One of the operations relates directly with the probing
790 * itself, in which we receive a probe request and to which we reply with
791 * our name, our quorum and the known versions for each Paxos service. Thus the
792 * redundant function name. This reply will obviously be sent to the one
793 * probing/requesting these infos.
794 *
795 * @todo Add @pre and @post
796 *
797 * @param m A Probe message, with an operation of type Probe.
798 */
799 void handle_probe_probe(MonOpRequestRef op);
800 void handle_probe_reply(MonOpRequestRef op);
801
802 // request routing
803 struct RoutedRequest {
804 uint64_t tid;
805 bufferlist request_bl;
806 MonSession *session;
807 ConnectionRef con;
808 uint64_t con_features;
809 entity_inst_t client_inst;
810 MonOpRequestRef op;
811
812 RoutedRequest() : tid(0), session(NULL), con_features(0) {}
813 ~RoutedRequest() {
814 if (session)
815 session->put();
816 }
817 };
818 uint64_t routed_request_tid;
819 map<uint64_t, RoutedRequest*> routed_requests;
820
821 void forward_request_leader(MonOpRequestRef op);
822 void handle_forward(MonOpRequestRef op);
823 void try_send_message(Message *m, const entity_inst_t& to);
824 void send_reply(MonOpRequestRef op, Message *reply);
825 void no_reply(MonOpRequestRef op);
826 void resend_routed_requests();
827 void remove_session(MonSession *s);
828 void remove_all_sessions();
829 void waitlist_or_zap_client(MonOpRequestRef op);
830
831 void send_command(const entity_inst_t& inst,
832 const vector<string>& com);
833
834 public:
835 struct C_Command : public C_MonOp {
836 Monitor *mon;
837 int rc;
838 string rs;
839 bufferlist rdata;
840 version_t version;
841 C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) :
842 C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){}
843 C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) :
844 C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){}
845
846 void _finish(int r) override {
847 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
848 if (r >= 0) {
849 ostringstream ss;
850 if (!op->get_req()->get_connection()) {
851 ss << "connection dropped for command ";
852 } else {
853 MonSession *s = op->get_session();
854
855 // if client drops we may not have a session to draw information from.
856 if (s) {
857 ss << "from='" << s->inst << "' "
858 << "entity='" << s->entity_name << "' ";
859 } else {
860 ss << "session dropped for command ";
861 }
862 }
863 ss << "cmd='" << m->cmd << "': finished";
864
865 mon->audit_clog->info() << ss.str();
866 mon->reply_command(op, rc, rs, rdata, version);
867 }
868 else if (r == -ECANCELED)
869 return;
870 else if (r == -EAGAIN)
871 mon->dispatch_op(op);
872 else
873 assert(0 == "bad C_Command return value");
874 }
875 };
876
877 private:
878 class C_RetryMessage : public C_MonOp {
879 Monitor *mon;
880 public:
881 C_RetryMessage(Monitor *m, MonOpRequestRef op) :
882 C_MonOp(op), mon(m) { }
883
884 void _finish(int r) override {
885 if (r == -EAGAIN || r >= 0)
886 mon->dispatch_op(op);
887 else if (r == -ECANCELED)
888 return;
889 else
890 assert(0 == "bad C_RetryMessage return value");
891 }
892 };
893
894 //ms_dispatch handles a lot of logic and we want to reuse it
895 //on forwarded messages, so we create a non-locking version for this class
896 void _ms_dispatch(Message *m);
897 bool ms_dispatch(Message *m) override {
898 lock.Lock();
899 _ms_dispatch(m);
900 lock.Unlock();
901 return true;
902 }
903 void dispatch_op(MonOpRequestRef op);
904 //mon_caps is used for un-connected messages from monitors
905 MonCap * mon_caps;
906 bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
907 bool ms_verify_authorizer(Connection *con, int peer_type,
908 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
909 bool& isvalid, CryptoKey& session_key,
910 std::unique_ptr<AuthAuthorizerChallenge> *challenge) override;
911 bool ms_handle_reset(Connection *con) override;
912 void ms_handle_remote_reset(Connection *con) override {}
913 bool ms_handle_refused(Connection *con) override;
914
915 int write_default_keyring(bufferlist& bl);
916 void extract_save_mon_key(KeyRing& keyring);
917
918 void collect_metadata(Metadata *m);
919 void update_mon_metadata(int from, Metadata&& m);
920 int load_metadata();
921 void count_metadata(const string& field, Formatter *f);
922 void count_metadata(const string& field, map<string,int> *out);
923
924 // features
925 static CompatSet get_initial_supported_features();
926 static CompatSet get_supported_features();
927 static CompatSet get_legacy_features();
928 /// read the ondisk features into the CompatSet pointed to by read_features
929 static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
930 void read_features();
931 void write_features(MonitorDBStore::TransactionRef t);
932
933 OpTracker op_tracker;
934
935 public:
936 Monitor(CephContext *cct_, string nm, MonitorDBStore *s,
937 Messenger *m, Messenger *mgr_m, MonMap *map);
938 ~Monitor() override;
939
940 static int check_features(MonitorDBStore *store);
941
942 // config observer
943 const char** get_tracked_conf_keys() const override;
944 void handle_conf_change(const struct md_config_t *conf,
945 const std::set<std::string> &changed) override;
946
947 void update_log_clients();
948 int sanitize_options();
949 int preinit();
950 int init();
951 void init_paxos();
952 void refresh_from_paxos(bool *need_bootstrap);
953 void shutdown();
954 void tick();
955
956 void handle_signal(int sig);
957
958 int mkfs(bufferlist& osdmapbl);
959
960 /**
961 * check cluster_fsid file
962 *
963 * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
964 */
965 int check_fsid();
966
967 /**
968 * write cluster_fsid file
969 *
970 * @return 0 on success, or negative error code
971 */
972 int write_fsid();
973 int write_fsid(MonitorDBStore::TransactionRef t);
974
975 void do_admin_command(std::string command, cmdmap_t& cmdmap,
976 std::string format, ostream& ss);
977
978 private:
979 // don't allow copying
980 Monitor(const Monitor& rhs);
981 Monitor& operator=(const Monitor &rhs);
982
983 public:
984 static void format_command_descriptions(const std::vector<MonCommand> &commands,
985 Formatter *f,
986 bufferlist *rdata,
987 bool hide_mgr_flag=false);
988
989 const std::vector<MonCommand> &get_local_commands(mon_feature_t f) {
990 if (f.contains_all(ceph::features::mon::FEATURE_LUMINOUS))
991 return local_mon_commands;
992 else
993 return local_upgrading_mon_commands;
994 }
995 const bufferlist& get_local_commands_bl(mon_feature_t f) {
996 if (f.contains_all(ceph::features::mon::FEATURE_LUMINOUS))
997 return local_mon_commands_bl;
998 else
999 return local_upgrading_mon_commands_bl;
1000 }
1001 void set_leader_commands(const std::vector<MonCommand>& cmds) {
1002 leader_mon_commands = cmds;
1003 }
1004
1005 static bool is_keyring_required();
1006 };
1007
// CompatSet features of the monitor, named CEPH_MON_FEATURE_INCOMPAT_*.
// Each macro expands to a CompatSet::Feature built from a numeric id and a
// human-readable description; the ids are consecutive from 1.
#define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature(1, "initial feature set (~v.18)")
#define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature(2, "global version sequencing (v0.52)")
#define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature(3, "single paxos with k/v store (v0.\?)")
#define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
#define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
#define CEPH_MON_FEATURE_INCOMPAT_LUMINOUS CompatSet::Feature(9, "luminous ondisk layout")
// make sure you add your feature to Monitor::get_supported_features
1018
1019
1020
1021 #endif