]> git.proxmox.com Git - ceph.git/blame - ceph/src/mon/Monitor.h
update sources to v12.1.1
[ceph.git] / ceph / src / mon / Monitor.h
CommitLineData
7c673cae
FG
1// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2// vim: ts=8 sw=2 smarttab
3/*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15/*
16 * This is the top level monitor. It runs on each machine in the Monitor
17 * Cluster. The election of a leader for the paxos algorithm only happens
18 * once per machine via the elector. There is a separate paxos instance (state)
19 * kept for each of the system components: Object Store Device (OSD) Monitor,
20 * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
21 */
22
23#ifndef CEPH_MONITOR_H
24#define CEPH_MONITOR_H
25
26#include <errno.h>
27#include <cmath>
28
29#include "include/types.h"
224ce89b 30#include "include/health.h"
7c673cae
FG
31#include "msg/Messenger.h"
32
33#include "common/Timer.h"
34
224ce89b 35#include "health_check.h"
7c673cae
FG
36#include "MonMap.h"
37#include "Elector.h"
38#include "Paxos.h"
39#include "Session.h"
31f18b77 40#include "PGStatService.h"
7c673cae
FG
41
42#include "common/LogClient.h"
43#include "auth/cephx/CephxKeyServer.h"
44#include "auth/AuthMethodList.h"
45#include "auth/KeyRing.h"
46#include "messages/MMonCommand.h"
47#include "mon/MonitorDBStore.h"
48#include "include/memory.h"
49#include "mgr/MgrClient.h"
50
51#include "mon/MonOpRequest.h"
52#include "common/WorkQueue.h"
53
54
55#define CEPH_MON_PROTOCOL 13 /* cluster internal */
56
57
// Identifiers for the cluster-level perf counters.  Values are consecutive,
// starting at 555000; l_cluster_first and l_cluster_last are the bounding
// sentinels of the range.
enum {
  l_cluster_first = 555000,

  // monitors
  l_cluster_num_mon,
  l_cluster_num_mon_quorum,

  // OSDs
  l_cluster_num_osd,
  l_cluster_num_osd_up,
  l_cluster_num_osd_in,
  l_cluster_osd_epoch,
  l_cluster_osd_bytes,
  l_cluster_osd_bytes_used,
  l_cluster_osd_bytes_avail,

  // pools and placement groups
  l_cluster_num_pool,
  l_cluster_num_pg,
  l_cluster_num_pg_active_clean,
  l_cluster_num_pg_active,
  l_cluster_num_pg_peering,

  // objects
  l_cluster_num_object,
  l_cluster_num_object_degraded,
  l_cluster_num_object_misplaced,
  l_cluster_num_object_unfound,
  l_cluster_num_bytes,

  // MDS
  l_cluster_num_mds_up,
  l_cluster_num_mds_in,
  l_cluster_num_mds_failed,
  l_cluster_mds_epoch,

  l_cluster_last,
};
85
// Identifiers for the per-monitor perf counters.  Values are consecutive,
// starting at 456000; l_mon_first and l_mon_last are the bounding sentinels.
enum {
  l_mon_first = 456000,

  // sessions
  l_mon_num_sessions,
  l_mon_session_add,
  l_mon_session_rm,
  l_mon_session_trim,

  // elections
  l_mon_num_elections,
  l_mon_election_call,
  l_mon_election_win,
  l_mon_election_lose,

  l_mon_last,
};
98
99class QuorumService;
100class PaxosService;
101
102class PerfCounters;
103class AdminSocketHook;
104
105class MMonGetMap;
106class MMonGetVersion;
107class MMonMetadata;
108class MMonSync;
109class MMonScrub;
110class MMonProbe;
111struct MMonSubscribe;
112struct MRoute;
113struct MForward;
114struct MTimeCheck;
115struct MMonHealth;
116struct MonCommand;
117
118#define COMPAT_SET_LOC "feature_set"
119
120class C_MonContext final : public FunctionContext {
121 const Monitor *mon;
122public:
123 explicit C_MonContext(Monitor *m, boost::function<void(int)>&& callback)
124 : FunctionContext(std::move(callback)), mon(m) {}
125 void finish(int r) override;
126};
127
128class Monitor : public Dispatcher,
129 public md_config_obs_t {
130public:
131 // me
132 string name;
133 int rank;
134 Messenger *messenger;
135 ConnectionRef con_self;
136 Mutex lock;
137 SafeTimer timer;
138 Finisher finisher;
139 ThreadPool cpu_tp; ///< threadpool for CPU intensive work
140
141 /// true if we have ever joined a quorum. if false, we are either a
142 /// new cluster, a newly joining monitor, or a just-upgraded
143 /// monitor.
144 bool has_ever_joined;
145
146 PerfCounters *logger, *cluster_logger;
147 bool cluster_logger_registered;
148
149 void register_cluster_logger();
150 void unregister_cluster_logger();
151
152 MonMap *monmap;
153 uuid_d fingerprint;
154
155 set<entity_addr_t> extra_probe_peers;
156
157 LogClient log_client;
158 LogChannelRef clog;
159 LogChannelRef audit_clog;
160 KeyRing keyring;
161 KeyServer key_server;
162
163 AuthMethodList auth_cluster_required;
164 AuthMethodList auth_service_required;
165
166 CompatSet features;
167
168 const MonCommand *leader_supported_mon_commands;
169 int leader_supported_mon_commands_size;
170
171 Messenger *mgr_messenger;
172 MgrClient mgr_client;
173 uint64_t mgr_proxy_bytes = 0; // in-flight proxied mgr command message bytes
174
31f18b77
FG
175 const MonPGStatService *pgservice;
176
7c673cae
FG
177private:
178 void new_tick();
179
180 // -- local storage --
181public:
182 MonitorDBStore *store;
183 static const string MONITOR_NAME;
184 static const string MONITOR_STORE_PREFIX;
185
186 // -- monitor state --
187private:
// Monitor states.  Numbering starts at 1 and is consecutive.
enum {
  STATE_PROBING       = 1,
  STATE_SYNCHRONIZING = 2,
  STATE_ELECTING      = 3,
  STATE_LEADER        = 4,
  STATE_PEON          = 5,
  STATE_SHUTDOWN      = 6
};
int state;  ///< current state; compared against the STATE_* values above
197
198public:
199 static const char *get_state_name(int s) {
200 switch (s) {
201 case STATE_PROBING: return "probing";
202 case STATE_SYNCHRONIZING: return "synchronizing";
203 case STATE_ELECTING: return "electing";
204 case STATE_LEADER: return "leader";
205 case STATE_PEON: return "peon";
206 case STATE_SHUTDOWN: return "shutdown";
207 default: return "???";
208 }
209 }
210 const char *get_state_name() const {
211 return get_state_name(state);
212 }
213
214 bool is_shutdown() const { return state == STATE_SHUTDOWN; }
215 bool is_probing() const { return state == STATE_PROBING; }
216 bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
217 bool is_electing() const { return state == STATE_ELECTING; }
218 bool is_leader() const { return state == STATE_LEADER; }
219 bool is_peon() const { return state == STATE_PEON; }
220
221 const utime_t &get_leader_since() const;
222
223 void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
224
225 // -- elector --
226private:
227 Paxos *paxos;
228 Elector elector;
229 friend class Elector;
230
231 /// features we require of peers (based on on-disk compatset)
232 uint64_t required_features;
233
234 int leader; // current leader (to best of knowledge)
235 set<int> quorum; // current active set of monitors (if !starting)
236 utime_t leader_since; // when this monitor became the leader, if it is the leader
237 utime_t exited_quorum; // time detected as not in quorum; 0 if in
31f18b77
FG
238
239 // map of counts of connected clients, by type and features, for
240 // each quorum mon
241 map<int,FeatureMap> quorum_feature_map;
242
7c673cae
FG
243 /**
244 * Intersection of quorum members' connection feature bits.
245 */
246 uint64_t quorum_con_features;
247 /**
248 * Intersection of quorum members' mon-specific feature bits.
249 */
250 mon_feature_t quorum_mon_features;
251 bufferlist supported_commands_bl; // encoded MonCommands we support
252
253 set<string> outside_quorum;
254
255 /**
256 * @defgroup Monitor_h_scrub
257 * @{
258 */
259 version_t scrub_version; ///< paxos version we are scrubbing
260 map<int,ScrubResult> scrub_result; ///< results so far
261
262 /**
263 * trigger a cross-mon scrub
264 *
265 * Verify all mons are storing identical content
266 */
267 int scrub_start();
268 int scrub();
269 void handle_scrub(MonOpRequestRef op);
270 bool _scrub(ScrubResult *r,
271 pair<string,string> *start,
272 int *num_keys);
273 void scrub_check_results();
274 void scrub_timeout();
275 void scrub_finish();
276 void scrub_reset();
277 void scrub_update_interval(int secs);
278
279 Context *scrub_event; ///< periodic event to trigger scrub (leader)
280 Context *scrub_timeout_event; ///< scrub round timeout (leader)
281 void scrub_event_start();
282 void scrub_event_cancel();
283 void scrub_reset_timeout();
284 void scrub_cancel_timeout();
285
/// Bookkeeping for a scrub that is in progress.
struct ScrubState {
  std::pair<std::string,std::string> last_key;  ///< last scrubbed key
  bool finished = false;                        ///< a new state starts out unfinished

  ScrubState() {}
  virtual ~ScrubState() {}
};
293 ceph::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
294
295 /**
296 * @defgroup Monitor_h_sync Synchronization
297 * @{
298 */
299 /**
300 * @} // provider state
301 */
302 struct SyncProvider {
303 entity_inst_t entity; ///< who
304 uint64_t cookie; ///< unique cookie for this sync attempt
305 utime_t timeout; ///< when we give up and expire this attempt
306 version_t last_committed; ///< last paxos version on peer
307 pair<string,string> last_key; ///< last key sent to (or on) peer
308 bool full; ///< full scan?
309 MonitorDBStore::Synchronizer synchronizer; ///< iterator
310
311 SyncProvider() : cookie(0), last_committed(0), full(false) {}
312
313 void reset_timeout(CephContext *cct, int grace) {
314 timeout = ceph_clock_now();
315 timeout += grace;
316 }
317 };
318
319 map<uint64_t, SyncProvider> sync_providers; ///< cookie -> SyncProvider for those syncing from us
320 uint64_t sync_provider_count; ///< counter for issued cookies to keep them unique
321
322 /**
323 * @} // requester state
324 */
325 entity_inst_t sync_provider; ///< who we are syncing from
326 uint64_t sync_cookie; ///< 0 if we are starting, non-zero otherwise
327 bool sync_full; ///< true if we are a full sync, false for recent catch-up
328 version_t sync_start_version; ///< last_committed at sync start
329 Context *sync_timeout_event; ///< timeout event
330
331 /**
332 * floor for sync source
333 *
334 * When we sync we forget about our old last_committed value which
335 * can be dangerous. For example, if we have a cluster of:
336 *
337 * mon.a: lc 100
338 * mon.b: lc 80
339 * mon.c: lc 100 (us)
340 *
341 * If something forces us to sync (say, corruption, or manual
342 * intervention, or bug), we forget last_committed, and might abort.
343 * If mon.a happens to be down when we come back, we will see:
344 *
345 * mon.b: lc 80
346 * mon.c: lc 0 (us)
347 *
348 * and sync from mon.b, at which point a+b will both have lc 80 and
349 * come online with a majority holding out of date commits.
350 *
351 * Avoid this by preserving our old last_committed value prior to
352 * sync and never going backwards.
353 */
354 version_t sync_last_committed_floor;
355
356 /**
357 * Obtain the synchronization target prefixes in set form.
358 *
359 * We consider a target prefix all those that are relevant when
360 * synchronizing two stores. That is, all those that hold paxos service's
361 * versions, as well as paxos versions, or any control keys such as the
362 * first or last committed version.
363 *
364 * Given the current design, this function should return the name of all and
365 * any available paxos service, plus the paxos name.
366 *
367 * @returns a set of strings referring to the prefixes being synchronized
368 */
369 set<string> get_sync_targets_names();
370
371 /**
372 * Reset the monitor's sync-related data structures for syncing *from* a peer
373 */
374 void sync_reset_requester();
375
376 /**
377 * Reset sync state related to allowing others to sync from us
378 */
379 void sync_reset_provider();
380
381 /**
382 * Called when a sync attempt times out (requester-side)
383 */
384 void sync_timeout();
385
386 /**
387 * Get the latest monmap for backup purposes during sync
388 */
389 void sync_obtain_latest_monmap(bufferlist &bl);
390
391 /**
392 * Start sync process
393 *
394 * Start pulling committed state from another monitor.
395 *
396 * @param entity where to pull committed state from
397 * @param full whether to do a full sync or just catch up on recent paxos
398 */
399 void sync_start(entity_inst_t &entity, bool full);
400
401public:
402 /**
403 * force a sync on next mon restart
404 */
405 void sync_force(Formatter *f, ostream& ss);
406
407private:
408 /**
409 * store critical state for safekeeping during sync
410 *
411 * We store a few things on the side that we don't want to get clobbered by sync. This
412 * includes the latest monmap and a lower bound on last_committed.
413 */
414 void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
415
416 /**
417 * reset the sync timeout
418 *
419 * This is used on the client to restart if things aren't progressing
420 */
421 void sync_reset_timeout();
422
423 /**
424 * trim stale sync provider state
425 *
426 * If someone is syncing from us and hasn't talked to us recently, expire their state.
427 */
428 void sync_trim_providers();
429
430 /**
431 * Complete a sync
432 *
433 * Finish up a sync after we've gotten all of the chunks.
434 *
435 * @param last_committed final last_committed value from provider
436 */
437 void sync_finish(version_t last_committed);
438
439 /**
440 * request the next chunk from the provider
441 */
442 void sync_get_next_chunk();
443
444 /**
445 * handle sync message
446 *
447 * @param m Sync message with operation type MMonSync::OP_START_CHUNKS
448 */
449 void handle_sync(MonOpRequestRef op);
450
451 void _sync_reply_no_cookie(MonOpRequestRef op);
452
453 void handle_sync_get_cookie(MonOpRequestRef op);
454 void handle_sync_get_chunk(MonOpRequestRef op);
455 void handle_sync_finish(MonOpRequestRef op);
456
457 void handle_sync_cookie(MonOpRequestRef op);
458 void handle_sync_forward(MonOpRequestRef op);
459 void handle_sync_chunk(MonOpRequestRef op);
460 void handle_sync_no_cookie(MonOpRequestRef op);
461
462 /**
463 * @} // Synchronization
464 */
465
466 list<Context*> waitfor_quorum;
467 list<Context*> maybe_wait_for_quorum;
468
469 /**
470 * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
471 * @{
472 *
473 * We use time checks to keep track of any clock drifting going on in the
474 * cluster. This is accomplished by periodically pinging each monitor in the
475 * quorum and registering its response time on a map, assessing how much its
476 * clock has drifted. We also take this opportunity to assess the latency
477 * on response.
478 *
479 * This mechanism works as follows:
480 *
481 * - Leader sends out a 'PING' message to each other monitor in the quorum.
482 * The message is timestamped with the leader's current time. The leader's
483 * current time is recorded in a map, associated with each peon's
484 * instance.
485 * - The peon replies to the leader with a timestamped 'PONG' message.
486 * - The leader calculates a delta between the peon's timestamp and its
487 * current time and stashes it.
488 * - The leader also calculates the time it took to receive the 'PONG'
489 * since the 'PING' was sent, and stashes an approximate latency estimate.
490 * - Once all the quorum members have pong'ed, the leader will share the
491 * clock skew and latency maps with all the monitors in the quorum.
492 */
493 map<entity_inst_t, utime_t> timecheck_waiting;
494 map<entity_inst_t, double> timecheck_skews;
495 map<entity_inst_t, double> timecheck_latencies;
496 // odd value means we are mid-round; even value means the round has
497 // finished.
498 version_t timecheck_round;
499 unsigned int timecheck_acks;
500 utime_t timecheck_round_start;
224ce89b 501 friend class HealthMonitor;
7c673cae
FG
502 /* When we hit a skew we will start a new round based off of
503 * 'mon_timecheck_skew_interval'. Each new round will be backed off
504 * until we hit 'mon_timecheck_interval' -- which is the typical
505 * interval when not in the presence of a skew.
506 *
507 * This variable tracks the number of rounds with skews since last clean
508 * so that we can report to the user and properly adjust the backoff.
509 */
510 uint64_t timecheck_rounds_since_clean;
511 /**
512 * Time Check event.
513 */
514 Context *timecheck_event;
515
516 void timecheck_start();
517 void timecheck_finish();
518 void timecheck_start_round();
519 void timecheck_finish_round(bool success = true);
520 void timecheck_cancel_round();
521 void timecheck_cleanup();
522 void timecheck_reset_event();
523 void timecheck_check_skews();
524 void timecheck_report();
525 void timecheck();
526 health_status_t timecheck_status(ostringstream &ss,
527 const double skew_bound,
528 const double latency);
529 void handle_timecheck_leader(MonOpRequestRef op);
530 void handle_timecheck_peon(MonOpRequestRef op);
531 void handle_timecheck(MonOpRequestRef op);
532
533 /**
534 * Returns 'true' if this is considered to be a skew; 'false' otherwise.
535 */
536 bool timecheck_has_skew(const double skew_bound, double *abs) const {
537 double abs_skew = std::fabs(skew_bound);
538 if (abs)
539 *abs = abs_skew;
540 return (abs_skew > g_conf->mon_clock_drift_allowed);
541 }
542
543 /**
544 * @}
545 */
546 /**
547 * Handle ping messages from others.
548 */
549 void handle_ping(MonOpRequestRef op);
550
551 Context *probe_timeout_event = nullptr; // for probing
552
553 void reset_probe_timeout();
554 void cancel_probe_timeout();
555 void probe_timeout(int r);
556
557 void _apply_compatset_features(CompatSet &new_features);
558
559public:
560 epoch_t get_epoch();
561 int get_leader() const { return leader; }
224ce89b
WB
562 string get_leader_name() {
563 return quorum.empty() ? string() : monmap->get_name(*quorum.begin());
564 }
7c673cae
FG
565 const set<int>& get_quorum() const { return quorum; }
566 list<string> get_quorum_names() {
567 list<string> q;
568 for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
569 q.push_back(monmap->get_name(*p));
570 return q;
571 }
572 uint64_t get_quorum_con_features() const {
573 return quorum_con_features;
574 }
575 mon_feature_t get_quorum_mon_features() const {
576 return quorum_mon_features;
577 }
578 uint64_t get_required_features() const {
579 return required_features;
580 }
581 mon_feature_t get_required_mon_features() const {
582 return monmap->get_required_features();
583 }
584 void apply_quorum_to_compatset_features();
585 void apply_monmap_to_compatset_features();
586 void calc_quorum_requirements();
587
31f18b77
FG
588 void get_combined_feature_map(FeatureMap *fm);
589
7c673cae
FG
590private:
591 void _reset(); ///< called from bootstrap, start_, or join_election
592 void wait_for_paxos_write();
593 void _finish_svc_election(); ///< called by {win,lose}_election
594public:
595 void bootstrap();
596 void join_election();
597 void start_election();
598 void win_standalone_election();
599 // end election (called by Elector)
600 void win_election(epoch_t epoch, set<int>& q,
601 uint64_t features,
602 const mon_feature_t& mon_features,
224ce89b 603 const map<int,Metadata>& metadata,
7c673cae
FG
604 const MonCommand *cmdset, int cmdsize);
605 void lose_election(epoch_t epoch, set<int>& q, int l,
606 uint64_t features,
607 const mon_feature_t& mon_features);
608 // end election (called by Elector)
609 void finish_election();
610
611 const bufferlist& get_supported_commands_bl() {
612 return supported_commands_bl;
613 }
614
615 void update_logger();
616
617 /**
618 * Vector holding the Services serviced by this Monitor.
619 */
620 vector<PaxosService*> paxos_service;
621
7c673cae
FG
622 class PGMonitor *pgmon() {
623 return (class PGMonitor *)paxos_service[PAXOS_PGMAP];
624 }
625
626 class MDSMonitor *mdsmon() {
627 return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP];
628 }
629
630 class MonmapMonitor *monmon() {
631 return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP];
632 }
633
634 class OSDMonitor *osdmon() {
635 return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP];
636 }
637
638 class AuthMonitor *authmon() {
639 return (class AuthMonitor *)paxos_service[PAXOS_AUTH];
640 }
641
642 class LogMonitor *logmon() {
643 return (class LogMonitor*) paxos_service[PAXOS_LOG];
644 }
645
646 class MgrMonitor *mgrmon() {
647 return (class MgrMonitor*) paxos_service[PAXOS_MGR];
648 }
649
31f18b77
FG
650 class MgrStatMonitor *mgrstatmon() {
651 return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT];
652 }
653
224ce89b
WB
654 class MgrStatMonitor *healthmon() {
655 return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT];
656 }
657
7c673cae
FG
658 friend class Paxos;
659 friend class OSDMonitor;
660 friend class MDSMonitor;
661 friend class MonmapMonitor;
662 friend class PGMonitor;
663 friend class LogMonitor;
664 friend class ConfigKeyService;
665
666 QuorumService *health_monitor;
667 QuorumService *config_key_service;
668
669 // -- sessions --
670 MonSessionMap session_map;
671 Mutex session_map_lock{"Monitor::session_map_lock"};
672 AdminSocketHook *admin_hook;
673
674 template<typename Func, typename...Args>
675 void with_session_map(Func&& func) {
676 Mutex::Locker l(session_map_lock);
677 std::forward<Func>(func)(session_map);
678 }
679 void send_latest_monmap(Connection *con);
680
681 // messages
682 void handle_get_version(MonOpRequestRef op);
683 void handle_subscribe(MonOpRequestRef op);
684 void handle_mon_get_map(MonOpRequestRef op);
685
686 static void _generate_command_map(map<string,cmd_vartype>& cmdmap,
687 map<string,string> &param_str_map);
688 static const MonCommand *_get_moncommand(const string &cmd_prefix,
689 MonCommand *cmds, int cmds_size);
690 bool _allowed_command(MonSession *s, string &module, string &prefix,
691 const map<string,cmd_vartype>& cmdmap,
692 const map<string,string>& param_str_map,
693 const MonCommand *this_cmd);
694 void get_mon_status(Formatter *f, ostream& ss);
695 void _quorum_status(Formatter *f, ostream& ss);
696 bool _add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss);
697 void handle_command(MonOpRequestRef op);
698 void handle_route(MonOpRequestRef op);
699
700 void handle_mon_metadata(MonOpRequestRef op);
701 int get_mon_metadata(int mon, Formatter *f, ostream& err);
702 int print_nodes(Formatter *f, ostream& err);
703
704 // Accumulate metadata across calls to update_mon_metadata
224ce89b 705 map<int, Metadata> mon_metadata;
7c673cae
FG
706 map<int, Metadata> pending_metadata;
707
708 /**
709 *
710 */
711 struct health_cache_t {
712 health_status_t overall;
713 string summary;
714
715 void reset() {
716 // health_status_t doesn't really have a NONE value and we're not
717 // okay with setting something else (say, HEALTH_ERR). so just
718 // leave it be.
719 summary.clear();
720 }
721 } health_status_cache;
722
723 Context *health_tick_event = nullptr;
724 Context *health_interval_event = nullptr;
725
726 void health_tick_start();
727 void health_tick_stop();
728 utime_t health_interval_calc_next_update();
729 void health_interval_start();
730 void health_interval_stop();
731 void health_events_cleanup();
732
733 void health_to_clog_update_conf(const std::set<std::string> &changed);
734
735 void do_health_to_clog_interval();
736 void do_health_to_clog(bool force = false);
737
738 /**
739 * Generate health report
740 *
741 * @param status one-line status summary
742 * @param detailbl optional bufferlist* to fill with a detailed report
743 * @returns health status
744 */
745 health_status_t get_health(list<string>& status, bufferlist *detailbl,
746 Formatter *f);
224ce89b
WB
747
748 health_status_t get_health_status(
749 bool want_detail,
750 Formatter *f,
751 std::string *plain,
752 const char *sep1 = " ",
753 const char *sep2 = "; ");
754 void log_health(
755 const health_check_map_t& updated,
756 const health_check_map_t& previous,
757 MonitorDBStore::TransactionRef t);
758
7c673cae
FG
759 void get_cluster_status(stringstream &ss, Formatter *f);
760
761 void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
762 void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version);
763
764
765 void handle_probe(MonOpRequestRef op);
766 /**
767 * Handle a Probe Operation, replying with our name, quorum and known versions.
768 *
769 * We use the MMonProbe message class for anything and everything related with
770 * Monitor probing. One of the operations relates directly with the probing
771 * itself, in which we receive a probe request and to which we reply with
772 * our name, our quorum and the known versions for each Paxos service. Thus the
773 * redundant function name. This reply will obviously be sent to the one
774 * probing/requesting these infos.
775 *
776 * @todo Add @pre and @post
777 *
778 * @param m A Probe message, with an operation of type Probe.
779 */
780 void handle_probe_probe(MonOpRequestRef op);
781 void handle_probe_reply(MonOpRequestRef op);
782
783 // request routing
784 struct RoutedRequest {
785 uint64_t tid;
786 bufferlist request_bl;
787 MonSession *session;
788 ConnectionRef con;
789 uint64_t con_features;
790 entity_inst_t client_inst;
791 MonOpRequestRef op;
792
793 RoutedRequest() : tid(0), session(NULL), con_features(0) {}
794 ~RoutedRequest() {
795 if (session)
796 session->put();
797 }
798 };
799 uint64_t routed_request_tid;
800 map<uint64_t, RoutedRequest*> routed_requests;
801
802 void forward_request_leader(MonOpRequestRef op);
803 void handle_forward(MonOpRequestRef op);
804 void try_send_message(Message *m, const entity_inst_t& to);
805 void send_reply(MonOpRequestRef op, Message *reply);
806 void no_reply(MonOpRequestRef op);
807 void resend_routed_requests();
808 void remove_session(MonSession *s);
809 void remove_all_sessions();
810 void waitlist_or_zap_client(MonOpRequestRef op);
811
812 void send_command(const entity_inst_t& inst,
813 const vector<string>& com);
814
815public:
816 struct C_Command : public C_MonOp {
817 Monitor *mon;
818 int rc;
819 string rs;
820 bufferlist rdata;
821 version_t version;
822 C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) :
823 C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){}
824 C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) :
825 C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){}
826
827 void _finish(int r) override {
828 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
829 if (r >= 0) {
830 ostringstream ss;
831 if (!op->get_req()->get_connection()) {
832 ss << "connection dropped for command ";
833 } else {
834 MonSession *s = op->get_session();
835
836 // if client drops we may not have a session to draw information from.
837 if (s) {
838 ss << "from='" << s->inst << "' "
839 << "entity='" << s->entity_name << "' ";
840 } else {
841 ss << "session dropped for command ";
842 }
843 }
844 ss << "cmd='" << m->cmd << "': finished";
845
846 mon->audit_clog->info() << ss.str();
847 mon->reply_command(op, rc, rs, rdata, version);
848 }
849 else if (r == -ECANCELED)
850 return;
851 else if (r == -EAGAIN)
852 mon->dispatch_op(op);
853 else
854 assert(0 == "bad C_Command return value");
855 }
856 };
857
858 private:
859 class C_RetryMessage : public C_MonOp {
860 Monitor *mon;
861 public:
862 C_RetryMessage(Monitor *m, MonOpRequestRef op) :
863 C_MonOp(op), mon(m) { }
864
865 void _finish(int r) override {
866 if (r == -EAGAIN || r >= 0)
867 mon->dispatch_op(op);
868 else if (r == -ECANCELED)
869 return;
870 else
871 assert(0 == "bad C_RetryMessage return value");
872 }
873 };
874
875 //ms_dispatch handles a lot of logic and we want to reuse it
876 //on forwarded messages, so we create a non-locking version for this class
877 void _ms_dispatch(Message *m);
878 bool ms_dispatch(Message *m) override {
879 lock.Lock();
880 _ms_dispatch(m);
881 lock.Unlock();
882 return true;
883 }
884 void dispatch_op(MonOpRequestRef op);
885 //mon_caps is used for un-connected messages from monitors
886 MonCap * mon_caps;
887 bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
888 bool ms_verify_authorizer(Connection *con, int peer_type,
889 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
890 bool& isvalid, CryptoKey& session_key) override;
891 bool ms_handle_reset(Connection *con) override;
892 void ms_handle_remote_reset(Connection *con) override {}
893 bool ms_handle_refused(Connection *con) override;
894
895 int write_default_keyring(bufferlist& bl);
896 void extract_save_mon_key(KeyRing& keyring);
897
224ce89b 898 void collect_metadata(Metadata *m);
7c673cae 899 void update_mon_metadata(int from, Metadata&& m);
224ce89b 900 int load_metadata();
31f18b77 901 void count_metadata(const string& field, Formatter *f);
7c673cae
FG
902
903 // features
904 static CompatSet get_initial_supported_features();
905 static CompatSet get_supported_features();
906 static CompatSet get_legacy_features();
907 /// read the ondisk features into the CompatSet pointed to by read_features
908 static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
909 void read_features();
910 void write_features(MonitorDBStore::TransactionRef t);
911
912 OpTracker op_tracker;
913
914 public:
915 Monitor(CephContext *cct_, string nm, MonitorDBStore *s,
916 Messenger *m, Messenger *mgr_m, MonMap *map);
917 ~Monitor() override;
918
919 static int check_features(MonitorDBStore *store);
920
921 // config observer
922 const char** get_tracked_conf_keys() const override;
923 void handle_conf_change(const struct md_config_t *conf,
924 const std::set<std::string> &changed) override;
925
926 void update_log_clients();
927 int sanitize_options();
928 int preinit();
929 int init();
930 void init_paxos();
931 void refresh_from_paxos(bool *need_bootstrap);
932 void shutdown();
933 void tick();
934
935 void handle_signal(int sig);
936
937 int mkfs(bufferlist& osdmapbl);
938
939 /**
940 * check cluster_fsid file
941 *
942 * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
943 */
944 int check_fsid();
945
946 /**
947 * write cluster_fsid file
948 *
949 * @return 0 on success, or negative error code
950 */
951 int write_fsid();
952 int write_fsid(MonitorDBStore::TransactionRef t);
953
954 void do_admin_command(std::string command, cmdmap_t& cmdmap,
955 std::string format, ostream& ss);
956
957private:
958 // don't allow copying
959 Monitor(const Monitor& rhs);
960 Monitor& operator=(const Monitor &rhs);
961
962public:
963 static void format_command_descriptions(const MonCommand *commands,
964 unsigned commands_size,
965 Formatter *f,
966 bufferlist *rdata,
967 bool hide_mgr_flag=false);
968 void get_locally_supported_monitor_commands(const MonCommand **cmds, int *count);
969 /// the Monitor owns this pointer once you pass it in
970 void set_leader_supported_commands(const MonCommand *cmds, int size);
971 static bool is_keyring_required();
972};
973
974#define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)")
975#define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)")
976#define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)")
977#define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
978#define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
979#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
980#define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
981#define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
982// make sure you add your feature to Monitor::get_supported_features
983
7c673cae
FG
984struct MonCommand {
985 string cmdstring;
986 string helpstring;
987 string module;
988 string req_perms;
989 string availability;
990 uint64_t flags;
991
992 // MonCommand flags
993 static const uint64_t FLAG_NONE = 0;
994 static const uint64_t FLAG_NOFORWARD = 1 << 0;
995 static const uint64_t FLAG_OBSOLETE = 1 << 1;
996 static const uint64_t FLAG_DEPRECATED = 1 << 2;
997 static const uint64_t FLAG_MGR = 1 << 3;
998
999 bool has_flag(uint64_t flag) const { return (flags & flag) != 0; }
1000 void set_flag(uint64_t flag) { flags |= flag; }
1001 void unset_flag(uint64_t flag) { flags &= ~flag; }
1002
1003 void encode(bufferlist &bl) const {
1004 /*
1005 * very naughty: deliberately unversioned because individual commands
1006 * shouldn't be encoded standalone, only as a full set (which we do
1007 * version, see encode_array() below).
1008 */
1009 ::encode(cmdstring, bl);
1010 ::encode(helpstring, bl);
1011 ::encode(module, bl);
1012 ::encode(req_perms, bl);
1013 ::encode(availability, bl);
1014 }
1015 void decode(bufferlist::iterator &bl) {
1016 ::decode(cmdstring, bl);
1017 ::decode(helpstring, bl);
1018 ::decode(module, bl);
1019 ::decode(req_perms, bl);
1020 ::decode(availability, bl);
1021 }
1022 bool is_compat(const MonCommand* o) const {
1023 return cmdstring == o->cmdstring &&
1024 module == o->module && req_perms == o->req_perms &&
1025 availability == o->availability;
1026 }
1027
1028 bool is_noforward() const {
1029 return has_flag(MonCommand::FLAG_NOFORWARD);
1030 }
1031
1032 bool is_obsolete() const {
1033 return has_flag(MonCommand::FLAG_OBSOLETE);
1034 }
1035
1036 bool is_deprecated() const {
1037 return has_flag(MonCommand::FLAG_DEPRECATED);
1038 }
1039
1040 bool is_mgr() const {
1041 return has_flag(MonCommand::FLAG_MGR);
1042 }
1043
1044 static void encode_array(const MonCommand *cmds, int size, bufferlist &bl) {
1045 ENCODE_START(2, 1, bl);
1046 uint16_t s = size;
1047 ::encode(s, bl);
1048 ::encode_array_nohead(cmds, size, bl);
1049 for (int i = 0; i < size; i++)
1050 ::encode(cmds[i].flags, bl);
1051 ENCODE_FINISH(bl);
1052 }
1053 static void decode_array(MonCommand **cmds, int *size,
1054 bufferlist::iterator &bl) {
1055 DECODE_START(2, bl);
1056 uint16_t s = 0;
1057 ::decode(s, bl);
1058 *size = s;
1059 *cmds = new MonCommand[*size];
1060 ::decode_array_nohead(*cmds, *size, bl);
1061 if (struct_v >= 2) {
1062 for (int i = 0; i < *size; i++)
1063 ::decode((*cmds)[i].flags, bl);
1064 } else {
1065 for (int i = 0; i < *size; i++)
1066 (*cmds)[i].flags = 0;
1067 }
1068 DECODE_FINISH(bl);
1069 }
1070
1071 bool requires_perm(char p) const {
1072 return (req_perms.find(p) != string::npos);
1073 }
1074};
1075WRITE_CLASS_ENCODER(MonCommand)
1076
1077#endif