]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/Monitor.h
a7a00642a27e315405545cf0f747ee61b67e8d0b
[ceph.git] / ceph / src / mon / Monitor.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 /*
16 * This is the top level monitor. It runs on each machine in the Monitor
17 * Cluster. The election of a leader for the paxos algorithm only happens
18 * once per machine via the elector. There is a separate paxos instance (state)
19 * kept for each of the system components: Object Store Device (OSD) Monitor,
20 * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
21 */
22
23 #ifndef CEPH_MONITOR_H
24 #define CEPH_MONITOR_H
25
26 #include <errno.h>
27 #include <cmath>
28
29 #include "include/types.h"
30 #include "msg/Messenger.h"
31
32 #include "common/Timer.h"
33
34 #include "MonMap.h"
35 #include "Elector.h"
36 #include "Paxos.h"
37 #include "Session.h"
38 #include "PGStatService.h"
39
40 #include "common/LogClient.h"
41 #include "auth/cephx/CephxKeyServer.h"
42 #include "auth/AuthMethodList.h"
43 #include "auth/KeyRing.h"
44 #include "messages/MMonCommand.h"
45 #include "mon/MonitorDBStore.h"
46 #include "include/memory.h"
47 #include "mgr/MgrClient.h"
48
49 #include "mon/MonOpRequest.h"
50 #include "common/WorkQueue.h"
51
52
53 #define CEPH_MON_PROTOCOL 13 /* cluster internal */
54
55
// Identifiers in the l_cluster_* range.  l_cluster_first is pinned to
// 555000; every following enumerator takes the next consecutive value, with
// l_cluster_last closing the range.
// NOTE(review): presumably the perf counter ids registered on
// Monitor::cluster_logger -- confirm against Monitor.cc.
enum {
  l_cluster_first = 555000,

  // monitors
  l_cluster_num_mon,
  l_cluster_num_mon_quorum,

  // OSDs
  l_cluster_num_osd,
  l_cluster_num_osd_up,
  l_cluster_num_osd_in,
  l_cluster_osd_epoch,
  l_cluster_osd_bytes,
  l_cluster_osd_bytes_used,
  l_cluster_osd_bytes_avail,

  // pools and placement groups
  l_cluster_num_pool,
  l_cluster_num_pg,
  l_cluster_num_pg_active_clean,
  l_cluster_num_pg_active,
  l_cluster_num_pg_peering,

  // objects
  l_cluster_num_object,
  l_cluster_num_object_degraded,
  l_cluster_num_object_misplaced,
  l_cluster_num_object_unfound,
  l_cluster_num_bytes,

  // MDS
  l_cluster_num_mds_up,
  l_cluster_num_mds_in,
  l_cluster_num_mds_failed,
  l_cluster_mds_epoch,

  l_cluster_last,
};
83
// Identifiers in the l_mon_* range.  l_mon_first is pinned to 456000; the
// remaining enumerators are numbered consecutively up to l_mon_last.
// NOTE(review): presumably the perf counter ids registered on
// Monitor::logger -- confirm against Monitor.cc.
enum {
  l_mon_first = 456000,

  // sessions
  l_mon_num_sessions,
  l_mon_session_add,
  l_mon_session_rm,
  l_mon_session_trim,

  // elections
  l_mon_num_elections,
  l_mon_election_call,
  l_mon_election_win,
  l_mon_election_lose,

  l_mon_last,
};
96
97 class QuorumService;
98 class PaxosService;
99
100 class PerfCounters;
101 class AdminSocketHook;
102
103 class MMonGetMap;
104 class MMonGetVersion;
105 class MMonMetadata;
106 class MMonSync;
107 class MMonScrub;
108 class MMonProbe;
109 struct MMonSubscribe;
110 struct MRoute;
111 struct MForward;
112 struct MTimeCheck;
113 struct MMonHealth;
114 struct MonCommand;
115
116 #define COMPAT_SET_LOC "feature_set"
117
118 class C_MonContext final : public FunctionContext {
119 const Monitor *mon;
120 public:
121 explicit C_MonContext(Monitor *m, boost::function<void(int)>&& callback)
122 : FunctionContext(std::move(callback)), mon(m) {}
123 void finish(int r) override;
124 };
125
126 class Monitor : public Dispatcher,
127 public md_config_obs_t {
128 public:
129 // me
130 string name;
131 int rank;
132 Messenger *messenger;
133 ConnectionRef con_self;
134 Mutex lock;
135 SafeTimer timer;
136 Finisher finisher;
137 ThreadPool cpu_tp; ///< threadpool for CPU intensive work
138
139 /// true if we have ever joined a quorum. if false, we are either a
140 /// new cluster, a newly joining monitor, or a just-upgraded
141 /// monitor.
142 bool has_ever_joined;
143
144 PerfCounters *logger, *cluster_logger;
145 bool cluster_logger_registered;
146
147 void register_cluster_logger();
148 void unregister_cluster_logger();
149
150 MonMap *monmap;
151 uuid_d fingerprint;
152
153 set<entity_addr_t> extra_probe_peers;
154
155 LogClient log_client;
156 LogChannelRef clog;
157 LogChannelRef audit_clog;
158 KeyRing keyring;
159 KeyServer key_server;
160
161 AuthMethodList auth_cluster_required;
162 AuthMethodList auth_service_required;
163
164 CompatSet features;
165
166 const MonCommand *leader_supported_mon_commands;
167 int leader_supported_mon_commands_size;
168
169 Messenger *mgr_messenger;
170 MgrClient mgr_client;
171 uint64_t mgr_proxy_bytes = 0; // in-flight proxied mgr command message bytes
172
173 const MonPGStatService *pgservice;
174
175 private:
176 void new_tick();
177
178 // -- local storage --
179 public:
180 MonitorDBStore *store;
181 static const string MONITOR_NAME;
182 static const string MONITOR_STORE_PREFIX;
183
184 // -- monitor state --
185 private:
/// Monitor states.  Values start at 1 and increase by one per entry.
enum {
  STATE_PROBING       = 1,
  STATE_SYNCHRONIZING = 2,
  STATE_ELECTING      = 3,
  STATE_LEADER        = 4,
  STATE_PEON          = 5,
  STATE_SHUTDOWN      = 6
};
/// current state; compared against the STATE_* values by the is_*() helpers
int state;
195
196 public:
197 static const char *get_state_name(int s) {
198 switch (s) {
199 case STATE_PROBING: return "probing";
200 case STATE_SYNCHRONIZING: return "synchronizing";
201 case STATE_ELECTING: return "electing";
202 case STATE_LEADER: return "leader";
203 case STATE_PEON: return "peon";
204 case STATE_SHUTDOWN: return "shutdown";
205 default: return "???";
206 }
207 }
208 const char *get_state_name() const {
209 return get_state_name(state);
210 }
211
212 bool is_shutdown() const { return state == STATE_SHUTDOWN; }
213 bool is_probing() const { return state == STATE_PROBING; }
214 bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
215 bool is_electing() const { return state == STATE_ELECTING; }
216 bool is_leader() const { return state == STATE_LEADER; }
217 bool is_peon() const { return state == STATE_PEON; }
218
219 const utime_t &get_leader_since() const;
220
221 void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
222
223 // -- elector --
224 private:
225 Paxos *paxos;
226 Elector elector;
227 friend class Elector;
228
229 /// features we require of peers (based on on-disk compatset)
230 uint64_t required_features;
231
232 int leader; // current leader (to best of knowledge)
233 set<int> quorum; // current active set of monitors (if !starting)
234 utime_t leader_since; // when this monitor became the leader, if it is the leader
235 utime_t exited_quorum; // time detected as not in quorum; 0 if in
236
237 // map of counts of connected clients, by type and features, for
238 // each quorum mon
239 map<int,FeatureMap> quorum_feature_map;
240
241 /**
242 * Intersection of quorum member's connection feature bits.
243 */
244 uint64_t quorum_con_features;
245 /**
246 * Intersection of quorum members mon-specific feature bits
247 */
248 mon_feature_t quorum_mon_features;
249 bufferlist supported_commands_bl; // encoded MonCommands we support
250
251 set<string> outside_quorum;
252
253 /**
254 * @defgroup Monitor_h_scrub
255 * @{
256 */
257 version_t scrub_version; ///< paxos version we are scrubbing
258 map<int,ScrubResult> scrub_result; ///< results so far
259
260 /**
261 * trigger a cross-mon scrub
262 *
263 * Verify all mons are storing identical content
264 */
265 int scrub_start();
266 int scrub();
267 void handle_scrub(MonOpRequestRef op);
268 bool _scrub(ScrubResult *r,
269 pair<string,string> *start,
270 int *num_keys);
271 void scrub_check_results();
272 void scrub_timeout();
273 void scrub_finish();
274 void scrub_reset();
275 void scrub_update_interval(int secs);
276
277 Context *scrub_event; ///< periodic event to trigger scrub (leader)
278 Context *scrub_timeout_event; ///< scrub round timeout (leader)
279 void scrub_event_start();
280 void scrub_event_cancel();
281 void scrub_reset_timeout();
282 void scrub_cancel_timeout();
283
284 struct ScrubState {
285 pair<string,string> last_key; ///< last scrubbed key
286 bool finished;
287
288 ScrubState() : finished(false) { }
289 virtual ~ScrubState() { }
290 };
291 ceph::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
292
293 /**
294 * @defgroup Monitor_h_sync Synchronization
295 * @{
296 */
297 /**
298 * @} // provider state
299 */
300 struct SyncProvider {
301 entity_inst_t entity; ///< who
302 uint64_t cookie; ///< unique cookie for this sync attempt
303 utime_t timeout; ///< when we give up and expire this attempt
304 version_t last_committed; ///< last paxos version on peer
305 pair<string,string> last_key; ///< last key sent to (or on) peer
306 bool full; ///< full scan?
307 MonitorDBStore::Synchronizer synchronizer; ///< iterator
308
309 SyncProvider() : cookie(0), last_committed(0), full(false) {}
310
311 void reset_timeout(CephContext *cct, int grace) {
312 timeout = ceph_clock_now();
313 timeout += grace;
314 }
315 };
316
317 map<uint64_t, SyncProvider> sync_providers; ///< cookie -> SyncProvider for those syncing from us
318 uint64_t sync_provider_count; ///< counter for issued cookies to keep them unique
319
320 /**
321 * @} // requester state
322 */
323 entity_inst_t sync_provider; ///< who we are syncing from
324 uint64_t sync_cookie; ///< 0 if we are starting, non-zero otherwise
325 bool sync_full; ///< true if we are a full sync, false for recent catch-up
326 version_t sync_start_version; ///< last_committed at sync start
327 Context *sync_timeout_event; ///< timeout event
328
329 /**
330 * floor for sync source
331 *
332 * When we sync we forget about our old last_committed value which
333 * can be dangerous. For example, if we have a cluster of:
334 *
335 * mon.a: lc 100
336 * mon.b: lc 80
337 * mon.c: lc 100 (us)
338 *
339 * If something forces us to sync (say, corruption, or manual
340 * intervention, or bug), we forget last_committed, and might abort.
341 * If mon.a happens to be down when we come back, we will see:
342 *
343 * mon.b: lc 80
344 * mon.c: lc 0 (us)
345 *
346 * and sync from mon.b, at which point a+b will both have lc 80 and
347 * come online with a majority holding out of date commits.
348 *
349 * Avoid this by preserving our old last_committed value prior to
350 * sync and never going backwards.
351 */
352 version_t sync_last_committed_floor;
353
354 /**
355 * Obtain the synchronization target prefixes in set form.
356 *
357 * We consider a target prefix all those that are relevant when
358 * synchronizing two stores. That is, all those that hold paxos service's
359 * versions, as well as paxos versions, or any control keys such as the
360 * first or last committed version.
361 *
362 * Given the current design, this function should return the name of all and
363 * any available paxos service, plus the paxos name.
364 *
365 * @returns a set of strings referring to the prefixes being synchronized
366 */
367 set<string> get_sync_targets_names();
368
369 /**
370 * Reset the monitor's sync-related data structures for syncing *from* a peer
371 */
372 void sync_reset_requester();
373
374 /**
375 * Reset sync state related to allowing others to sync from us
376 */
377 void sync_reset_provider();
378
379 /**
 * Called when a sync attempt times out (requester-side)
381 */
382 void sync_timeout();
383
384 /**
385 * Get the latest monmap for backup purposes during sync
386 */
387 void sync_obtain_latest_monmap(bufferlist &bl);
388
389 /**
390 * Start sync process
391 *
392 * Start pulling committed state from another monitor.
393 *
394 * @param entity where to pull committed state from
395 * @param full whether to do a full sync or just catch up on recent paxos
396 */
397 void sync_start(entity_inst_t &entity, bool full);
398
399 public:
400 /**
401 * force a sync on next mon restart
402 */
403 void sync_force(Formatter *f, ostream& ss);
404
405 private:
406 /**
407 * store critical state for safekeeping during sync
408 *
409 * We store a few things on the side that we don't want to get clobbered by sync. This
410 * includes the latest monmap and a lower bound on last_committed.
411 */
412 void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
413
414 /**
415 * reset the sync timeout
416 *
417 * This is used on the client to restart if things aren't progressing
418 */
419 void sync_reset_timeout();
420
421 /**
422 * trim stale sync provider state
423 *
424 * If someone is syncing from us and hasn't talked to us recently, expire their state.
425 */
426 void sync_trim_providers();
427
428 /**
429 * Complete a sync
430 *
431 * Finish up a sync after we've gotten all of the chunks.
432 *
433 * @param last_committed final last_committed value from provider
434 */
435 void sync_finish(version_t last_committed);
436
437 /**
438 * request the next chunk from the provider
439 */
440 void sync_get_next_chunk();
441
442 /**
443 * handle sync message
444 *
445 * @param m Sync message with operation type MMonSync::OP_START_CHUNKS
446 */
447 void handle_sync(MonOpRequestRef op);
448
449 void _sync_reply_no_cookie(MonOpRequestRef op);
450
451 void handle_sync_get_cookie(MonOpRequestRef op);
452 void handle_sync_get_chunk(MonOpRequestRef op);
453 void handle_sync_finish(MonOpRequestRef op);
454
455 void handle_sync_cookie(MonOpRequestRef op);
456 void handle_sync_forward(MonOpRequestRef op);
457 void handle_sync_chunk(MonOpRequestRef op);
458 void handle_sync_no_cookie(MonOpRequestRef op);
459
460 /**
461 * @} // Synchronization
462 */
463
464 list<Context*> waitfor_quorum;
465 list<Context*> maybe_wait_for_quorum;
466
467 /**
468 * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
469 * @{
470 *
 * We use time checks to keep track of any clock drift going on in the
 * cluster. This is accomplished by periodically pinging each monitor in the
 * quorum and registering its response time in a map, assessing how much its
 * clock has drifted. We also take this opportunity to assess the latency
 * of the response.
476 *
477 * This mechanism works as follows:
478 *
479 * - Leader sends out a 'PING' message to each other monitor in the quorum.
480 * The message is timestamped with the leader's current time. The leader's
481 * current time is recorded in a map, associated with each peon's
482 * instance.
483 * - The peon replies to the leader with a timestamped 'PONG' message.
484 * - The leader calculates a delta between the peon's timestamp and its
485 * current time and stashes it.
486 * - The leader also calculates the time it took to receive the 'PONG'
487 * since the 'PING' was sent, and stashes an approximate latency estimate.
488 * - Once all the quorum members have pong'ed, the leader will share the
489 * clock skew and latency maps with all the monitors in the quorum.
490 */
491 map<entity_inst_t, utime_t> timecheck_waiting;
492 map<entity_inst_t, double> timecheck_skews;
493 map<entity_inst_t, double> timecheck_latencies;
494 // odd value means we are mid-round; even value means the round has
495 // finished.
496 version_t timecheck_round;
497 unsigned int timecheck_acks;
498 utime_t timecheck_round_start;
499 /* When we hit a skew we will start a new round based off of
500 * 'mon_timecheck_skew_interval'. Each new round will be backed off
501 * until we hit 'mon_timecheck_interval' -- which is the typical
502 * interval when not in the presence of a skew.
503 *
504 * This variable tracks the number of rounds with skews since last clean
505 * so that we can report to the user and properly adjust the backoff.
506 */
507 uint64_t timecheck_rounds_since_clean;
508 /**
509 * Time Check event.
510 */
511 Context *timecheck_event;
512
513 void timecheck_start();
514 void timecheck_finish();
515 void timecheck_start_round();
516 void timecheck_finish_round(bool success = true);
517 void timecheck_cancel_round();
518 void timecheck_cleanup();
519 void timecheck_reset_event();
520 void timecheck_check_skews();
521 void timecheck_report();
522 void timecheck();
523 health_status_t timecheck_status(ostringstream &ss,
524 const double skew_bound,
525 const double latency);
526 void handle_timecheck_leader(MonOpRequestRef op);
527 void handle_timecheck_peon(MonOpRequestRef op);
528 void handle_timecheck(MonOpRequestRef op);
529
530 /**
531 * Returns 'true' if this is considered to be a skew; 'false' otherwise.
532 */
533 bool timecheck_has_skew(const double skew_bound, double *abs) const {
534 double abs_skew = std::fabs(skew_bound);
535 if (abs)
536 *abs = abs_skew;
537 return (abs_skew > g_conf->mon_clock_drift_allowed);
538 }
539
540 /**
541 * @}
542 */
543 /**
544 * Handle ping messages from others.
545 */
546 void handle_ping(MonOpRequestRef op);
547
548 Context *probe_timeout_event = nullptr; // for probing
549
550 void reset_probe_timeout();
551 void cancel_probe_timeout();
552 void probe_timeout(int r);
553
554 void _apply_compatset_features(CompatSet &new_features);
555
556 public:
557 epoch_t get_epoch();
558 int get_leader() const { return leader; }
559 const set<int>& get_quorum() const { return quorum; }
560 list<string> get_quorum_names() {
561 list<string> q;
562 for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
563 q.push_back(monmap->get_name(*p));
564 return q;
565 }
566 uint64_t get_quorum_con_features() const {
567 return quorum_con_features;
568 }
569 mon_feature_t get_quorum_mon_features() const {
570 return quorum_mon_features;
571 }
572 uint64_t get_required_features() const {
573 return required_features;
574 }
575 mon_feature_t get_required_mon_features() const {
576 return monmap->get_required_features();
577 }
578 void apply_quorum_to_compatset_features();
579 void apply_monmap_to_compatset_features();
580 void calc_quorum_requirements();
581
582 void get_combined_feature_map(FeatureMap *fm);
583
584 private:
585 void _reset(); ///< called from bootstrap, start_, or join_election
586 void wait_for_paxos_write();
587 void _finish_svc_election(); ///< called by {win,lose}_election
588 public:
589 void bootstrap();
590 void join_election();
591 void start_election();
592 void win_standalone_election();
593 // end election (called by Elector)
594 void win_election(epoch_t epoch, set<int>& q,
595 uint64_t features,
596 const mon_feature_t& mon_features,
597 const MonCommand *cmdset, int cmdsize);
598 void lose_election(epoch_t epoch, set<int>& q, int l,
599 uint64_t features,
600 const mon_feature_t& mon_features);
601 // end election (called by Elector)
602 void finish_election();
603
604 const bufferlist& get_supported_commands_bl() {
605 return supported_commands_bl;
606 }
607
608 void update_logger();
609
610 /**
611 * Vector holding the Services serviced by this Monitor.
612 */
613 vector<PaxosService*> paxos_service;
614
615 PaxosService *get_paxos_service_by_name(const string& name);
616
617 class PGMonitor *pgmon() {
618 return (class PGMonitor *)paxos_service[PAXOS_PGMAP];
619 }
620
621 class MDSMonitor *mdsmon() {
622 return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP];
623 }
624
625 class MonmapMonitor *monmon() {
626 return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP];
627 }
628
629 class OSDMonitor *osdmon() {
630 return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP];
631 }
632
633 class AuthMonitor *authmon() {
634 return (class AuthMonitor *)paxos_service[PAXOS_AUTH];
635 }
636
637 class LogMonitor *logmon() {
638 return (class LogMonitor*) paxos_service[PAXOS_LOG];
639 }
640
641 class MgrMonitor *mgrmon() {
642 return (class MgrMonitor*) paxos_service[PAXOS_MGR];
643 }
644
645 class MgrStatMonitor *mgrstatmon() {
646 return (class MgrStatMonitor*) paxos_service[PAXOS_MGRSTAT];
647 }
648
649 friend class Paxos;
650 friend class OSDMonitor;
651 friend class MDSMonitor;
652 friend class MonmapMonitor;
653 friend class PGMonitor;
654 friend class LogMonitor;
655 friend class ConfigKeyService;
656
657 QuorumService *health_monitor;
658 QuorumService *config_key_service;
659
660 // -- sessions --
661 MonSessionMap session_map;
662 Mutex session_map_lock{"Monitor::session_map_lock"};
663 AdminSocketHook *admin_hook;
664
665 template<typename Func, typename...Args>
666 void with_session_map(Func&& func) {
667 Mutex::Locker l(session_map_lock);
668 std::forward<Func>(func)(session_map);
669 }
670 void send_latest_monmap(Connection *con);
671
672 // messages
673 void handle_get_version(MonOpRequestRef op);
674 void handle_subscribe(MonOpRequestRef op);
675 void handle_mon_get_map(MonOpRequestRef op);
676
677 static void _generate_command_map(map<string,cmd_vartype>& cmdmap,
678 map<string,string> &param_str_map);
679 static const MonCommand *_get_moncommand(const string &cmd_prefix,
680 MonCommand *cmds, int cmds_size);
681 bool _allowed_command(MonSession *s, string &module, string &prefix,
682 const map<string,cmd_vartype>& cmdmap,
683 const map<string,string>& param_str_map,
684 const MonCommand *this_cmd);
685 void get_mon_status(Formatter *f, ostream& ss);
686 void _quorum_status(Formatter *f, ostream& ss);
687 bool _add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss);
688 void handle_command(MonOpRequestRef op);
689 void handle_route(MonOpRequestRef op);
690
691 void handle_mon_metadata(MonOpRequestRef op);
692 int get_mon_metadata(int mon, Formatter *f, ostream& err);
693 int print_nodes(Formatter *f, ostream& err);
694
695 // Accumulate metadata across calls to update_mon_metadata
696 map<int, Metadata> pending_metadata;
697
698 /**
699 *
700 */
701 struct health_cache_t {
702 health_status_t overall;
703 string summary;
704
705 void reset() {
706 // health_status_t doesn't really have a NONE value and we're not
707 // okay with setting something else (say, HEALTH_ERR). so just
708 // leave it be.
709 summary.clear();
710 }
711 } health_status_cache;
712
713 Context *health_tick_event = nullptr;
714 Context *health_interval_event = nullptr;
715
716 void health_tick_start();
717 void health_tick_stop();
718 utime_t health_interval_calc_next_update();
719 void health_interval_start();
720 void health_interval_stop();
721 void health_events_cleanup();
722
723 void health_to_clog_update_conf(const std::set<std::string> &changed);
724
725 void do_health_to_clog_interval();
726 void do_health_to_clog(bool force = false);
727
728 /**
729 * Generate health report
730 *
731 * @param status one-line status summary
732 * @param detailbl optional bufferlist* to fill with a detailed report
733 * @returns health status
734 */
735 health_status_t get_health(list<string>& status, bufferlist *detailbl,
736 Formatter *f);
737 void get_cluster_status(stringstream &ss, Formatter *f);
738
739 void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
740 void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version);
741
742
743 void handle_probe(MonOpRequestRef op);
744 /**
745 * Handle a Probe Operation, replying with our name, quorum and known versions.
746 *
747 * We use the MMonProbe message class for anything and everything related with
748 * Monitor probing. One of the operations relates directly with the probing
749 * itself, in which we receive a probe request and to which we reply with
750 * our name, our quorum and the known versions for each Paxos service. Thus the
751 * redundant function name. This reply will obviously be sent to the one
752 * probing/requesting these infos.
753 *
754 * @todo Add @pre and @post
755 *
756 * @param m A Probe message, with an operation of type Probe.
757 */
758 void handle_probe_probe(MonOpRequestRef op);
759 void handle_probe_reply(MonOpRequestRef op);
760
761 // request routing
762 struct RoutedRequest {
763 uint64_t tid;
764 bufferlist request_bl;
765 MonSession *session;
766 ConnectionRef con;
767 uint64_t con_features;
768 entity_inst_t client_inst;
769 MonOpRequestRef op;
770
771 RoutedRequest() : tid(0), session(NULL), con_features(0) {}
772 ~RoutedRequest() {
773 if (session)
774 session->put();
775 }
776 };
777 uint64_t routed_request_tid;
778 map<uint64_t, RoutedRequest*> routed_requests;
779
780 void forward_request_leader(MonOpRequestRef op);
781 void handle_forward(MonOpRequestRef op);
782 void try_send_message(Message *m, const entity_inst_t& to);
783 void send_reply(MonOpRequestRef op, Message *reply);
784 void no_reply(MonOpRequestRef op);
785 void resend_routed_requests();
786 void remove_session(MonSession *s);
787 void remove_all_sessions();
788 void waitlist_or_zap_client(MonOpRequestRef op);
789
790 void send_command(const entity_inst_t& inst,
791 const vector<string>& com);
792
793 public:
794 struct C_Command : public C_MonOp {
795 Monitor *mon;
796 int rc;
797 string rs;
798 bufferlist rdata;
799 version_t version;
800 C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) :
801 C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){}
802 C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) :
803 C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){}
804
805 void _finish(int r) override {
806 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
807 if (r >= 0) {
808 ostringstream ss;
809 if (!op->get_req()->get_connection()) {
810 ss << "connection dropped for command ";
811 } else {
812 MonSession *s = op->get_session();
813
814 // if client drops we may not have a session to draw information from.
815 if (s) {
816 ss << "from='" << s->inst << "' "
817 << "entity='" << s->entity_name << "' ";
818 } else {
819 ss << "session dropped for command ";
820 }
821 }
822 ss << "cmd='" << m->cmd << "': finished";
823
824 mon->audit_clog->info() << ss.str();
825 mon->reply_command(op, rc, rs, rdata, version);
826 }
827 else if (r == -ECANCELED)
828 return;
829 else if (r == -EAGAIN)
830 mon->dispatch_op(op);
831 else
832 assert(0 == "bad C_Command return value");
833 }
834 };
835
836 private:
837 class C_RetryMessage : public C_MonOp {
838 Monitor *mon;
839 public:
840 C_RetryMessage(Monitor *m, MonOpRequestRef op) :
841 C_MonOp(op), mon(m) { }
842
843 void _finish(int r) override {
844 if (r == -EAGAIN || r >= 0)
845 mon->dispatch_op(op);
846 else if (r == -ECANCELED)
847 return;
848 else
849 assert(0 == "bad C_RetryMessage return value");
850 }
851 };
852
853 //ms_dispatch handles a lot of logic and we want to reuse it
854 //on forwarded messages, so we create a non-locking version for this class
855 void _ms_dispatch(Message *m);
856 bool ms_dispatch(Message *m) override {
857 lock.Lock();
858 _ms_dispatch(m);
859 lock.Unlock();
860 return true;
861 }
862 void dispatch_op(MonOpRequestRef op);
863 //mon_caps is used for un-connected messages from monitors
864 MonCap * mon_caps;
865 bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
866 bool ms_verify_authorizer(Connection *con, int peer_type,
867 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
868 bool& isvalid, CryptoKey& session_key) override;
869 bool ms_handle_reset(Connection *con) override;
870 void ms_handle_remote_reset(Connection *con) override {}
871 bool ms_handle_refused(Connection *con) override;
872
873 int write_default_keyring(bufferlist& bl);
874 void extract_save_mon_key(KeyRing& keyring);
875
876 void update_mon_metadata(int from, Metadata&& m);
877 int load_metadata(map<int, Metadata>& m);
878 void count_metadata(const string& field, Formatter *f);
879
880 // features
881 static CompatSet get_initial_supported_features();
882 static CompatSet get_supported_features();
883 static CompatSet get_legacy_features();
884 /// read the ondisk features into the CompatSet pointed to by read_features
885 static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
886 void read_features();
887 void write_features(MonitorDBStore::TransactionRef t);
888
889 OpTracker op_tracker;
890
891 public:
892 Monitor(CephContext *cct_, string nm, MonitorDBStore *s,
893 Messenger *m, Messenger *mgr_m, MonMap *map);
894 ~Monitor() override;
895
896 static int check_features(MonitorDBStore *store);
897
898 // config observer
899 const char** get_tracked_conf_keys() const override;
900 void handle_conf_change(const struct md_config_t *conf,
901 const std::set<std::string> &changed) override;
902
903 void update_log_clients();
904 int sanitize_options();
905 int preinit();
906 int init();
907 void init_paxos();
908 void refresh_from_paxos(bool *need_bootstrap);
909 void shutdown();
910 void tick();
911
912 void handle_signal(int sig);
913
914 int mkfs(bufferlist& osdmapbl);
915
916 /**
917 * check cluster_fsid file
918 *
919 * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
920 */
921 int check_fsid();
922
923 /**
924 * write cluster_fsid file
925 *
926 * @return 0 on success, or negative error code
927 */
928 int write_fsid();
929 int write_fsid(MonitorDBStore::TransactionRef t);
930
931 void do_admin_command(std::string command, cmdmap_t& cmdmap,
932 std::string format, ostream& ss);
933
934 private:
935 // don't allow copying
936 Monitor(const Monitor& rhs);
937 Monitor& operator=(const Monitor &rhs);
938
939 public:
940 static void format_command_descriptions(const MonCommand *commands,
941 unsigned commands_size,
942 Formatter *f,
943 bufferlist *rdata,
944 bool hide_mgr_flag=false);
945 void get_locally_supported_monitor_commands(const MonCommand **cmds, int *count);
946 /// the Monitor owns this pointer once you pass it in
947 void set_leader_supported_commands(const MonCommand *cmds, int size);
948 static bool is_keyring_required();
949 };
950
951 #define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)")
952 #define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)")
953 #define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)")
954 #define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
955 #define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
956 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
957 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
958 #define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
959 // make sure you add your feature to Monitor::get_supported_features
960
961 struct MonCommand {
962 string cmdstring;
963 string helpstring;
964 string module;
965 string req_perms;
966 string availability;
967 uint64_t flags;
968
969 // MonCommand flags
970 static const uint64_t FLAG_NONE = 0;
971 static const uint64_t FLAG_NOFORWARD = 1 << 0;
972 static const uint64_t FLAG_OBSOLETE = 1 << 1;
973 static const uint64_t FLAG_DEPRECATED = 1 << 2;
974 static const uint64_t FLAG_MGR = 1 << 3;
975
976 bool has_flag(uint64_t flag) const { return (flags & flag) != 0; }
977 void set_flag(uint64_t flag) { flags |= flag; }
978 void unset_flag(uint64_t flag) { flags &= ~flag; }
979
980 void encode(bufferlist &bl) const {
981 /*
982 * very naughty: deliberately unversioned because individual commands
983 * shouldn't be encoded standalone, only as a full set (which we do
984 * version, see encode_array() below).
985 */
986 ::encode(cmdstring, bl);
987 ::encode(helpstring, bl);
988 ::encode(module, bl);
989 ::encode(req_perms, bl);
990 ::encode(availability, bl);
991 }
992 void decode(bufferlist::iterator &bl) {
993 ::decode(cmdstring, bl);
994 ::decode(helpstring, bl);
995 ::decode(module, bl);
996 ::decode(req_perms, bl);
997 ::decode(availability, bl);
998 }
999 bool is_compat(const MonCommand* o) const {
1000 return cmdstring == o->cmdstring &&
1001 module == o->module && req_perms == o->req_perms &&
1002 availability == o->availability;
1003 }
1004
1005 bool is_noforward() const {
1006 return has_flag(MonCommand::FLAG_NOFORWARD);
1007 }
1008
1009 bool is_obsolete() const {
1010 return has_flag(MonCommand::FLAG_OBSOLETE);
1011 }
1012
1013 bool is_deprecated() const {
1014 return has_flag(MonCommand::FLAG_DEPRECATED);
1015 }
1016
1017 bool is_mgr() const {
1018 return has_flag(MonCommand::FLAG_MGR);
1019 }
1020
1021 static void encode_array(const MonCommand *cmds, int size, bufferlist &bl) {
1022 ENCODE_START(2, 1, bl);
1023 uint16_t s = size;
1024 ::encode(s, bl);
1025 ::encode_array_nohead(cmds, size, bl);
1026 for (int i = 0; i < size; i++)
1027 ::encode(cmds[i].flags, bl);
1028 ENCODE_FINISH(bl);
1029 }
1030 static void decode_array(MonCommand **cmds, int *size,
1031 bufferlist::iterator &bl) {
1032 DECODE_START(2, bl);
1033 uint16_t s = 0;
1034 ::decode(s, bl);
1035 *size = s;
1036 *cmds = new MonCommand[*size];
1037 ::decode_array_nohead(*cmds, *size, bl);
1038 if (struct_v >= 2) {
1039 for (int i = 0; i < *size; i++)
1040 ::decode((*cmds)[i].flags, bl);
1041 } else {
1042 for (int i = 0; i < *size; i++)
1043 (*cmds)[i].flags = 0;
1044 }
1045 DECODE_FINISH(bl);
1046 }
1047
1048 bool requires_perm(char p) const {
1049 return (req_perms.find(p) != string::npos);
1050 }
1051 };
1052 WRITE_CLASS_ENCODER(MonCommand)
1053
1054 #endif