]> git.proxmox.com Git - ceph.git/blob - ceph/src/mon/Monitor.h
add subtree-ish sources for 12.0.3
[ceph.git] / ceph / src / mon / Monitor.h
1 // -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
2 // vim: ts=8 sw=2 smarttab
3 /*
4 * Ceph - scalable distributed file system
5 *
6 * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
7 *
8 * This is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License version 2.1, as published by the Free Software
11 * Foundation. See file COPYING.
12 *
13 */
14
15 /*
16 * This is the top level monitor. It runs on each machine in the Monitor
17 * Cluster. The election of a leader for the paxos algorithm only happens
18 * once per machine via the elector. There is a separate paxos instance (state)
19 * kept for each of the system components: Object Store Device (OSD) Monitor,
20 * Placement Group (PG) Monitor, Metadata Server (MDS) Monitor, and Client Monitor.
21 */
22
23 #ifndef CEPH_MONITOR_H
24 #define CEPH_MONITOR_H
25
#include <errno.h>
#include <cmath>
#include <memory>
28
29 #include "include/types.h"
30 #include "msg/Messenger.h"
31
32 #include "common/Timer.h"
33
34 #include "MonMap.h"
35 #include "Elector.h"
36 #include "Paxos.h"
37 #include "Session.h"
38
39 #include "common/LogClient.h"
40 #include "auth/cephx/CephxKeyServer.h"
41 #include "auth/AuthMethodList.h"
42 #include "auth/KeyRing.h"
43 #include "messages/MMonCommand.h"
44 #include "mon/MonitorDBStore.h"
45 #include "include/memory.h"
46 #include "mgr/MgrClient.h"
47
48 #include "mon/MonOpRequest.h"
49 #include "common/WorkQueue.h"
50
51
52 #define CEPH_MON_PROTOCOL 13 /* cluster internal */
53
54
// Identifiers for the cluster-wide counters.  l_cluster_first (555000) and
// l_cluster_last bracket the range; every entry in between implicitly takes
// the next consecutive value.
// NOTE(review): presumably these index the `cluster_logger` PerfCounters
// instance declared in Monitor -- confirm against the implementation file.
enum {
  l_cluster_first = 555000,
  l_cluster_num_mon,
  l_cluster_num_mon_quorum,
  l_cluster_num_osd,
  l_cluster_num_osd_up,
  l_cluster_num_osd_in,
  l_cluster_osd_epoch,
  l_cluster_osd_bytes,
  l_cluster_osd_bytes_used,
  l_cluster_osd_bytes_avail,
  l_cluster_num_pool,
  l_cluster_num_pg,
  l_cluster_num_pg_active_clean,
  l_cluster_num_pg_active,
  l_cluster_num_pg_peering,
  l_cluster_num_object,
  l_cluster_num_object_degraded,
  l_cluster_num_object_misplaced,
  l_cluster_num_object_unfound,
  l_cluster_num_bytes,
  l_cluster_num_mds_up,
  l_cluster_num_mds_in,
  l_cluster_num_mds_failed,
  l_cluster_mds_epoch,
  l_cluster_last,
};
82
// Identifiers for the per-monitor counters.  l_mon_first (456000) and
// l_mon_last bracket the range; entries in between take consecutive values.
// NOTE(review): presumably these index the `logger` PerfCounters instance
// declared in Monitor -- confirm against the implementation file.
enum {
  l_mon_first = 456000,
  l_mon_num_sessions,
  l_mon_session_add,
  l_mon_session_rm,
  l_mon_session_trim,
  l_mon_num_elections,
  l_mon_election_call,
  l_mon_election_win,
  l_mon_election_lose,
  l_mon_last,
};
95
96 class QuorumService;
97 class PaxosService;
98
99 class PerfCounters;
100 class AdminSocketHook;
101
102 class MMonGetMap;
103 class MMonGetVersion;
104 class MMonMetadata;
105 class MMonSync;
106 class MMonScrub;
107 class MMonProbe;
108 struct MMonSubscribe;
109 struct MRoute;
110 struct MForward;
111 struct MTimeCheck;
112 struct MMonHealth;
113 struct MonCommand;
114
115 #define COMPAT_SET_LOC "feature_set"
116
/**
 * FunctionContext that additionally remembers the Monitor it was created for.
 *
 * The callback is handed straight to the FunctionContext base; finish() is
 * overridden here and defined out of line.
 */
class C_MonContext final : public FunctionContext {
  const Monitor *mon;  ///< monitor this context belongs to (held as pointer-to-const)
public:
  explicit C_MonContext(Monitor *m, boost::function<void(int)>&& callback)
    : FunctionContext(std::move(callback)), mon(m) {}
  void finish(int r) override;
};
124
125 class Monitor : public Dispatcher,
126 public md_config_obs_t {
127 public:
128 // me
129 string name;
130 int rank;
131 Messenger *messenger;
132 ConnectionRef con_self;
133 Mutex lock;
134 SafeTimer timer;
135 Finisher finisher;
136 ThreadPool cpu_tp; ///< threadpool for CPU intensive work
137
138 /// true if we have ever joined a quorum. if false, we are either a
139 /// new cluster, a newly joining monitor, or a just-upgraded
140 /// monitor.
141 bool has_ever_joined;
142
143 PerfCounters *logger, *cluster_logger;
144 bool cluster_logger_registered;
145
146 void register_cluster_logger();
147 void unregister_cluster_logger();
148
149 MonMap *monmap;
150 uuid_d fingerprint;
151
152 set<entity_addr_t> extra_probe_peers;
153
154 LogClient log_client;
155 LogChannelRef clog;
156 LogChannelRef audit_clog;
157 KeyRing keyring;
158 KeyServer key_server;
159
160 AuthMethodList auth_cluster_required;
161 AuthMethodList auth_service_required;
162
163 CompatSet features;
164
165 const MonCommand *leader_supported_mon_commands;
166 int leader_supported_mon_commands_size;
167
168 Messenger *mgr_messenger;
169 MgrClient mgr_client;
170 uint64_t mgr_proxy_bytes = 0; // in-flight proxied mgr command message bytes
171
172 private:
173 void new_tick();
174
175 // -- local storage --
176 public:
177 MonitorDBStore *store;
178 static const string MONITOR_NAME;
179 static const string MONITOR_STORE_PREFIX;
180
181 // -- monitor state --
182 private:
183 enum {
184 STATE_PROBING = 1,
185 STATE_SYNCHRONIZING,
186 STATE_ELECTING,
187 STATE_LEADER,
188 STATE_PEON,
189 STATE_SHUTDOWN
190 };
191 int state;
192
193 public:
194 static const char *get_state_name(int s) {
195 switch (s) {
196 case STATE_PROBING: return "probing";
197 case STATE_SYNCHRONIZING: return "synchronizing";
198 case STATE_ELECTING: return "electing";
199 case STATE_LEADER: return "leader";
200 case STATE_PEON: return "peon";
201 case STATE_SHUTDOWN: return "shutdown";
202 default: return "???";
203 }
204 }
205 const char *get_state_name() const {
206 return get_state_name(state);
207 }
208
209 bool is_shutdown() const { return state == STATE_SHUTDOWN; }
210 bool is_probing() const { return state == STATE_PROBING; }
211 bool is_synchronizing() const { return state == STATE_SYNCHRONIZING; }
212 bool is_electing() const { return state == STATE_ELECTING; }
213 bool is_leader() const { return state == STATE_LEADER; }
214 bool is_peon() const { return state == STATE_PEON; }
215
216 const utime_t &get_leader_since() const;
217
218 void prepare_new_fingerprint(MonitorDBStore::TransactionRef t);
219
220 // -- elector --
221 private:
222 Paxos *paxos;
223 Elector elector;
224 friend class Elector;
225
226 /// features we require of peers (based on on-disk compatset)
227 uint64_t required_features;
228
229 int leader; // current leader (to best of knowledge)
230 set<int> quorum; // current active set of monitors (if !starting)
231 utime_t leader_since; // when this monitor became the leader, if it is the leader
232 utime_t exited_quorum; // time detected as not in quorum; 0 if in
233 /**
234 * Intersection of quorum members' connection feature bits.
235 */
236 uint64_t quorum_con_features;
237 /**
238 * Intersection of quorum members' mon-specific feature bits
239 */
240 mon_feature_t quorum_mon_features;
241 bufferlist supported_commands_bl; // encoded MonCommands we support
242
243 set<string> outside_quorum;
244
245 /**
246 * @defgroup Monitor_h_scrub
247 * @{
248 */
249 version_t scrub_version; ///< paxos version we are scrubbing
250 map<int,ScrubResult> scrub_result; ///< results so far
251
252 /**
253 * trigger a cross-mon scrub
254 *
255 * Verify all mons are storing identical content
256 */
257 int scrub_start();
258 int scrub();
259 void handle_scrub(MonOpRequestRef op);
260 bool _scrub(ScrubResult *r,
261 pair<string,string> *start,
262 int *num_keys);
263 void scrub_check_results();
264 void scrub_timeout();
265 void scrub_finish();
266 void scrub_reset();
267 void scrub_update_interval(int secs);
268
269 Context *scrub_event; ///< periodic event to trigger scrub (leader)
270 Context *scrub_timeout_event; ///< scrub round timeout (leader)
271 void scrub_event_start();
272 void scrub_event_cancel();
273 void scrub_reset_timeout();
274 void scrub_cancel_timeout();
275
/// Bookkeeping for a scrub that is in progress.
struct ScrubState {
  pair<string,string> last_key;  ///< last scrubbed key
  bool finished;                 ///< false until explicitly set

  ScrubState()
    : finished(false)
  {}

  virtual ~ScrubState() {}
};
283 ceph::shared_ptr<ScrubState> scrub_state; ///< keeps track of current scrub
284
285 /**
286 * @defgroup Monitor_h_sync Synchronization
287 * @{
288 */
289 /**
290 * @} // provider state
291 */
292 struct SyncProvider {
293 entity_inst_t entity; ///< who
294 uint64_t cookie; ///< unique cookie for this sync attempt
295 utime_t timeout; ///< when we give up and expire this attempt
296 version_t last_committed; ///< last paxos version on peer
297 pair<string,string> last_key; ///< last key sent to (or on) peer
298 bool full; ///< full scan?
299 MonitorDBStore::Synchronizer synchronizer; ///< iterator
300
301 SyncProvider() : cookie(0), last_committed(0), full(false) {}
302
303 void reset_timeout(CephContext *cct, int grace) {
304 timeout = ceph_clock_now();
305 timeout += grace;
306 }
307 };
308
309 map<uint64_t, SyncProvider> sync_providers; ///< cookie -> SyncProvider for those syncing from us
310 uint64_t sync_provider_count; ///< counter for issued cookies to keep them unique
311
312 /**
313 * @} // requester state
314 */
315 entity_inst_t sync_provider; ///< who we are syncing from
316 uint64_t sync_cookie; ///< 0 if we are starting, non-zero otherwise
317 bool sync_full; ///< true if we are a full sync, false for recent catch-up
318 version_t sync_start_version; ///< last_committed at sync start
319 Context *sync_timeout_event; ///< timeout event
320
321 /**
322 * floor for sync source
323 *
324 * When we sync we forget about our old last_committed value which
325 * can be dangerous. For example, if we have a cluster of:
326 *
327 * mon.a: lc 100
328 * mon.b: lc 80
329 * mon.c: lc 100 (us)
330 *
331 * If something forces us to sync (say, corruption, or manual
332 * intervention, or bug), we forget last_committed, and might abort.
333 * If mon.a happens to be down when we come back, we will see:
334 *
335 * mon.b: lc 80
336 * mon.c: lc 0 (us)
337 *
338 * and sync from mon.b, at which point a+b will both have lc 80 and
339 * come online with a majority holding out of date commits.
340 *
341 * Avoid this by preserving our old last_committed value prior to
342 * sync and never going backwards.
343 */
344 version_t sync_last_committed_floor;
345
346 /**
347 * Obtain the synchronization target prefixes in set form.
348 *
349 * We consider a target prefix all those that are relevant when
350 * synchronizing two stores. That is, all those that hold paxos service's
351 * versions, as well as paxos versions, or any control keys such as the
352 * first or last committed version.
353 *
354 * Given the current design, this function should return the name of all and
355 * any available paxos service, plus the paxos name.
356 *
357 * @returns a set of strings referring to the prefixes being synchronized
358 */
359 set<string> get_sync_targets_names();
360
361 /**
362 * Reset the monitor's sync-related data structures for syncing *from* a peer
363 */
364 void sync_reset_requester();
365
366 /**
367 * Reset sync state related to allowing others to sync from us
368 */
369 void sync_reset_provider();
370
371 /**
372 * Called when a sync attempt times out (requester-side)
373 */
374 void sync_timeout();
375
376 /**
377 * Get the latest monmap for backup purposes during sync
378 */
379 void sync_obtain_latest_monmap(bufferlist &bl);
380
381 /**
382 * Start sync process
383 *
384 * Start pulling committed state from another monitor.
385 *
386 * @param entity where to pull committed state from
387 * @param full whether to do a full sync or just catch up on recent paxos
388 */
389 void sync_start(entity_inst_t &entity, bool full);
390
391 public:
392 /**
393 * force a sync on next mon restart
394 */
395 void sync_force(Formatter *f, ostream& ss);
396
397 private:
398 /**
399 * store critical state for safekeeping during sync
400 *
401 * We store a few things on the side that we don't want to get clobbered by sync. This
402 * includes the latest monmap and a lower bound on last_committed.
403 */
404 void sync_stash_critical_state(MonitorDBStore::TransactionRef tx);
405
406 /**
407 * reset the sync timeout
408 *
409 * This is used on the client to restart if things aren't progressing
410 */
411 void sync_reset_timeout();
412
413 /**
414 * trim stale sync provider state
415 *
416 * If someone is syncing from us and hasn't talked to us recently, expire their state.
417 */
418 void sync_trim_providers();
419
420 /**
421 * Complete a sync
422 *
423 * Finish up a sync after we've gotten all of the chunks.
424 *
425 * @param last_committed final last_committed value from provider
426 */
427 void sync_finish(version_t last_committed);
428
429 /**
430 * request the next chunk from the provider
431 */
432 void sync_get_next_chunk();
433
434 /**
435 * handle sync message
436 *
437 * @param m Sync message with operation type MMonSync::OP_START_CHUNKS
438 */
439 void handle_sync(MonOpRequestRef op);
440
441 void _sync_reply_no_cookie(MonOpRequestRef op);
442
443 void handle_sync_get_cookie(MonOpRequestRef op);
444 void handle_sync_get_chunk(MonOpRequestRef op);
445 void handle_sync_finish(MonOpRequestRef op);
446
447 void handle_sync_cookie(MonOpRequestRef op);
448 void handle_sync_forward(MonOpRequestRef op);
449 void handle_sync_chunk(MonOpRequestRef op);
450 void handle_sync_no_cookie(MonOpRequestRef op);
451
452 /**
453 * @} // Synchronization
454 */
455
456 list<Context*> waitfor_quorum;
457 list<Context*> maybe_wait_for_quorum;
458
459 /**
460 * @defgroup Monitor_h_TimeCheck Monitor Clock Drift Early Warning System
461 * @{
462 *
463 * We use time checks to keep track of any clock drifting going on in the
464 * cluster. This is accomplished by periodically pinging each monitor in the
465 * quorum and registering its response time on a map, assessing how much its
466 * clock has drifted. We also take this opportunity to assess the latency
467 * on response.
468 *
469 * This mechanism works as follows:
470 *
471 * - Leader sends out a 'PING' message to each other monitor in the quorum.
472 * The message is timestamped with the leader's current time. The leader's
473 * current time is recorded in a map, associated with each peon's
474 * instance.
475 * - The peon replies to the leader with a timestamped 'PONG' message.
476 * - The leader calculates a delta between the peon's timestamp and its
477 * current time and stashes it.
478 * - The leader also calculates the time it took to receive the 'PONG'
479 * since the 'PING' was sent, and stashes an approximate latency estimate.
480 * - Once all the quorum members have pong'ed, the leader will share the
481 * clock skew and latency maps with all the monitors in the quorum.
482 */
483 map<entity_inst_t, utime_t> timecheck_waiting;
484 map<entity_inst_t, double> timecheck_skews;
485 map<entity_inst_t, double> timecheck_latencies;
486 // odd value means we are mid-round; even value means the round has
487 // finished.
488 version_t timecheck_round;
489 unsigned int timecheck_acks;
490 utime_t timecheck_round_start;
491 /* When we hit a skew we will start a new round based off of
492 * 'mon_timecheck_skew_interval'. Each new round will be backed off
493 * until we hit 'mon_timecheck_interval' -- which is the typical
494 * interval when not in the presence of a skew.
495 *
496 * This variable tracks the number of rounds with skews since last clean
497 * so that we can report to the user and properly adjust the backoff.
498 */
499 uint64_t timecheck_rounds_since_clean;
500 /**
501 * Time Check event.
502 */
503 Context *timecheck_event;
504
505 void timecheck_start();
506 void timecheck_finish();
507 void timecheck_start_round();
508 void timecheck_finish_round(bool success = true);
509 void timecheck_cancel_round();
510 void timecheck_cleanup();
511 void timecheck_reset_event();
512 void timecheck_check_skews();
513 void timecheck_report();
514 void timecheck();
515 health_status_t timecheck_status(ostringstream &ss,
516 const double skew_bound,
517 const double latency);
518 void handle_timecheck_leader(MonOpRequestRef op);
519 void handle_timecheck_peon(MonOpRequestRef op);
520 void handle_timecheck(MonOpRequestRef op);
521
522 /**
523 * Returns 'true' if this is considered to be a skew; 'false' otherwise.
524 */
525 bool timecheck_has_skew(const double skew_bound, double *abs) const {
526 double abs_skew = std::fabs(skew_bound);
527 if (abs)
528 *abs = abs_skew;
529 return (abs_skew > g_conf->mon_clock_drift_allowed);
530 }
531
532 /**
533 * @}
534 */
535 /**
536 * Handle ping messages from others.
537 */
538 void handle_ping(MonOpRequestRef op);
539
540 Context *probe_timeout_event = nullptr; // for probing
541
542 void reset_probe_timeout();
543 void cancel_probe_timeout();
544 void probe_timeout(int r);
545
546 void _apply_compatset_features(CompatSet &new_features);
547
548 public:
549 epoch_t get_epoch();
550 int get_leader() const { return leader; }
551 const set<int>& get_quorum() const { return quorum; }
552 list<string> get_quorum_names() {
553 list<string> q;
554 for (set<int>::iterator p = quorum.begin(); p != quorum.end(); ++p)
555 q.push_back(monmap->get_name(*p));
556 return q;
557 }
558 uint64_t get_quorum_con_features() const {
559 return quorum_con_features;
560 }
561 mon_feature_t get_quorum_mon_features() const {
562 return quorum_mon_features;
563 }
564 uint64_t get_required_features() const {
565 return required_features;
566 }
567 mon_feature_t get_required_mon_features() const {
568 return monmap->get_required_features();
569 }
570 void apply_quorum_to_compatset_features();
571 void apply_monmap_to_compatset_features();
572 void calc_quorum_requirements();
573
574 private:
575 void _reset(); ///< called from bootstrap, start_, or join_election
576 void wait_for_paxos_write();
577 void _finish_svc_election(); ///< called by {win,lose}_election
578 public:
579 void bootstrap();
580 void join_election();
581 void start_election();
582 void win_standalone_election();
583 // end election (called by Elector)
584 void win_election(epoch_t epoch, set<int>& q,
585 uint64_t features,
586 const mon_feature_t& mon_features,
587 const MonCommand *cmdset, int cmdsize);
588 void lose_election(epoch_t epoch, set<int>& q, int l,
589 uint64_t features,
590 const mon_feature_t& mon_features);
591 // end election (called by Elector)
592 void finish_election();
593
594 const bufferlist& get_supported_commands_bl() {
595 return supported_commands_bl;
596 }
597
598 void update_logger();
599
600 /**
601 * Vector holding the Services serviced by this Monitor.
602 */
603 vector<PaxosService*> paxos_service;
604
605 PaxosService *get_paxos_service_by_name(const string& name);
606
/*
 * Typed accessors for the entries of paxos_service.
 *
 * Each accessor indexes paxos_service with the matching PAXOS_* constant and
 * casts the stored PaxosService* to the concrete service class.  No bounds
 * check and no runtime type check is performed.
 *
 * NOTE(review): the concrete classes are only introduced through elaborated
 * type specifiers (`class PGMonitor`) here, so they are presumably incomplete
 * types in this header; that would explain the C-style casts, since
 * static_cast needs the complete types -- confirm before changing the casts.
 */
class PGMonitor *pgmon() {
  return (class PGMonitor *)paxos_service[PAXOS_PGMAP];
}

class MDSMonitor *mdsmon() {
  return (class MDSMonitor *)paxos_service[PAXOS_MDSMAP];
}

class MonmapMonitor *monmon() {
  return (class MonmapMonitor *)paxos_service[PAXOS_MONMAP];
}

class OSDMonitor *osdmon() {
  return (class OSDMonitor *)paxos_service[PAXOS_OSDMAP];
}

class AuthMonitor *authmon() {
  return (class AuthMonitor *)paxos_service[PAXOS_AUTH];
}

class LogMonitor *logmon() {
  return (class LogMonitor*) paxos_service[PAXOS_LOG];
}

class MgrMonitor *mgrmon() {
  return (class MgrMonitor*) paxos_service[PAXOS_MGR];
}
634
635 friend class Paxos;
636 friend class OSDMonitor;
637 friend class MDSMonitor;
638 friend class MonmapMonitor;
639 friend class PGMonitor;
640 friend class LogMonitor;
641 friend class ConfigKeyService;
642
643 QuorumService *health_monitor;
644 QuorumService *config_key_service;
645
646 // -- sessions --
647 MonSessionMap session_map;
648 Mutex session_map_lock{"Monitor::session_map_lock"};
649 AdminSocketHook *admin_hook;
650
651 template<typename Func, typename...Args>
652 void with_session_map(Func&& func) {
653 Mutex::Locker l(session_map_lock);
654 std::forward<Func>(func)(session_map);
655 }
656 void send_latest_monmap(Connection *con);
657
658 // messages
659 void handle_get_version(MonOpRequestRef op);
660 void handle_subscribe(MonOpRequestRef op);
661 void handle_mon_get_map(MonOpRequestRef op);
662
663 static void _generate_command_map(map<string,cmd_vartype>& cmdmap,
664 map<string,string> &param_str_map);
665 static const MonCommand *_get_moncommand(const string &cmd_prefix,
666 MonCommand *cmds, int cmds_size);
667 bool _allowed_command(MonSession *s, string &module, string &prefix,
668 const map<string,cmd_vartype>& cmdmap,
669 const map<string,string>& param_str_map,
670 const MonCommand *this_cmd);
671 void get_mon_status(Formatter *f, ostream& ss);
672 void _quorum_status(Formatter *f, ostream& ss);
673 bool _add_bootstrap_peer_hint(string cmd, cmdmap_t& cmdmap, ostream& ss);
674 void handle_command(MonOpRequestRef op);
675 void handle_route(MonOpRequestRef op);
676
677 void handle_mon_metadata(MonOpRequestRef op);
678 int get_mon_metadata(int mon, Formatter *f, ostream& err);
679 int print_nodes(Formatter *f, ostream& err);
680
681 // Accumulate metadata across calls to update_mon_metadata
682 map<int, Metadata> pending_metadata;
683
684 /**
685 *
686 */
// Holds an overall health status together with its summary string; the
// single instance is the member `health_status_cache`.
struct health_cache_t {
  health_status_t overall;  // not touched by reset(); no initializer is given here
  string summary;

  // Clears only `summary`; `overall` keeps whatever value it had.
  void reset() {
    // health_status_t doesn't really have a NONE value and we're not
    // okay with setting something else (say, HEALTH_ERR). so just
    // leave it be.
    summary.clear();
  }
} health_status_cache;
698
699 Context *health_tick_event = nullptr;
700 Context *health_interval_event = nullptr;
701
702 void health_tick_start();
703 void health_tick_stop();
704 utime_t health_interval_calc_next_update();
705 void health_interval_start();
706 void health_interval_stop();
707 void health_events_cleanup();
708
709 void health_to_clog_update_conf(const std::set<std::string> &changed);
710
711 void do_health_to_clog_interval();
712 void do_health_to_clog(bool force = false);
713
714 /**
715 * Generate health report
716 *
717 * @param status one-line status summary
718 * @param detailbl optional bufferlist* to fill with a detailed report
719 * @returns health status
720 */
721 health_status_t get_health(list<string>& status, bufferlist *detailbl,
722 Formatter *f);
723 void get_cluster_status(stringstream &ss, Formatter *f);
724
725 void reply_command(MonOpRequestRef op, int rc, const string &rs, version_t version);
726 void reply_command(MonOpRequestRef op, int rc, const string &rs, bufferlist& rdata, version_t version);
727
728
729 void handle_probe(MonOpRequestRef op);
730 /**
731 * Handle a Probe Operation, replying with our name, quorum and known versions.
732 *
733 * We use the MMonProbe message class for anything and everything related with
734 * Monitor probing. One of the operations relates directly with the probing
735 * itself, in which we receive a probe request and to which we reply with
736 * our name, our quorum and the known versions for each Paxos service. Thus the
737 * redundant function name. This reply will obviously be sent to the one
738 * probing/requesting these infos.
739 *
740 * @todo Add @pre and @post
741 *
742 * @param m A Probe message, with an operation of type Probe.
743 */
744 void handle_probe_probe(MonOpRequestRef op);
745 void handle_probe_reply(MonOpRequestRef op);
746
747 // request routing
748 struct RoutedRequest {
749 uint64_t tid;
750 bufferlist request_bl;
751 MonSession *session;
752 ConnectionRef con;
753 uint64_t con_features;
754 entity_inst_t client_inst;
755 MonOpRequestRef op;
756
757 RoutedRequest() : tid(0), session(NULL), con_features(0) {}
758 ~RoutedRequest() {
759 if (session)
760 session->put();
761 }
762 };
763 uint64_t routed_request_tid;
764 map<uint64_t, RoutedRequest*> routed_requests;
765
766 void forward_request_leader(MonOpRequestRef op);
767 void handle_forward(MonOpRequestRef op);
768 void try_send_message(Message *m, const entity_inst_t& to);
769 void send_reply(MonOpRequestRef op, Message *reply);
770 void no_reply(MonOpRequestRef op);
771 void resend_routed_requests();
772 void remove_session(MonSession *s);
773 void remove_all_sessions();
774 void waitlist_or_zap_client(MonOpRequestRef op);
775
776 void send_command(const entity_inst_t& inst,
777 const vector<string>& com);
778
779 public:
780 struct C_Command : public C_MonOp {
781 Monitor *mon;
782 int rc;
783 string rs;
784 bufferlist rdata;
785 version_t version;
786 C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, version_t v) :
787 C_MonOp(_op), mon(_mm), rc(r), rs(s), version(v){}
788 C_Command(Monitor *_mm, MonOpRequestRef _op, int r, string s, bufferlist rd, version_t v) :
789 C_MonOp(_op), mon(_mm), rc(r), rs(s), rdata(rd), version(v){}
790
791 void _finish(int r) override {
792 MMonCommand *m = static_cast<MMonCommand*>(op->get_req());
793 if (r >= 0) {
794 ostringstream ss;
795 if (!op->get_req()->get_connection()) {
796 ss << "connection dropped for command ";
797 } else {
798 MonSession *s = op->get_session();
799
800 // if client drops we may not have a session to draw information from.
801 if (s) {
802 ss << "from='" << s->inst << "' "
803 << "entity='" << s->entity_name << "' ";
804 } else {
805 ss << "session dropped for command ";
806 }
807 }
808 ss << "cmd='" << m->cmd << "': finished";
809
810 mon->audit_clog->info() << ss.str();
811 mon->reply_command(op, rc, rs, rdata, version);
812 }
813 else if (r == -ECANCELED)
814 return;
815 else if (r == -EAGAIN)
816 mon->dispatch_op(op);
817 else
818 assert(0 == "bad C_Command return value");
819 }
820 };
821
822 private:
823 class C_RetryMessage : public C_MonOp {
824 Monitor *mon;
825 public:
826 C_RetryMessage(Monitor *m, MonOpRequestRef op) :
827 C_MonOp(op), mon(m) { }
828
829 void _finish(int r) override {
830 if (r == -EAGAIN || r >= 0)
831 mon->dispatch_op(op);
832 else if (r == -ECANCELED)
833 return;
834 else
835 assert(0 == "bad C_RetryMessage return value");
836 }
837 };
838
839 //ms_dispatch handles a lot of logic and we want to reuse it
840 //on forwarded messages, so we create a non-locking version for this class
841 void _ms_dispatch(Message *m);
842 bool ms_dispatch(Message *m) override {
843 lock.Lock();
844 _ms_dispatch(m);
845 lock.Unlock();
846 return true;
847 }
848 void dispatch_op(MonOpRequestRef op);
849 //mon_caps is used for un-connected messages from monitors
850 MonCap * mon_caps;
851 bool ms_get_authorizer(int dest_type, AuthAuthorizer **authorizer, bool force_new) override;
852 bool ms_verify_authorizer(Connection *con, int peer_type,
853 int protocol, bufferlist& authorizer_data, bufferlist& authorizer_reply,
854 bool& isvalid, CryptoKey& session_key) override;
855 bool ms_handle_reset(Connection *con) override;
856 void ms_handle_remote_reset(Connection *con) override {}
857 bool ms_handle_refused(Connection *con) override;
858
859 int write_default_keyring(bufferlist& bl);
860 void extract_save_mon_key(KeyRing& keyring);
861
862 void update_mon_metadata(int from, Metadata&& m);
863 int load_metadata(map<int, Metadata>& m);
864
865 // features
866 static CompatSet get_initial_supported_features();
867 static CompatSet get_supported_features();
868 static CompatSet get_legacy_features();
869 /// read the ondisk features into the CompatSet pointed to by read_features
870 static void read_features_off_disk(MonitorDBStore *store, CompatSet *read_features);
871 void read_features();
872 void write_features(MonitorDBStore::TransactionRef t);
873
874 OpTracker op_tracker;
875
876 public:
877 Monitor(CephContext *cct_, string nm, MonitorDBStore *s,
878 Messenger *m, Messenger *mgr_m, MonMap *map);
879 ~Monitor() override;
880
881 static int check_features(MonitorDBStore *store);
882
883 // config observer
884 const char** get_tracked_conf_keys() const override;
885 void handle_conf_change(const struct md_config_t *conf,
886 const std::set<std::string> &changed) override;
887
888 void update_log_clients();
889 int sanitize_options();
890 int preinit();
891 int init();
892 void init_paxos();
893 void refresh_from_paxos(bool *need_bootstrap);
894 void shutdown();
895 void tick();
896
897 void handle_signal(int sig);
898
899 int mkfs(bufferlist& osdmapbl);
900
901 /**
902 * check cluster_fsid file
903 *
904 * @return EEXIST if file exists and doesn't match, 0 on match, or negative error code
905 */
906 int check_fsid();
907
908 /**
909 * write cluster_fsid file
910 *
911 * @return 0 on success, or negative error code
912 */
913 int write_fsid();
914 int write_fsid(MonitorDBStore::TransactionRef t);
915
916 void do_admin_command(std::string command, cmdmap_t& cmdmap,
917 std::string format, ostream& ss);
918
919 private:
920 // don't allow copying
921 Monitor(const Monitor& rhs);
922 Monitor& operator=(const Monitor &rhs);
923
924 public:
925 static void format_command_descriptions(const MonCommand *commands,
926 unsigned commands_size,
927 Formatter *f,
928 bufferlist *rdata,
929 bool hide_mgr_flag=false);
930 void get_locally_supported_monitor_commands(const MonCommand **cmds, int *count);
931 /// the Monitor owns this pointer once you pass it in
932 void set_leader_supported_commands(const MonCommand *cmds, int size);
933 static bool is_keyring_required();
934 };
935
936 #define CEPH_MON_FEATURE_INCOMPAT_BASE CompatSet::Feature (1, "initial feature set (~v.18)")
937 #define CEPH_MON_FEATURE_INCOMPAT_GV CompatSet::Feature (2, "global version sequencing (v0.52)")
938 #define CEPH_MON_FEATURE_INCOMPAT_SINGLE_PAXOS CompatSet::Feature (3, "single paxos with k/v store (v0.\?)")
939 #define CEPH_MON_FEATURE_INCOMPAT_OSD_ERASURE_CODES CompatSet::Feature(4, "support erasure code pools")
940 #define CEPH_MON_FEATURE_INCOMPAT_OSDMAP_ENC CompatSet::Feature(5, "new-style osdmap encoding")
941 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V2 CompatSet::Feature(6, "support isa/lrc erasure code")
942 #define CEPH_MON_FEATURE_INCOMPAT_ERASURE_CODE_PLUGINS_V3 CompatSet::Feature(7, "support shec erasure code")
943 #define CEPH_MON_FEATURE_INCOMPAT_KRAKEN CompatSet::Feature(8, "support monmap features")
944 // make sure you add your feature to Monitor::get_supported_features
945
946 long parse_pos_long(const char *s, ostream *pss = NULL);
947
948 struct MonCommand {
949 string cmdstring;
950 string helpstring;
951 string module;
952 string req_perms;
953 string availability;
954 uint64_t flags;
955
956 // MonCommand flags
957 static const uint64_t FLAG_NONE = 0;
958 static const uint64_t FLAG_NOFORWARD = 1 << 0;
959 static const uint64_t FLAG_OBSOLETE = 1 << 1;
960 static const uint64_t FLAG_DEPRECATED = 1 << 2;
961 static const uint64_t FLAG_MGR = 1 << 3;
962
963 bool has_flag(uint64_t flag) const { return (flags & flag) != 0; }
964 void set_flag(uint64_t flag) { flags |= flag; }
965 void unset_flag(uint64_t flag) { flags &= ~flag; }
966
967 void encode(bufferlist &bl) const {
968 /*
969 * very naughty: deliberately unversioned because individual commands
970 * shouldn't be encoded standalone, only as a full set (which we do
971 * version, see encode_array() below).
972 */
973 ::encode(cmdstring, bl);
974 ::encode(helpstring, bl);
975 ::encode(module, bl);
976 ::encode(req_perms, bl);
977 ::encode(availability, bl);
978 }
979 void decode(bufferlist::iterator &bl) {
980 ::decode(cmdstring, bl);
981 ::decode(helpstring, bl);
982 ::decode(module, bl);
983 ::decode(req_perms, bl);
984 ::decode(availability, bl);
985 }
986 bool is_compat(const MonCommand* o) const {
987 return cmdstring == o->cmdstring &&
988 module == o->module && req_perms == o->req_perms &&
989 availability == o->availability;
990 }
991
992 bool is_noforward() const {
993 return has_flag(MonCommand::FLAG_NOFORWARD);
994 }
995
996 bool is_obsolete() const {
997 return has_flag(MonCommand::FLAG_OBSOLETE);
998 }
999
1000 bool is_deprecated() const {
1001 return has_flag(MonCommand::FLAG_DEPRECATED);
1002 }
1003
1004 bool is_mgr() const {
1005 return has_flag(MonCommand::FLAG_MGR);
1006 }
1007
1008 static void encode_array(const MonCommand *cmds, int size, bufferlist &bl) {
1009 ENCODE_START(2, 1, bl);
1010 uint16_t s = size;
1011 ::encode(s, bl);
1012 ::encode_array_nohead(cmds, size, bl);
1013 for (int i = 0; i < size; i++)
1014 ::encode(cmds[i].flags, bl);
1015 ENCODE_FINISH(bl);
1016 }
1017 static void decode_array(MonCommand **cmds, int *size,
1018 bufferlist::iterator &bl) {
1019 DECODE_START(2, bl);
1020 uint16_t s = 0;
1021 ::decode(s, bl);
1022 *size = s;
1023 *cmds = new MonCommand[*size];
1024 ::decode_array_nohead(*cmds, *size, bl);
1025 if (struct_v >= 2) {
1026 for (int i = 0; i < *size; i++)
1027 ::decode((*cmds)[i].flags, bl);
1028 } else {
1029 for (int i = 0; i < *size; i++)
1030 (*cmds)[i].flags = 0;
1031 }
1032 DECODE_FINISH(bl);
1033 }
1034
1035 bool requires_perm(char p) const {
1036 return (req_perms.find(p) != string::npos);
1037 }
1038 };
1039 WRITE_CLASS_ENCODER(MonCommand)
1040
1041 #endif